arrfab / rpms / glibc

Forked from rpms/glibc 4 years ago
Clone

Blame SOURCES/glibc-ppc64le-31.patch

147e83
# commit 759cfef3ac4c07dba1ece0bbc1207e099348816d
147e83
# Author: Alan Modra <amodra@gmail.com>
147e83
# Date:   Sat Aug 17 18:47:22 2013 +0930
147e83
# 
147e83
#     PowerPC LE memcpy
147e83
#     http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html
147e83
#     
147e83
#     Little-endian support for memcpy.  I spent some time cleaning up the
147e83
#     64-bit power7 memcpy, in order to avoid the extra alignment traps
147e83
#     power7 takes for little-endian.  It probably would have been better
147e83
#     to copy the Linux kernel version of memcpy.
147e83
#     
147e83
#         * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
147e83
#         * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
147e83
#         * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
147e83
#         * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
147e83
#         * sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
147e83
#         * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
147e83
#         * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
147e83
#         * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
147e83
#         * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise.  Make better
147e83
#         use of regs.  Use power7 mtocrf.  Tidy function tails.
147e83
# 
147e83
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S
147e83
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
@@ -205,15 +205,28 @@
147e83
     blt   cr6,5f
147e83
     srwi  7,6,16
147e83
     bgt	  cr6,3f
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    sth   7,0(3)
147e83
+#else
147e83
     sth   6,0(3)
147e83
+#endif
147e83
     b     7f
147e83
     .align  4
147e83
 3:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rotlwi 6,6,24
147e83
+    stb   6,0(3)
147e83
+    sth   7,1(3)
147e83
+#else
147e83
     stb   7,0(3)
147e83
     sth   6,1(3)
147e83
+#endif
147e83
     b     7f
147e83
     .align  4
147e83
 5:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rotlwi 6,6,8
147e83
+#endif
147e83
     stb   6,0(3)
147e83
 7:
147e83
     cmplwi	cr1,10,16
147e83
@@ -341,13 +354,23 @@
147e83
     bf      30,1f
147e83
 
147e83
     /* there are at least two words to copy, so copy them */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srw   0,6,10
147e83
+    slw   8,7,9
147e83
+#else
147e83
     slw   0,6,10  /* shift 1st src word to left align it in R0 */
147e83
     srw   8,7,9   /* shift 2nd src word to right align it in R8 */
147e83
+#endif
147e83
     or    0,0,8   /* or them to get word to store */
147e83
     lwz   6,8(5)  /* load the 3rd src word */
147e83
     stw   0,0(4)  /* store the 1st dst word */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srw   0,7,10
147e83
+    slw   8,6,9
147e83
+#else
147e83
     slw   0,7,10  /* now left align 2nd src word into R0 */
147e83
     srw   8,6,9   /* shift 3rd src word to right align it in R8 */
147e83
+#endif
147e83
     or    0,0,8   /* or them to get word to store */
147e83
     lwz   7,12(5)
147e83
     stw   0,4(4)  /* store the 2nd dst word */
147e83
@@ -355,8 +378,13 @@
147e83
     addi  5,5,16
147e83
     bf    31,4f
147e83
     /* there is a third word to copy, so copy it */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srw   0,6,10
147e83
+    slw   8,7,9
147e83
+#else
147e83
     slw   0,6,10  /* shift 3rd src word to left align it in R0 */
147e83
     srw   8,7,9   /* shift 4th src word to right align it in R8 */
147e83
+#endif
147e83
     or    0,0,8   /* or them to get word to store */
147e83
     stw   0,0(4)  /* store 3rd dst word */
147e83
     mr    6,7
147e83
@@ -366,8 +394,13 @@
147e83
     b     4f
147e83
     .align 4
147e83
 1:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srw     0,6,10
147e83
+    slw     8,7,9
147e83
+#else
147e83
     slw     0,6,10  /* shift 1st src word to left align it in R0 */
147e83
     srw     8,7,9   /* shift 2nd src word to right align it in R8 */
147e83
+#endif
147e83
     addi  5,5,8
147e83
     or    0,0,8   /* or them to get word to store */
147e83
     bf    31,4f
147e83
@@ -380,23 +413,43 @@
147e83
     .align  4
147e83
 4:
147e83
     /* copy 16 bytes at a time */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srw   0,6,10
147e83
+    slw   8,7,9
147e83
+#else
147e83
     slw   0,6,10
147e83
     srw   8,7,9
147e83
+#endif
147e83
     or    0,0,8
147e83
     lwz   6,0(5)
147e83
     stw   0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srw   0,7,10
147e83
+    slw   8,6,9
147e83
+#else
147e83
     slw   0,7,10
147e83
     srw   8,6,9
147e83
+#endif
147e83
     or    0,0,8
147e83
     lwz   7,4(5)
147e83
     stw   0,4(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srw   0,6,10
147e83
+    slw   8,7,9
147e83
+#else
147e83
     slw   0,6,10
147e83
     srw   8,7,9
147e83
+#endif
147e83
     or    0,0,8
147e83
     lwz   6,8(5)
147e83
     stw   0,8(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srw   0,7,10
147e83
+    slw   8,6,9
147e83
+#else
147e83
     slw   0,7,10
147e83
     srw   8,6,9
147e83
+#endif
147e83
     or    0,0,8
147e83
     lwz   7,12(5)
147e83
     stw   0,12(4)
147e83
@@ -405,8 +458,13 @@
147e83
     bdnz+ 4b
147e83
 8:
147e83
     /* calculate and store the final word */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srw   0,6,10
147e83
+    slw   8,7,9
147e83
+#else
147e83
     slw   0,6,10
147e83
     srw   8,7,9
147e83
+#endif
147e83
     or    0,0,8
147e83
     stw   0,0(4)
147e83
 3:
147e83
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S
147e83
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
@@ -221,15 +221,28 @@
147e83
     blt   cr6,5f
147e83
     srwi  7,6,16
147e83
     bgt	  cr6,3f
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    sth   7,0(3)
147e83
+#else
147e83
     sth   6,0(3)
147e83
+#endif
147e83
     b     7f
147e83
     .align  4
147e83
 3:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rotlwi 6,6,24
147e83
+    stb   6,0(3)
147e83
+    sth   7,1(3)
147e83
+#else
147e83
     stb   7,0(3)
147e83
     sth   6,1(3)
147e83
+#endif
147e83
     b     7f
147e83
     .align  4
147e83
 5:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rotlwi 6,6,8
147e83
+#endif
147e83
     stb   6,0(3)
147e83
 7:
147e83
     cmplwi	cr1,10,16
147e83
@@ -579,7 +592,11 @@
147e83
     lwz     6,-1(4)
147e83
     cmplwi  cr6,31,4
147e83
     srwi    8,31,5    /* calculate the 32 byte loop count */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srwi    6,6,8
147e83
+#else
147e83
     slwi    6,6,8
147e83
+#endif
147e83
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
147e83
     blt     cr5,L(wdu1_32tail)
147e83
     mtctr   8
147e83
@@ -587,8 +604,12 @@
147e83
 
147e83
     lwz   8,3(4)
147e83
     lwz   7,4(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rldimi 6,8,24,32
147e83
+#else
147e83
 /*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
147e83
     rlwimi 6,8,8,(32-8),31
147e83
+#endif
147e83
     b      L(wdu1_loop32x)
147e83
     .align  4
147e83
 L(wdu1_loop32):
147e83
@@ -597,8 +618,12 @@
147e83
     lwz   7,4(4)
147e83
     stw   10,-8(3)
147e83
     stw   11,-4(3)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rldimi 6,8,24,32
147e83
+#else
147e83
 /*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
147e83
     rlwimi 6,8,8,(32-8),31
147e83
+#endif
147e83
 L(wdu1_loop32x):
147e83
     lwz   10,8(4)
147e83
     lwz   11,12(4)
147e83
@@ -615,7 +640,11 @@
147e83
     stw   6,16(3)
147e83
     stw   7,20(3)
147e83
     addi  3,3,32
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srwi  6,8,8
147e83
+#else
147e83
     slwi  6,8,8
147e83
+#endif
147e83
     bdnz+ L(wdu1_loop32)
147e83
     stw   10,-8(3)
147e83
     stw   11,-4(3)
147e83
@@ -626,8 +655,12 @@
147e83
     blt     cr6,L(wdu_4tail)
147e83
     /* calculate and store the final word */
147e83
     lwz   8,3(4)
147e83
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rldimi 6,8,24,32
147e83
+#else
147e83
+/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8  */
147e83
     rlwimi 6,8,8,(32-8),31
147e83
+#endif
147e83
     b     L(wdu_32tailx)
147e83
 
147e83
 L(wdu2_32):
147e83
@@ -635,7 +668,11 @@
147e83
     lwz     6,-2(4)
147e83
     cmplwi  cr6,31,4
147e83
     srwi    8,31,5    /* calculate the 32 byte loop count */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srwi    6,6,16
147e83
+#else
147e83
     slwi    6,6,16
147e83
+#endif
147e83
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
147e83
     blt     cr5,L(wdu2_32tail)
147e83
     mtctr   8
147e83
@@ -643,8 +680,11 @@
147e83
 
147e83
     lwz   8,2(4)
147e83
     lwz   7,4(4)
147e83
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rldimi 6,8,16,32
147e83
+#else
147e83
     rlwimi 6,8,16,(32-16),31
147e83
+#endif
147e83
     b      L(wdu2_loop32x)
147e83
     .align  4
147e83
 L(wdu2_loop32):
147e83
@@ -653,8 +693,11 @@
147e83
     lwz   7,4(4)
147e83
     stw   10,-8(3)
147e83
     stw   11,-4(3)
147e83
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rldimi 6,8,16,32
147e83
+#else
147e83
     rlwimi 6,8,16,(32-16),31
147e83
+#endif
147e83
 L(wdu2_loop32x):
147e83
     lwz   10,8(4)
147e83
     lwz   11,12(4)
147e83
@@ -672,7 +715,11 @@
147e83
     stw   6,16(3)
147e83
     stw   7,20(3)
147e83
     addi  3,3,32
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srwi  6,8,16
147e83
+#else
147e83
     slwi  6,8,16
147e83
+#endif
147e83
     bdnz+ L(wdu2_loop32)
147e83
     stw   10,-8(3)
147e83
     stw   11,-4(3)
147e83
@@ -683,8 +730,11 @@
147e83
     blt     cr6,L(wdu_4tail)
147e83
     /* calculate and store the final word */
147e83
     lwz   8,2(4)
147e83
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rldimi 6,8,16,32
147e83
+#else
147e83
     rlwimi 6,8,16,(32-16),31
147e83
+#endif
147e83
     b     L(wdu_32tailx)
147e83
 
147e83
 L(wdu3_32):
147e83
@@ -692,7 +742,11 @@
147e83
     lwz     6,-3(4)
147e83
     cmplwi  cr6,31,4
147e83
     srwi    8,31,5    /* calculate the 32 byte loop count */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srwi    6,6,24
147e83
+#else
147e83
     slwi    6,6,24
147e83
+#endif
147e83
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
147e83
     blt     cr5,L(wdu3_32tail)
147e83
     mtctr   8
147e83
@@ -700,8 +754,11 @@
147e83
 
147e83
     lwz   8,1(4)
147e83
     lwz   7,4(4)
147e83
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rldimi 6,8,8,32
147e83
+#else
147e83
     rlwimi 6,8,24,(32-24),31
147e83
+#endif
147e83
     b      L(wdu3_loop32x)
147e83
     .align  4
147e83
 L(wdu3_loop32):
147e83
@@ -710,8 +767,11 @@
147e83
     lwz   7,4(4)
147e83
     stw   10,-8(3)
147e83
     stw   11,-4(3)
147e83
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rldimi 6,8,8,32
147e83
+#else
147e83
     rlwimi 6,8,24,(32-24),31
147e83
+#endif
147e83
 L(wdu3_loop32x):
147e83
     lwz   10,8(4)
147e83
     lwz   11,12(4)
147e83
@@ -728,7 +788,11 @@
147e83
     stw   6,16(3)
147e83
     stw   7,20(3)
147e83
     addi  3,3,32
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srwi  6,8,24
147e83
+#else
147e83
     slwi  6,8,24
147e83
+#endif
147e83
     bdnz+ L(wdu3_loop32)
147e83
     stw   10,-8(3)
147e83
     stw   11,-4(3)
147e83
@@ -739,8 +803,11 @@
147e83
     blt     cr6,L(wdu_4tail)
147e83
     /* calculate and store the final word */
147e83
     lwz   8,1(4)
147e83
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rldimi 6,8,8,32
147e83
+#else
147e83
     rlwimi 6,8,24,(32-24),31
147e83
+#endif
147e83
     b     L(wdu_32tailx)
147e83
     .align  4
147e83
 L(wdu_32tailx):
147e83
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S
147e83
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
@@ -385,7 +385,7 @@
147e83
 
147e83
 	beq    L(copy_GE_32_unaligned_cont)
147e83
 
147e83
-	/* SRC is not quadword aligned, get it aligned.  */
147e83
+	/* DST is not quadword aligned, get it aligned.  */
147e83
 
147e83
 	mtcrf   0x01,0
147e83
 	subf    31,0,5
147e83
@@ -437,13 +437,21 @@
147e83
 	mr      11,12
147e83
 	mtcrf   0x01,9
147e83
 	cmplwi  cr6,9,1
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	lvsr    5,0,12
147e83
+#else
147e83
 	lvsl    5,0,12
147e83
+#endif
147e83
 	lvx     3,0,12
147e83
 	bf      31,L(setup_unaligned_loop)
147e83
 
147e83
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
147e83
 	lvx     4,12,6
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm   6,4,3,5
147e83
+#else
147e83
 	vperm   6,3,4,5
147e83
+#endif
147e83
 	addi    11,12,16
147e83
 	addi    10,3,16
147e83
 	stvx    6,0,3
147e83
@@ -463,11 +471,17 @@
147e83
 	vector instructions though.  */
147e83
 
147e83
 	lvx	4,11,6	      /* vr4 = r11+16.  */
147e83
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
147e83
-			      of vr3/vr4 into vr6.  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm   6,4,3,5
147e83
+#else
147e83
+	vperm   6,3,4,5
147e83
+#endif
147e83
 	lvx	3,11,7	      /* vr3 = r11+32.  */
147e83
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
147e83
-			      of vr3/vr4 into vr10.  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm   10,3,4,5
147e83
+#else
147e83
+	vperm   10,4,3,5
147e83
+#endif
147e83
 	addi    11,11,32
147e83
 	stvx    6,0,10
147e83
 	stvx    10,10,6
147e83
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S
147e83
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
@@ -327,7 +327,7 @@
147e83
 
147e83
 	beq	L(copy_GE_32_unaligned_cont)
147e83
 
147e83
-	/* SRC is not quadword aligned, get it aligned.  */
147e83
+	/* DST is not quadword aligned, get it aligned.  */
147e83
 
147e83
 	mtcrf	0x01,0
147e83
 	subf	31,0,5
147e83
@@ -379,13 +379,21 @@
147e83
 	mr	11,12
147e83
 	mtcrf	0x01,9
147e83
 	cmplwi	cr6,9,1
147e83
-	lvsl	5,0,12
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	lvsr    5,0,12
147e83
+#else
147e83
+	lvsl    5,0,12
147e83
+#endif
147e83
 	lvx	3,0,12
147e83
 	bf	31,L(setup_unaligned_loop)
147e83
 
147e83
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
147e83
 	lvx	4,12,6
147e83
-	vperm	6,3,4,5
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm   6,4,3,5
147e83
+#else
147e83
+	vperm   6,3,4,5
147e83
+#endif
147e83
 	addi	11,12,16
147e83
 	addi	10,3,16
147e83
 	stvx	6,0,3
147e83
@@ -405,11 +413,17 @@
147e83
 	vector instructions though.  */
147e83
 
147e83
 	lvx	4,11,6	      /* vr4 = r11+16.  */
147e83
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
147e83
-				 of vr3/vr4 into vr6.  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm   6,4,3,5
147e83
+#else
147e83
+	vperm   6,3,4,5
147e83
+#endif
147e83
 	lvx	3,11,7	      /* vr3 = r11+32.  */
147e83
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
147e83
-				 of vr3/vr4 into vr10.  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm   10,3,4,5
147e83
+#else
147e83
+	vperm   10,4,3,5
147e83
+#endif
147e83
 	addi	11,11,32
147e83
 	stvx	6,0,10
147e83
 	stvx	10,10,6
147e83
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S
147e83
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
@@ -214,15 +214,28 @@
147e83
     blt   cr6,5f
147e83
     srdi  7,6,16
147e83
     bgt	  cr6,3f
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    sth   7,0(3)
147e83
+#else
147e83
     sth   6,0(3)
147e83
+#endif
147e83
     b     7f
147e83
     .align  4
147e83
 3:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rotlwi 6,6,24
147e83
+    stb   6,0(3)
147e83
+    sth   7,1(3)
147e83
+#else
147e83
     stb   7,0(3)
147e83
     sth   6,1(3)
147e83
+#endif
147e83
     b     7f
147e83
     .align  4
147e83
 5:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rotlwi 6,6,8
147e83
+#endif
147e83
     stb   6,0(3)
147e83
 7:
147e83
     cmpldi	cr1,10,16
147e83
@@ -330,7 +343,11 @@
147e83
     ld    7,8(5)
147e83
     subfic  9,10,64
147e83
     beq   2f
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srd   0,6,10
147e83
+#else
147e83
     sld   0,6,10
147e83
+#endif
147e83
     cmpldi  11,1
147e83
     mr    6,7
147e83
     addi  4,4,-8
147e83
@@ -338,15 +355,25 @@
147e83
     b     1f
147e83
 2:  addi  5,5,8
147e83
     .align  4
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+0:  srd   0,6,10
147e83
+    sld   8,7,9
147e83
+#else
147e83
 0:  sld   0,6,10
147e83
     srd   8,7,9
147e83
+#endif
147e83
     cmpldi  11,2
147e83
     ld    6,8(5)
147e83
     or    0,0,8
147e83
     addi  11,11,-2
147e83
     std   0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srd   0,7,10
147e83
+1:  sld   8,6,9
147e83
+#else
147e83
     sld   0,7,10
147e83
 1:  srd   8,6,9
147e83
+#endif
147e83
     or    0,0,8
147e83
     beq   8f
147e83
     ld    7,16(5)
147e83
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S
147e83
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S	2014-05-29 13:05:51.000000000 -0500
147e83
@@ -1,5 +1,5 @@
147e83
 /* Optimized memcpy implementation for PowerPC64.
147e83
-   Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.
147e83
+   Copyright (C) 2003-2014 Free Software Foundation, Inc.
147e83
    This file is part of the GNU C Library.
147e83
 
147e83
    The GNU C Library is free software; you can redistribute it and/or
147e83
@@ -17,26 +17,24 @@
147e83
    <http://www.gnu.org/licenses/>.  */
147e83
 
147e83
 #include <sysdep.h>
147e83
-#include <bp-sym.h>
147e83
-#include <bp-asm.h>
147e83
 
147e83
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
147e83
    Returns 'dst'.
147e83
 
147e83
-   Memcpy handles short copies (< 32-bytes) using a binary move blocks 
147e83
-   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled 
147e83
-   with the appropriate combination of byte and halfword load/stores. 
147e83
-   There is minimal effort to optimize the alignment of short moves.  
147e83
+   Memcpy handles short copies (< 32-bytes) using a binary move blocks
147e83
+   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled
147e83
+   with the appropriate combination of byte and halfword load/stores.
147e83
+   There is minimal effort to optimize the alignment of short moves.
147e83
    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
147e83
-   of handling unligned load/stores that do not cross 32-byte boundries.
147e83
+   of handling unaligned load/stores that do not cross 32-byte boundaries.
147e83
 
147e83
    Longer moves (>= 32-bytes) justify the effort to get at least the
147e83
    destination doubleword (8-byte) aligned.  Further optimization is
147e83
-   posible when both source and destination are doubleword aligned.
147e83
+   possible when both source and destination are doubleword aligned.
147e83
    Each case has a optimized unrolled loop.   */
147e83
 
147e83
 	.machine power4
147e83
-EALIGN (BP_SYM (memcpy), 5, 0)
147e83
+EALIGN (memcpy, 5, 0)
147e83
 	CALL_MCOUNT 3
147e83
 
147e83
     cmpldi cr1,5,31
147e83
@@ -44,20 +42,20 @@
147e83
     std   3,-16(1)
147e83
     std   31,-8(1)
147e83
     cfi_offset(31,-8)
147e83
-    andi. 11,3,7	/* check alignement of dst.  */
147e83
+    andi. 11,3,7	/* check alignment of dst.  */
147e83
     clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
147e83
-    clrldi 10,4,61	/* check alignement of src.  */
147e83
+    clrldi 10,4,61	/* check alignment of src.  */
147e83
     cmpldi cr6,5,8
147e83
     ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
147e83
-    cmpld cr6,10,11     
147e83
+    cmpld cr6,10,11
147e83
     mr    12,4
147e83
     srdi  9,5,3		/* Number of full double words remaining.  */
147e83
     mtcrf 0x01,0
147e83
     mr    31,5
147e83
     beq   .L0
147e83
-  
147e83
+
147e83
     subf  31,0,5
147e83
-  /* Move 0-7 bytes as needed to get the destination doubleword alligned.  */
147e83
+  /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
147e83
 1:  bf    31,2f
147e83
     lbz   6,0(12)
147e83
     addi  12,12,1
147e83
@@ -74,17 +72,17 @@
147e83
     stw   6,0(3)
147e83
     addi  3,3,4
147e83
 0:
147e83
-    clrldi 10,12,61	/* check alignement of src again.  */     
147e83
+    clrldi 10,12,61	/* check alignment of src again.  */
147e83
     srdi  9,31,3	/* Number of full double words remaining.  */
147e83
-    
147e83
-  /* Copy doublewords from source to destination, assumpting the
147e83
+
147e83
+  /* Copy doublewords from source to destination, assuming the
147e83
      destination is aligned on a doubleword boundary.
147e83
 
147e83
      At this point we know there are at least 25 bytes left (32-7) to copy.
147e83
-     The next step is to determine if the source is also doubleword aligned. 
147e83
+     The next step is to determine if the source is also doubleword aligned.
147e83
      If not branch to the unaligned move code at .L6. which uses
147e83
      a load, shift, store strategy.
147e83
-     
147e83
+
147e83
      Otherwise source and destination are doubleword aligned, and we can
147e83
      the optimized doubleword copy loop.  */
147e83
 .L0:
147e83
@@ -97,14 +95,14 @@
147e83
      Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration.
147e83
      If the copy is not an exact multiple of 32 bytes, 1-3
147e83
      doublewords are copied as needed to set up the main loop.  After
147e83
-     the main loop exits there may be a tail of 1-7 bytes. These byte are 
147e83
+     the main loop exits there may be a tail of 1-7 bytes. These byte are
147e83
      copied a word/halfword/byte at a time as needed to preserve alignment.  */
147e83
 
147e83
     srdi  8,31,5
147e83
     cmpldi	cr1,9,4
147e83
     cmpldi	cr6,11,0
147e83
     mr    11,12
147e83
-    
147e83
+
147e83
     bf    30,1f
147e83
     ld    6,0(12)
147e83
     ld    7,8(12)
147e83
@@ -115,7 +113,7 @@
147e83
     addi  10,3,16
147e83
     bf    31,4f
147e83
     ld    0,16(12)
147e83
-    std   0,16(3)    
147e83
+    std   0,16(3)
147e83
     blt   cr1,3f
147e83
     addi  11,12,24
147e83
     addi  10,3,24
147e83
@@ -129,7 +127,7 @@
147e83
     addi  11,12,8
147e83
     std   6,0(3)
147e83
     addi  10,3,8
147e83
-    
147e83
+
147e83
     .align  4
147e83
 4:
147e83
     ld    6,0(11)
147e83
@@ -144,7 +142,7 @@
147e83
     std   0,24(10)
147e83
     addi  10,10,32
147e83
     bdnz  4b
147e83
-3:  
147e83
+3:
147e83
 
147e83
     rldicr 0,31,0,60
147e83
     mtcrf 0x01,31
147e83
@@ -152,9 +150,9 @@
147e83
 .L9:
147e83
     add   3,3,0
147e83
     add   12,12,0
147e83
-    
147e83
+
147e83
 /*  At this point we have a tail of 0-7 bytes and we know that the
147e83
-    destiniation is double word aligned.  */
147e83
+    destination is double word aligned.  */
147e83
 4:  bf    29,2f
147e83
     lwz   6,0(12)
147e83
     addi  12,12,4
147e83
@@ -173,29 +171,29 @@
147e83
     ld 31,-8(1)
147e83
     ld 3,-16(1)
147e83
     blr
147e83
-       
147e83
-/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31 
147e83
-   bytes.  Each case is handled without loops, using binary (1,2,4,8) 
147e83
-   tests.  
147e83
-   
147e83
+
147e83
+/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31
147e83
+   bytes.  Each case is handled without loops, using binary (1,2,4,8)
147e83
+   tests.
147e83
+
147e83
    In the short (0-8 byte) case no attempt is made to force alignment
147e83
-   of either source or destination.  The hardware will handle the 
147e83
-   unaligned load/stores with small delays for crossing 32- 64-byte, and 
147e83
+   of either source or destination.  The hardware will handle the
147e83
+   unaligned load/stores with small delays for crossing 32- 64-byte, and
147e83
    4096-byte boundaries. Since these short moves are unlikely to be
147e83
-   unaligned or cross these boundaries, the overhead to force 
147e83
+   unaligned or cross these boundaries, the overhead to force
147e83
    alignment is not justified.
147e83
-   
147e83
+
147e83
    The longer (9-31 byte) move is more likely to cross 32- or 64-byte
147e83
    boundaries.  Since only loads are sensitive to the 32-/64-byte
147e83
-   boundaries it is more important to align the source then the 
147e83
+   boundaries it is more important to align the source then the
147e83
    destination.  If the source is not already word aligned, we first
147e83
-   move 1-3 bytes as needed.  Since we are only word aligned we don't 
147e83
-   use double word load/stores to insure that all loads are aligned. 
147e83
+   move 1-3 bytes as needed.  Since we are only word aligned we don't
147e83
+   use double word load/stores to insure that all loads are aligned.
147e83
    While the destination and stores may still be unaligned, this
147e83
    is only an issue for page (4096 byte boundary) crossing, which
147e83
    should be rare for these short moves.  The hardware handles this
147e83
-   case automatically with a small delay.  */ 
147e83
-   
147e83
+   case automatically with a small delay.  */
147e83
+
147e83
     .align  4
147e83
 .L2:
147e83
     mtcrf 0x01,5
147e83
@@ -216,15 +214,28 @@
147e83
     blt   cr6,5f
147e83
     srdi  7,6,16
147e83
     bgt	  cr6,3f
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    sth   7,0(3)
147e83
+#else
147e83
     sth   6,0(3)
147e83
+#endif
147e83
     b     7f
147e83
     .align  4
147e83
 3:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rotlwi 6,6,24
147e83
+    stb   6,0(3)
147e83
+    sth   7,1(3)
147e83
+#else
147e83
     stb   7,0(3)
147e83
     sth   6,1(3)
147e83
+#endif
147e83
     b     7f
147e83
     .align  4
147e83
 5:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rotlwi 6,6,8
147e83
+#endif
147e83
     stb   6,0(3)
147e83
 7:
147e83
     cmpldi	cr1,10,16
147e83
@@ -258,11 +269,11 @@
147e83
     lwz   6,0(12)
147e83
     addi  12,12,4
147e83
     stw   6,0(3)
147e83
-    addi  3,3,4    
147e83
+    addi  3,3,4
147e83
 2:  /* Move 2-3 bytes.  */
147e83
     bf    30,1f
147e83
     lhz   6,0(12)
147e83
-    sth   6,0(3) 
147e83
+    sth   6,0(3)
147e83
     bf    31,0f
147e83
     lbz   7,2(12)
147e83
     stb   7,2(3)
147e83
@@ -283,8 +294,8 @@
147e83
     mr    12,4
147e83
     bne   cr6,4f
147e83
 /* Would have liked to use use ld/std here but the 630 processors are
147e83
-   slow for load/store doubles that are not at least word aligned.  
147e83
-   Unaligned Load/Store word execute with only a 1 cycle penaltity.  */
147e83
+   slow for load/store doubles that are not at least word aligned.
147e83
+   Unaligned Load/Store word execute with only a 1 cycle penalty.  */
147e83
     lwz   6,0(4)
147e83
     lwz   7,4(4)
147e83
     stw   6,0(3)
147e83
@@ -299,14 +310,14 @@
147e83
 6:
147e83
     bf    30,5f
147e83
     lhz   7,4(4)
147e83
-    sth   7,4(3) 
147e83
+    sth   7,4(3)
147e83
     bf    31,0f
147e83
     lbz   8,6(4)
147e83
     stb   8,6(3)
147e83
     ld 3,-16(1)
147e83
     blr
147e83
     .align  4
147e83
-5:  
147e83
+5:
147e83
     bf    31,0f
147e83
     lbz   6,4(4)
147e83
     stb   6,4(3)
147e83
@@ -336,13 +347,23 @@
147e83
     bf      30,1f
147e83
 
147e83
     /* there are at least two DWs to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srd     0,6,10
147e83
+    sld     8,7,9
147e83
+#else
147e83
     sld     0,6,10
147e83
     srd     8,7,9
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      6,16(5)
147e83
     std     0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srd     0,7,10
147e83
+    sld     8,6,9
147e83
+#else
147e83
     sld     0,7,10
147e83
     srd     8,6,9
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      7,24(5)
147e83
     std     0,8(4)
147e83
@@ -351,8 +372,13 @@
147e83
     blt     cr6,8f  /* if total DWs = 3, then bypass loop */
147e83
     bf      31,4f
147e83
     /* there is a third DW to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srd     0,6,10
147e83
+    sld     8,7,9
147e83
+#else
147e83
     sld     0,6,10
147e83
     srd     8,7,9
147e83
+#endif
147e83
     or      0,0,8
147e83
     std     0,0(4)
147e83
     mr      6,7
147e83
@@ -363,8 +389,13 @@
147e83
     b       4f
147e83
     .align 4
147e83
 1:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srd     0,6,10
147e83
+    sld     8,7,9
147e83
+#else
147e83
     sld     0,6,10
147e83
     srd     8,7,9
147e83
+#endif
147e83
     addi    5,5,16
147e83
     or      0,0,8
147e83
     bf      31,4f
147e83
@@ -375,23 +406,44 @@
147e83
     addi    4,4,8
147e83
     .align 4
147e83
 /* copy 32 bytes at a time */
147e83
-4:  sld   0,6,10
147e83
+4:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srd   0,6,10
147e83
+    sld   8,7,9
147e83
+#else
147e83
+    sld   0,6,10
147e83
     srd   8,7,9
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,0(5)
147e83
     std   0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srd   0,7,10
147e83
+    sld   8,6,9
147e83
+#else
147e83
     sld   0,7,10
147e83
     srd   8,6,9
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,8(5)
147e83
     std   0,8(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srd   0,6,10
147e83
+    sld   8,7,9
147e83
+#else
147e83
     sld   0,6,10
147e83
     srd   8,7,9
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,16(5)
147e83
     std   0,16(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srd   0,7,10
147e83
+    sld   8,6,9
147e83
+#else
147e83
     sld   0,7,10
147e83
     srd   8,6,9
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,24(5)
147e83
     std   0,24(4)
147e83
@@ -401,9 +453,14 @@
147e83
     .align 4
147e83
 8:
147e83
     /* calculate and store the final DW */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srd   0,6,10
147e83
+    sld   8,7,9
147e83
+#else
147e83
     sld   0,6,10
147e83
     srd   8,7,9
147e83
-    or    0,0,8  
147e83
+#endif
147e83
+    or    0,0,8
147e83
     std   0,0(4)
147e83
 3:
147e83
     rldicr 0,31,0,60
147e83
@@ -413,5 +470,5 @@
147e83
     ld 31,-8(1)
147e83
     ld 3,-16(1)
147e83
     blr
147e83
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
147e83
+END_GEN_TB (memcpy,TB_TOCLESS)
147e83
 libc_hidden_builtin_def (memcpy)
147e83
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S
147e83
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S	2014-05-29 13:05:27.000000000 -0500
147e83
@@ -1,5 +1,5 @@
147e83
 /* Optimized memcpy implementation for PowerPC64.
147e83
-   Copyright (C) 2003, 2006, 2007, 2011 Free Software Foundation, Inc.
147e83
+   Copyright (C) 2003-2014 Free Software Foundation, Inc.
147e83
    This file is part of the GNU C Library.
147e83
 
147e83
    The GNU C Library is free software; you can redistribute it and/or
147e83
@@ -17,52 +17,50 @@
147e83
    <http://www.gnu.org/licenses/>.  */
147e83
 
147e83
 #include <sysdep.h>
147e83
-#include <bp-sym.h>
147e83
-#include <bp-asm.h>
147e83
 
147e83
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
147e83
    Returns 'dst'.
147e83
 
147e83
-   Memcpy handles short copies (< 32-bytes) using a binary move blocks 
147e83
-   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled 
147e83
-   with the appropriate combination of byte and halfword load/stores. 
147e83
-   There is minimal effort to optimize the alignment of short moves.  
147e83
+   Memcpy handles short copies (< 32-bytes) using a binary move blocks
147e83
+   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
147e83
+   with the appropriate combination of byte and halfword load/stores.
147e83
+   There is minimal effort to optimize the alignment of short moves.
147e83
    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
147e83
-   of handling unligned load/stores that do not cross 32-byte boundries.
147e83
+   of handling unaligned load/stores that do not cross 32-byte boundaries.
147e83
 
147e83
    Longer moves (>= 32-bytes) justify the effort to get at least the
147e83
    destination doubleword (8-byte) aligned.  Further optimization is
147e83
-   posible when both source and destination are doubleword aligned.
147e83
-   Each case has a optimized unrolled loop.  
147e83
-     
147e83
-   For POWER6 unaligned loads will take a 20+ cycle hicup for any
147e83
+   possible when both source and destination are doubleword aligned.
147e83
+   Each case has an optimized unrolled loop.
147e83
+
147e83
+   For POWER6 unaligned loads will take a 20+ cycle hiccup for any
147e83
    L1 cache miss that crosses a 32- or 128-byte boundary.  Store
147e83
-   is more forgiving and does not take a hicup until page or 
147e83
-   segment boundaries.  So we require doubleword alignment for 
147e83
+   is more forgiving and does not take a hiccup until page or
147e83
+   segment boundaries.  So we require doubleword alignment for
147e83
    the source but may take a risk and only require word alignment
147e83
    for the destination.  */
147e83
 
147e83
 	.machine	"power6"
147e83
-EALIGN (BP_SYM (memcpy), 7, 0)
147e83
+EALIGN (memcpy, 7, 0)
147e83
 	CALL_MCOUNT 3
147e83
 
147e83
     cmpldi cr1,5,31
147e83
     neg   0,3
147e83
     std   3,-16(1)
147e83
     std   31,-8(1)
147e83
-    andi. 11,3,7	/* check alignement of dst.  */
147e83
+    andi. 11,3,7	/* check alignment of dst.  */
147e83
     clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
147e83
-    clrldi 10,4,61	/* check alignement of src.  */
147e83
+    clrldi 10,4,61	/* check alignment of src.  */
147e83
     cmpldi cr6,5,8
147e83
     ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
147e83
     mtcrf 0x01,0
147e83
-    cmpld cr6,10,11  
147e83
+    cmpld cr6,10,11
147e83
     srdi  9,5,3		/* Number of full double words remaining.  */
147e83
     beq   .L0
147e83
-  
147e83
+
147e83
     subf  5,0,5
147e83
-  /* Move 0-7 bytes as needed to get the destination doubleword alligned.
147e83
-     Duplicate some code to maximize fall-throught and minimize agen delays.  */
147e83
+  /* Move 0-7 bytes as needed to get the destination doubleword aligned.
147e83
+     Duplicate some code to maximize fall-through and minimize agen delays.  */
147e83
 1:  bf    31,2f
147e83
     lbz   6,0(4)
147e83
     stb   6,0(3)
147e83
@@ -78,7 +76,7 @@
147e83
     lwz   6,1(4)
147e83
     stw   6,1(3)
147e83
     b     0f
147e83
-    
147e83
+
147e83
 2:  bf    30,4f
147e83
     lhz   6,0(4)
147e83
     sth   6,0(3)
147e83
@@ -86,26 +84,26 @@
147e83
     lwz   6,2(4)
147e83
     stw   6,2(3)
147e83
     b     0f
147e83
-    
147e83
+
147e83
 4:  bf    29,0f
147e83
     lwz   6,0(4)
147e83
     stw   6,0(3)
147e83
-0: 
147e83
+0:
147e83
 /* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
147e83
     add   4,4,0
147e83
     add   3,3,0
147e83
-    
147e83
-    clrldi 10,4,61	/* check alignement of src again.  */     
147e83
+
147e83
+    clrldi 10,4,61	/* check alignment of src again.  */
147e83
     srdi  9,5,3	/* Number of full double words remaining.  */
147e83
-    
147e83
-  /* Copy doublewords from source to destination, assumpting the
147e83
+
147e83
+  /* Copy doublewords from source to destination, assuming the
147e83
      destination is aligned on a doubleword boundary.
147e83
 
147e83
      At this point we know there are at least 25 bytes left (32-7) to copy.
147e83
-     The next step is to determine if the source is also doubleword aligned. 
147e83
+     The next step is to determine if the source is also doubleword aligned.
147e83
      If not branch to the unaligned move code at .L6. which uses
147e83
      a load, shift, store strategy.
147e83
-     
147e83
+
147e83
      Otherwise source and destination are doubleword aligned, and we can
147e83
      the optimized doubleword copy loop.  */
147e83
     .align  4
147e83
@@ -123,14 +121,14 @@
147e83
      the main loop exits there may be a tail of 1-7 bytes. These byte
147e83
      are copied a word/halfword/byte at a time as needed to preserve
147e83
      alignment.
147e83
-     
147e83
+
147e83
      For POWER6 the L1 is store-through and the L2 is store-in.  The
147e83
      L2 is clocked at half CPU clock so we can store 16 bytes every
147e83
      other cycle.  POWER6 also has a load/store bypass so we can do
147e83
-     load, load, store, store every 2 cycles.  
147e83
-     
147e83
+     load, load, store, store every 2 cycles.
147e83
+
147e83
      The following code is sensitive to cache line alignment.  Do not
147e83
-     make any change with out first making sure thay don't result in
147e83
+     make any change without first making sure they don't result in
147e83
      splitting ld/std pairs across a cache line.  */
147e83
 
147e83
     mtcrf 0x02,5
147e83
@@ -273,7 +271,7 @@
147e83
     std   8,16+96(10)
147e83
     std   0,24+96(10)
147e83
     ble   cr5,L(das_loop_e)
147e83
-    
147e83
+
147e83
     mtctr   12
147e83
     .align  4
147e83
 L(das_loop2):
147e83
@@ -326,10 +324,10 @@
147e83
     .align  4
147e83
 L(das_tail):
147e83
     beq   cr1,0f
147e83
-    
147e83
+
147e83
 L(das_tail2):
147e83
 /*  At this point we have a tail of 0-7 bytes and we know that the
147e83
-    destiniation is double word aligned.  */
147e83
+    destination is double word aligned.  */
147e83
 4:  bf    29,2f
147e83
     lwz   6,0(4)
147e83
     stw   6,0(3)
147e83
@@ -344,7 +342,7 @@
147e83
     lbz   6,4(4)
147e83
     stb   6,4(3)
147e83
     b     0f
147e83
-  
147e83
+
147e83
 2:  bf    30,1f
147e83
     lhz   6,0(4)
147e83
     sth   6,0(3)
147e83
@@ -352,7 +350,7 @@
147e83
     lbz   6,2(4)
147e83
     stb   6,2(3)
147e83
     b     0f
147e83
-    
147e83
+
147e83
 1:  bf    31,0f
147e83
     lbz   6,0(4)
147e83
     stb   6,0(3)
147e83
@@ -361,7 +359,7 @@
147e83
     ld 3,-16(1)
147e83
     blr
147e83
 
147e83
-/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31 
147e83
+/* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
147e83
    bytes.  Each case is handled without loops, using binary (1,2,4,8)
147e83
    tests.
147e83
 
147e83
@@ -402,15 +400,28 @@
147e83
     blt   cr6,5f
147e83
     srdi  7,6,16
147e83
     bgt	  cr6,3f
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    sth   7,0(3)
147e83
+#else
147e83
     sth   6,0(3)
147e83
+#endif
147e83
     b     7f
147e83
     .align  4
147e83
 3:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rotlwi 6,6,24
147e83
+    stb   6,0(3)
147e83
+    sth   7,1(3)
147e83
+#else
147e83
     stb   7,0(3)
147e83
     sth   6,1(3)
147e83
+#endif
147e83
     b     7f
147e83
     .align  4
147e83
 5:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    rotlwi 6,6,8
147e83
+#endif
147e83
     stb   6,0(3)
147e83
 7:
147e83
     cmpldi	cr1,10,16
147e83
@@ -421,7 +432,7 @@
147e83
 /* At least 6 bytes left and the source is word aligned.  This allows
147e83
    some speculative loads up front.  */
147e83
 /* We need to special case the fall-through because the biggest delays
147e83
-   are due to address computation not being ready in time for the 
147e83
+   are due to address computation not being ready in time for the
147e83
    AGEN.  */
147e83
     lwz   6,0(12)
147e83
     lwz   7,4(12)
147e83
@@ -452,7 +463,7 @@
147e83
     ld    3,-16(1)
147e83
     blr
147e83
     .align  4
147e83
-L(dus_tail16p8):  /* less then 8 bytes left.  */
147e83
+L(dus_tail16p8):  /* less than 8 bytes left.  */
147e83
     beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
147e83
     cmpldi	cr1,10,20
147e83
     bf    29,L(dus_tail16p2)
147e83
@@ -466,7 +477,7 @@
147e83
     ld    3,-16(1)
147e83
     blr
147e83
     .align  4
147e83
-L(dus_tail16p4):  /* less then 4 bytes left.  */
147e83
+L(dus_tail16p4):  /* less than 4 bytes left.  */
147e83
     addi  12,12,24
147e83
     addi  3,3,24
147e83
     bgt   cr0,L(dus_tail2)
147e83
@@ -474,7 +485,7 @@
147e83
     ld    3,-16(1)
147e83
     blr
147e83
     .align  4
147e83
-L(dus_tail16p2):  /* 16 bytes moved, less then 4 bytes left.  */
147e83
+L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
147e83
     addi  12,12,16
147e83
     addi  3,3,16
147e83
     b     L(dus_tail2)
147e83
@@ -499,7 +510,7 @@
147e83
     ld    3,-16(1)
147e83
     blr
147e83
     .align  4
147e83
-L(dus_tail8p4):  /* less then 4 bytes left.  */
147e83
+L(dus_tail8p4):  /* less than 4 bytes left.  */
147e83
     addi  12,12,8
147e83
     addi  3,3,8
147e83
     bgt   cr1,L(dus_tail2)
147e83
@@ -510,14 +521,14 @@
147e83
     .align  4
147e83
 L(dus_tail4):  /* Move 4 bytes.  */
147e83
 /*  r6 already loaded speculatively.  If we are here we know there is
147e83
-    more then 4 bytes left.  So there is no need to test.  */
147e83
+    more than 4 bytes left.  So there is no need to test.  */
147e83
     addi  12,12,4
147e83
     stw   6,0(3)
147e83
     addi  3,3,4
147e83
 L(dus_tail2):  /* Move 2-3 bytes.  */
147e83
     bf    30,L(dus_tail1)
147e83
     lhz   6,0(12)
147e83
-    sth   6,0(3) 
147e83
+    sth   6,0(3)
147e83
     bf    31,L(dus_tailX)
147e83
     lbz   7,2(12)
147e83
     stb   7,2(3)
147e83
@@ -537,7 +548,7 @@
147e83
 .LE8:
147e83
     mr    12,4
147e83
     bne   cr6,L(dus_4)
147e83
-/* Exactly 8 bytes.  We may cross a 32-/128-byte boundry and take a ~20
147e83
+/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
147e83
    cycle delay.  This case should be rare and any attempt to avoid this
147e83
    would take most of 20 cycles any way.  */
147e83
     ld   6,0(4)
147e83
@@ -552,7 +563,7 @@
147e83
     stw   6,0(3)
147e83
     bf    30,L(dus_5)
147e83
     lhz   7,4(4)
147e83
-    sth   7,4(3) 
147e83
+    sth   7,4(3)
147e83
     bf    31,L(dus_0)
147e83
     lbz   8,6(4)
147e83
     stb   8,6(3)
147e83
@@ -590,20 +601,31 @@
147e83
     bge     cr0, L(du4_do)
147e83
     blt     cr5, L(du1_do)
147e83
     beq     cr5, L(du2_do)
147e83
-    b       L(du3_do) 
147e83
-       
147e83
+    b       L(du3_do)
147e83
+
147e83
     .align 4
147e83
 L(du1_do):
147e83
     bf      30,L(du1_1dw)
147e83
 
147e83
     /* there are at least two DWs to copy */
147e83
+    /* FIXME: can combine last shift and "or" into "rldimi" */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 8
147e83
+    sldi     8,7, 64-8
147e83
+#else
147e83
     sldi     0,6, 8
147e83
     srdi     8,7, 64-8
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      6,16(5)
147e83
     std     0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,7, 8
147e83
+    sldi     8,6, 64-8
147e83
+#else
147e83
     sldi     0,7, 8
147e83
     srdi     8,6, 64-8
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      7,24(5)
147e83
     std     0,8(4)
147e83
@@ -612,8 +634,13 @@
147e83
     blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
147e83
     bf      31,L(du1_loop)
147e83
     /* there is a third DW to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 8
147e83
+    sldi     8,7, 64-8
147e83
+#else
147e83
     sldi     0,6, 8
147e83
     srdi     8,7, 64-8
147e83
+#endif
147e83
     or      0,0,8
147e83
     std     0,0(4)
147e83
     mr      6,7
147e83
@@ -624,8 +651,13 @@
147e83
     b       L(du1_loop)
147e83
     .align 4
147e83
 L(du1_1dw):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 8
147e83
+    sldi     8,7, 64-8
147e83
+#else
147e83
     sldi     0,6, 8
147e83
     srdi     8,7, 64-8
147e83
+#endif
147e83
     addi    5,5,16
147e83
     or      0,0,8
147e83
     bf      31,L(du1_loop)
147e83
@@ -637,23 +669,43 @@
147e83
     .align 4
147e83
 /* copy 32 bytes at a time */
147e83
 L(du1_loop):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 8
147e83
+    sldi   8,7, 64-8
147e83
+#else
147e83
     sldi   0,6, 8
147e83
     srdi   8,7, 64-8
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,0(5)
147e83
     std   0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 8
147e83
+    sldi   8,6, 64-8
147e83
+#else
147e83
     sldi   0,7, 8
147e83
     srdi   8,6, 64-8
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,8(5)
147e83
     std   0,8(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 8
147e83
+    sldi   8,7, 64-8
147e83
+#else
147e83
     sldi   0,6, 8
147e83
     srdi   8,7, 64-8
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,16(5)
147e83
     std   0,16(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 8
147e83
+    sldi   8,6, 64-8
147e83
+#else
147e83
     sldi   0,7, 8
147e83
     srdi   8,6, 64-8
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,24(5)
147e83
     std   0,24(4)
147e83
@@ -663,9 +715,14 @@
147e83
     .align 4
147e83
 L(du1_fini):
147e83
     /* calculate and store the final DW */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 8
147e83
+    sldi   8,7, 64-8
147e83
+#else
147e83
     sldi   0,6, 8
147e83
     srdi   8,7, 64-8
147e83
-    or    0,0,8  
147e83
+#endif
147e83
+    or    0,0,8
147e83
     std   0,0(4)
147e83
     b     L(du_done)
147e83
 
147e83
@@ -674,13 +731,23 @@
147e83
     bf      30,L(du2_1dw)
147e83
 
147e83
     /* there are at least two DWs to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 16
147e83
+    sldi     8,7, 64-16
147e83
+#else
147e83
     sldi     0,6, 16
147e83
     srdi     8,7, 64-16
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      6,16(5)
147e83
     std     0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,7, 16
147e83
+    sldi     8,6, 64-16
147e83
+#else
147e83
     sldi     0,7, 16
147e83
     srdi     8,6, 64-16
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      7,24(5)
147e83
     std     0,8(4)
147e83
@@ -689,8 +756,13 @@
147e83
     blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
147e83
     bf      31,L(du2_loop)
147e83
     /* there is a third DW to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 16
147e83
+    sldi     8,7, 64-16
147e83
+#else
147e83
     sldi     0,6, 16
147e83
     srdi     8,7, 64-16
147e83
+#endif
147e83
     or      0,0,8
147e83
     std     0,0(4)
147e83
     mr      6,7
147e83
@@ -701,8 +773,13 @@
147e83
     b       L(du2_loop)
147e83
     .align 4
147e83
 L(du2_1dw):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 16
147e83
+    sldi     8,7, 64-16
147e83
+#else
147e83
     sldi     0,6, 16
147e83
     srdi     8,7, 64-16
147e83
+#endif
147e83
     addi    5,5,16
147e83
     or      0,0,8
147e83
     bf      31,L(du2_loop)
147e83
@@ -714,23 +791,43 @@
147e83
     .align 4
147e83
 /* copy 32 bytes at a time */
147e83
 L(du2_loop):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 16
147e83
+    sldi   8,7, 64-16
147e83
+#else
147e83
     sldi   0,6, 16
147e83
     srdi   8,7, 64-16
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,0(5)
147e83
     std   0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 16
147e83
+    sldi   8,6, 64-16
147e83
+#else
147e83
     sldi   0,7, 16
147e83
     srdi   8,6, 64-16
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,8(5)
147e83
     std   0,8(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 16
147e83
+    sldi   8,7, 64-16
147e83
+#else
147e83
     sldi   0,6, 16
147e83
     srdi   8,7, 64-16
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,16(5)
147e83
     std   0,16(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 16
147e83
+    sldi   8,6, 64-16
147e83
+#else
147e83
     sldi   0,7, 16
147e83
     srdi   8,6, 64-16
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,24(5)
147e83
     std   0,24(4)
147e83
@@ -740,9 +837,14 @@
147e83
     .align 4
147e83
 L(du2_fini):
147e83
     /* calculate and store the final DW */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 16
147e83
+    sldi   8,7, 64-16
147e83
+#else
147e83
     sldi   0,6, 16
147e83
     srdi   8,7, 64-16
147e83
-    or    0,0,8  
147e83
+#endif
147e83
+    or    0,0,8
147e83
     std   0,0(4)
147e83
     b     L(du_done)
147e83
 
147e83
@@ -751,13 +853,23 @@
147e83
     bf      30,L(du3_1dw)
147e83
 
147e83
     /* there are at least two DWs to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 24
147e83
+    sldi     8,7, 64-24
147e83
+#else
147e83
     sldi     0,6, 24
147e83
     srdi     8,7, 64-24
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      6,16(5)
147e83
     std     0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,7, 24
147e83
+    sldi     8,6, 64-24
147e83
+#else
147e83
     sldi     0,7, 24
147e83
     srdi     8,6, 64-24
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      7,24(5)
147e83
     std     0,8(4)
147e83
@@ -766,8 +878,13 @@
147e83
     blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
147e83
     bf      31,L(du3_loop)
147e83
     /* there is a third DW to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 24
147e83
+    sldi     8,7, 64-24
147e83
+#else
147e83
     sldi     0,6, 24
147e83
     srdi     8,7, 64-24
147e83
+#endif
147e83
     or      0,0,8
147e83
     std     0,0(4)
147e83
     mr      6,7
147e83
@@ -778,8 +895,13 @@
147e83
     b       L(du3_loop)
147e83
     .align 4
147e83
 L(du3_1dw):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 24
147e83
+    sldi     8,7, 64-24
147e83
+#else
147e83
     sldi     0,6, 24
147e83
     srdi     8,7, 64-24
147e83
+#endif
147e83
     addi    5,5,16
147e83
     or      0,0,8
147e83
     bf      31,L(du3_loop)
147e83
@@ -791,23 +913,43 @@
147e83
     .align 4
147e83
 /* copy 32 bytes at a time */
147e83
 L(du3_loop):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 24
147e83
+    sldi   8,7, 64-24
147e83
+#else
147e83
     sldi   0,6, 24
147e83
     srdi   8,7, 64-24
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,0(5)
147e83
     std   0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 24
147e83
+    sldi   8,6, 64-24
147e83
+#else
147e83
     sldi   0,7, 24
147e83
     srdi   8,6, 64-24
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,8(5)
147e83
     std   0,8(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 24
147e83
+    sldi   8,7, 64-24
147e83
+#else
147e83
     sldi   0,6, 24
147e83
     srdi   8,7, 64-24
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,16(5)
147e83
     std   0,16(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 24
147e83
+    sldi   8,6, 64-24
147e83
+#else
147e83
     sldi   0,7, 24
147e83
     srdi   8,6, 64-24
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,24(5)
147e83
     std   0,24(4)
147e83
@@ -817,9 +959,14 @@
147e83
     .align 4
147e83
 L(du3_fini):
147e83
     /* calculate and store the final DW */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 24
147e83
+    sldi   8,7, 64-24
147e83
+#else
147e83
     sldi   0,6, 24
147e83
     srdi   8,7, 64-24
147e83
-    or    0,0,8  
147e83
+#endif
147e83
+    or    0,0,8
147e83
     std   0,0(4)
147e83
     b     L(du_done)
147e83
 
147e83
@@ -834,13 +981,23 @@
147e83
     bf      30,L(du4_1dw)
147e83
 
147e83
     /* there are at least two DWs to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 32
147e83
+    sldi     8,7, 64-32
147e83
+#else
147e83
     sldi     0,6, 32
147e83
     srdi     8,7, 64-32
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      6,16(5)
147e83
     std     0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,7, 32
147e83
+    sldi     8,6, 64-32
147e83
+#else
147e83
     sldi     0,7, 32
147e83
     srdi     8,6, 64-32
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      7,24(5)
147e83
     std     0,8(4)
147e83
@@ -849,8 +1006,13 @@
147e83
     blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
147e83
     bf      31,L(du4_loop)
147e83
     /* there is a third DW to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 32
147e83
+    sldi     8,7, 64-32
147e83
+#else
147e83
     sldi     0,6, 32
147e83
     srdi     8,7, 64-32
147e83
+#endif
147e83
     or      0,0,8
147e83
     std     0,0(4)
147e83
     mr      6,7
147e83
@@ -861,8 +1023,13 @@
147e83
     b       L(du4_loop)
147e83
     .align 4
147e83
 L(du4_1dw):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 32
147e83
+    sldi     8,7, 64-32
147e83
+#else
147e83
     sldi     0,6, 32
147e83
     srdi     8,7, 64-32
147e83
+#endif
147e83
     addi    5,5,16
147e83
     or      0,0,8
147e83
     bf      31,L(du4_loop)
147e83
@@ -874,23 +1041,43 @@
147e83
     .align 4
147e83
 /* copy 32 bytes at a time */
147e83
 L(du4_loop):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 32
147e83
+    sldi   8,7, 64-32
147e83
+#else
147e83
     sldi   0,6, 32
147e83
     srdi   8,7, 64-32
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,0(5)
147e83
     std   0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 32
147e83
+    sldi   8,6, 64-32
147e83
+#else
147e83
     sldi   0,7, 32
147e83
     srdi   8,6, 64-32
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,8(5)
147e83
     std   0,8(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 32
147e83
+    sldi   8,7, 64-32
147e83
+#else
147e83
     sldi   0,6, 32
147e83
     srdi   8,7, 64-32
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,16(5)
147e83
     std   0,16(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 32
147e83
+    sldi   8,6, 64-32
147e83
+#else
147e83
     sldi   0,7, 32
147e83
     srdi   8,6, 64-32
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,24(5)
147e83
     std   0,24(4)
147e83
@@ -900,9 +1087,14 @@
147e83
     .align 4
147e83
 L(du4_fini):
147e83
     /* calculate and store the final DW */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 32
147e83
+    sldi   8,7, 64-32
147e83
+#else
147e83
     sldi   0,6, 32
147e83
     srdi   8,7, 64-32
147e83
-    or    0,0,8  
147e83
+#endif
147e83
+    or    0,0,8
147e83
     std   0,0(4)
147e83
     b     L(du_done)
147e83
 
147e83
@@ -911,13 +1103,23 @@
147e83
     bf      30,L(du5_1dw)
147e83
 
147e83
     /* there are at least two DWs to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 40
147e83
+    sldi     8,7, 64-40
147e83
+#else
147e83
     sldi     0,6, 40
147e83
     srdi     8,7, 64-40
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      6,16(5)
147e83
     std     0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,7, 40
147e83
+    sldi     8,6, 64-40
147e83
+#else
147e83
     sldi     0,7, 40
147e83
     srdi     8,6, 64-40
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      7,24(5)
147e83
     std     0,8(4)
147e83
@@ -926,8 +1128,13 @@
147e83
     blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
147e83
     bf      31,L(du5_loop)
147e83
     /* there is a third DW to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 40
147e83
+    sldi     8,7, 64-40
147e83
+#else
147e83
     sldi     0,6, 40
147e83
     srdi     8,7, 64-40
147e83
+#endif
147e83
     or      0,0,8
147e83
     std     0,0(4)
147e83
     mr      6,7
147e83
@@ -938,8 +1145,13 @@
147e83
     b       L(du5_loop)
147e83
     .align 4
147e83
 L(du5_1dw):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 40
147e83
+    sldi     8,7, 64-40
147e83
+#else
147e83
     sldi     0,6, 40
147e83
     srdi     8,7, 64-40
147e83
+#endif
147e83
     addi    5,5,16
147e83
     or      0,0,8
147e83
     bf      31,L(du5_loop)
147e83
@@ -951,23 +1163,43 @@
147e83
     .align 4
147e83
 /* copy 32 bytes at a time */
147e83
 L(du5_loop):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 40
147e83
+    sldi   8,7, 64-40
147e83
+#else
147e83
     sldi   0,6, 40
147e83
     srdi   8,7, 64-40
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,0(5)
147e83
     std   0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 40
147e83
+    sldi   8,6, 64-40
147e83
+#else
147e83
     sldi   0,7, 40
147e83
     srdi   8,6, 64-40
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,8(5)
147e83
     std   0,8(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 40
147e83
+    sldi   8,7, 64-40
147e83
+#else
147e83
     sldi   0,6, 40
147e83
     srdi   8,7, 64-40
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,16(5)
147e83
     std   0,16(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 40
147e83
+    sldi   8,6, 64-40
147e83
+#else
147e83
     sldi   0,7, 40
147e83
     srdi   8,6, 64-40
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,24(5)
147e83
     std   0,24(4)
147e83
@@ -977,9 +1209,14 @@
147e83
     .align 4
147e83
 L(du5_fini):
147e83
     /* calculate and store the final DW */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 40
147e83
+    sldi   8,7, 64-40
147e83
+#else
147e83
     sldi   0,6, 40
147e83
     srdi   8,7, 64-40
147e83
-    or    0,0,8  
147e83
+#endif
147e83
+    or    0,0,8
147e83
     std   0,0(4)
147e83
     b     L(du_done)
147e83
 
147e83
@@ -988,13 +1225,23 @@
147e83
     bf      30,L(du6_1dw)
147e83
 
147e83
     /* there are at least two DWs to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 48
147e83
+    sldi     8,7, 64-48
147e83
+#else
147e83
     sldi     0,6, 48
147e83
     srdi     8,7, 64-48
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      6,16(5)
147e83
     std     0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,7, 48
147e83
+    sldi     8,6, 64-48
147e83
+#else
147e83
     sldi     0,7, 48
147e83
     srdi     8,6, 64-48
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      7,24(5)
147e83
     std     0,8(4)
147e83
@@ -1003,8 +1250,13 @@
147e83
     blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
147e83
     bf      31,L(du6_loop)
147e83
     /* there is a third DW to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 48
147e83
+    sldi     8,7, 64-48
147e83
+#else
147e83
     sldi     0,6, 48
147e83
     srdi     8,7, 64-48
147e83
+#endif
147e83
     or      0,0,8
147e83
     std     0,0(4)
147e83
     mr      6,7
147e83
@@ -1015,8 +1267,13 @@
147e83
     b       L(du6_loop)
147e83
     .align 4
147e83
 L(du6_1dw):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 48
147e83
+    sldi     8,7, 64-48
147e83
+#else
147e83
     sldi     0,6, 48
147e83
     srdi     8,7, 64-48
147e83
+#endif
147e83
     addi    5,5,16
147e83
     or      0,0,8
147e83
     bf      31,L(du6_loop)
147e83
@@ -1028,23 +1285,43 @@
147e83
     .align 4
147e83
 /* copy 32 bytes at a time */
147e83
 L(du6_loop):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 48
147e83
+    sldi   8,7, 64-48
147e83
+#else
147e83
     sldi   0,6, 48
147e83
     srdi   8,7, 64-48
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,0(5)
147e83
     std   0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 48
147e83
+    sldi   8,6, 64-48
147e83
+#else
147e83
     sldi   0,7, 48
147e83
     srdi   8,6, 64-48
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,8(5)
147e83
     std   0,8(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 48
147e83
+    sldi   8,7, 64-48
147e83
+#else
147e83
     sldi   0,6, 48
147e83
     srdi   8,7, 64-48
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,16(5)
147e83
     std   0,16(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 48
147e83
+    sldi   8,6, 64-48
147e83
+#else
147e83
     sldi   0,7, 48
147e83
     srdi   8,6, 64-48
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,24(5)
147e83
     std   0,24(4)
147e83
@@ -1054,9 +1331,14 @@
147e83
     .align 4
147e83
 L(du6_fini):
147e83
     /* calculate and store the final DW */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 48
147e83
+    sldi   8,7, 64-48
147e83
+#else
147e83
     sldi   0,6, 48
147e83
     srdi   8,7, 64-48
147e83
-    or    0,0,8  
147e83
+#endif
147e83
+    or    0,0,8
147e83
     std   0,0(4)
147e83
     b     L(du_done)
147e83
 
147e83
@@ -1065,13 +1347,23 @@
147e83
     bf      30,L(du7_1dw)
147e83
 
147e83
     /* there are at least two DWs to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 56
147e83
+    sldi     8,7, 64-56
147e83
+#else
147e83
     sldi     0,6, 56
147e83
     srdi     8,7, 64-56
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      6,16(5)
147e83
     std     0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,7, 56
147e83
+    sldi     8,6, 64-56
147e83
+#else
147e83
     sldi     0,7, 56
147e83
     srdi     8,6, 64-56
147e83
+#endif
147e83
     or      0,0,8
147e83
     ld      7,24(5)
147e83
     std     0,8(4)
147e83
@@ -1080,8 +1372,13 @@
147e83
     blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
147e83
     bf      31,L(du7_loop)
147e83
     /* there is a third DW to copy */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 56
147e83
+    sldi     8,7, 64-56
147e83
+#else
147e83
     sldi     0,6, 56
147e83
     srdi     8,7, 64-56
147e83
+#endif
147e83
     or      0,0,8
147e83
     std     0,0(4)
147e83
     mr      6,7
147e83
@@ -1092,8 +1389,13 @@
147e83
     b       L(du7_loop)
147e83
     .align 4
147e83
 L(du7_1dw):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi     0,6, 56
147e83
+    sldi     8,7, 64-56
147e83
+#else
147e83
     sldi     0,6, 56
147e83
     srdi     8,7, 64-56
147e83
+#endif
147e83
     addi    5,5,16
147e83
     or      0,0,8
147e83
     bf      31,L(du7_loop)
147e83
@@ -1105,23 +1407,43 @@
147e83
     .align 4
147e83
 /* copy 32 bytes at a time */
147e83
 L(du7_loop):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 56
147e83
+    sldi   8,7, 64-56
147e83
+#else
147e83
     sldi   0,6, 56
147e83
     srdi   8,7, 64-56
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,0(5)
147e83
     std   0,0(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 56
147e83
+    sldi   8,6, 64-56
147e83
+#else
147e83
     sldi   0,7, 56
147e83
     srdi   8,6, 64-56
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,8(5)
147e83
     std   0,8(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 56
147e83
+    sldi   8,7, 64-56
147e83
+#else
147e83
     sldi   0,6, 56
147e83
     srdi   8,7, 64-56
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    6,16(5)
147e83
     std   0,16(4)
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,7, 56
147e83
+    sldi   8,6, 64-56
147e83
+#else
147e83
     sldi   0,7, 56
147e83
     srdi   8,6, 64-56
147e83
+#endif
147e83
     or    0,0,8
147e83
     ld    7,24(5)
147e83
     std   0,24(4)
147e83
@@ -1131,12 +1453,17 @@
147e83
     .align 4
147e83
 L(du7_fini):
147e83
     /* calculate and store the final DW */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+    srdi   0,6, 56
147e83
+    sldi   8,7, 64-56
147e83
+#else
147e83
     sldi   0,6, 56
147e83
     srdi   8,7, 64-56
147e83
-    or    0,0,8  
147e83
+#endif
147e83
+    or    0,0,8
147e83
     std   0,0(4)
147e83
     b     L(du_done)
147e83
-    
147e83
+
147e83
     .align 4
147e83
 L(du_done):
147e83
     rldicr 0,31,0,60
147e83
@@ -1144,9 +1471,9 @@
147e83
     beq   cr1,0f	/* If the tail is 0 bytes we are done!  */
147e83
 
147e83
     add   3,3,0
147e83
-    add   12,12,0    
147e83
+    add   12,12,0
147e83
 /*  At this point we have a tail of 0-7 bytes and we know that the
147e83
-    destiniation is double word aligned.  */
147e83
+    destination is double word aligned.  */
147e83
 4:  bf    29,2f
147e83
     lwz   6,0(12)
147e83
     addi  12,12,4
147e83
@@ -1165,5 +1492,5 @@
147e83
     ld 31,-8(1)
147e83
     ld 3,-16(1)
147e83
     blr
147e83
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
147e83
+END_GEN_TB (memcpy,TB_TOCLESS)
147e83
 libc_hidden_builtin_def (memcpy)
147e83
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S
147e83
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S	2014-05-29 13:05:40.000000000 -0500
147e83
@@ -1,5 +1,5 @@
147e83
 /* Optimized memcpy implementation for PowerPC64/POWER7.
147e83
-   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
147e83
+   Copyright (C) 2010-2014 Free Software Foundation, Inc.
147e83
    Contributed by Luis Machado <luisgpm@br.ibm.com>.
147e83
    This file is part of the GNU C Library.
147e83
 
147e83
@@ -18,425 +18,366 @@
147e83
    <http://www.gnu.org/licenses/>.  */
147e83
 
147e83
 #include <sysdep.h>
147e83
-#include <bp-sym.h>
147e83
-#include <bp-asm.h>
147e83
 
147e83
 
147e83
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
147e83
    Returns 'dst'.  */
147e83
 
147e83
+#define dst 11		/* Working destination pointer.  Use r11 so r3 is kept unchanged as the return value.  */
147e83
+#define src 4		/* Source pointer (r4), advanced as data is copied.  */
147e83
+#define cnt 5		/* Byte count (r5), reduced when head bytes are copied.  */
147e83
+
147e83
 	.machine power7
147e83
-EALIGN (BP_SYM (memcpy), 5, 0)
147e83
+EALIGN (memcpy, 5, 0)
147e83
 	CALL_MCOUNT 3
147e83
 
147e83
-	cmpldi  cr1,5,31
147e83
+	cmpldi	cr1,cnt,31
147e83
 	neg	0,3
147e83
-	std	3,-16(1)
147e83
-	std	31,-8(1)
147e83
-	cfi_offset(31,-8)
147e83
 	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
147e83
 				    code.  */
147e83
 
147e83
-	andi.   11,3,7	      /* Check alignment of DST.  */
147e83
-
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
147e83
+   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
147e83
+   loop is only used for quadword aligned copies.  */
147e83
+	andi.	10,3,15		/* r10 = DST & 15, and cr0 records whether that offset is zero.  */
147e83
+	clrldi	11,4,60		/* r11 = SRC & 15.  */
147e83
+#else
147e83
+	andi.	10,3,7		/* Check alignment of DST.  */
147e83
+	clrldi	11,4,61		/* Check alignment of SRC.  */
147e83
+#endif
147e83
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
147e83
 
147e83
-	clrldi  10,4,61       /* Check alignment of SRC.  */
147e83
-	cmpld   cr6,10,11     /* SRC and DST alignments match?  */
147e83
-	mr	12,4
147e83
-	mr	31,5
147e83
+	mr	dst,3		/* Work on a copy of DST in r11 so that r3 still holds the return value.  */
147e83
 	bne	cr6,L(copy_GE_32_unaligned)
147e83
+	beq	L(aligned_copy)	/* cr0 is still from the andi. above: DST offset is zero, so there are no head bytes to copy.  */
147e83
 
147e83
-	srdi    9,5,3	      /* Number of full quadwords remaining.  */
147e83
-
147e83
-	beq    L(copy_GE_32_aligned_cont)
147e83
-
147e83
-	clrldi  0,0,61
147e83
-	mtcrf   0x01,0
147e83
-	subf    31,0,5
147e83
-
147e83
-	/* Get the SRC aligned to 8 bytes.  */
147e83
-
147e83
-1:	bf	31,2f
147e83
-	lbz	6,0(12)
147e83
-	addi    12,12,1
147e83
-	stb	6,0(3)
147e83
-	addi    3,3,1
147e83
-2:	bf      30,4f
147e83
-	lhz     6,0(12)
147e83
-	addi    12,12,2
147e83
-	sth     6,0(3)
147e83
-	addi    3,3,2
147e83
-4:	bf      29,0f
147e83
-	lwz     6,0(12)
147e83
-	addi    12,12,4
147e83
-	stw     6,0(3)
147e83
-	addi    3,3,4
147e83
-0:
147e83
-	clrldi  10,12,61      /* Check alignment of SRC again.  */
147e83
-	srdi    9,31,3	      /* Number of full doublewords remaining.  */
147e83
-
147e83
-L(copy_GE_32_aligned_cont):
147e83
-
147e83
-	clrldi  11,31,61
147e83
-	mtcrf   0x01,9
147e83
-
147e83
-	srdi    8,31,5
147e83
-	cmpldi  cr1,9,4
147e83
-	cmpldi  cr6,11,0
147e83
-	mr	11,12
147e83
+	mtocrf	0x01,0		/* CR7 = low 4 bits of r0, where r0 = -DST from the neg at function entry.  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	clrldi	0,0,60		/* r0 = (-DST) & 15, the bytes needed to reach a 16-byte boundary.  */
147e83
+#else
147e83
+	clrldi	0,0,61		/* r0 = (-DST) & 7, the bytes needed to reach an 8-byte boundary.  */
147e83
+#endif
147e83
 
147e83
-	/* Copy 1~3 doublewords so the main loop starts
147e83
-	at a multiple of 32 bytes.  */
147e83
-
147e83
-	bf	30,1f
147e83
-	ld      6,0(12)
147e83
-	ld      7,8(12)
147e83
-	addi    11,12,16
147e83
-	mtctr   8
147e83
-	std     6,0(3)
147e83
-	std     7,8(3)
147e83
-	addi    10,3,16
147e83
-	bf      31,4f
147e83
-	ld      0,16(12)
147e83
-	std     0,16(3)
147e83
-	blt     cr1,3f
147e83
-	addi    11,12,24
147e83
-	addi    10,3,24
147e83
-	b       4f
147e83
-
147e83
-	.align  4
147e83
-1:	/* Copy 1 doubleword and set the counter.  */
147e83
-	mr	10,3
147e83
-	mtctr   8
147e83
-	bf      31,4f
147e83
-	ld      6,0(12)
147e83
-	addi    11,12,8
147e83
-	std     6,0(3)
147e83
-	addi    10,3,8
147e83
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
147e83
+1:
147e83
+	bf	31,2f		/* CR7 bit 31 clear: the head length has no 1-byte piece.  */
147e83
+	lbz	6,0(src)
147e83
+	addi	src,src,1
147e83
+	stb	6,0(dst)
147e83
+	addi	dst,dst,1
147e83
+2:
147e83
+	bf	30,4f		/* CR7 bit 30 clear: no 2-byte piece.  */
147e83
+	lhz	6,0(src)
147e83
+	addi	src,src,2
147e83
+	sth	6,0(dst)
147e83
+	addi	dst,dst,2
147e83
+4:
147e83
+	bf	29,8f		/* CR7 bit 29 clear: no 4-byte piece.  */
147e83
+	lwz	6,0(src)
147e83
+	addi	src,src,4
147e83
+	stw	6,0(dst)
147e83
+	addi	dst,dst,4
147e83
+8:
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	bf	28,16f		/* CR7 bit 28 clear: no 8-byte piece (only tested when aligning to 16 bytes).  */
147e83
+	ld	6,0(src)
147e83
+	addi	src,src,8
147e83
+	std	6,0(dst)
147e83
+	addi	dst,dst,8
147e83
+16:
147e83
+#endif
147e83
+	subf	cnt,0,cnt	/* cnt -= r0, the number of head bytes just copied.  */
147e83
 
147e83
+/* Main aligned copy loop. Copies 128 bytes at a time. */
147e83
 L(aligned_copy):
147e83
-	/* Main aligned copy loop. Copies up to 128-bytes at a time. */
147e83
-	.align  4
147e83
-4:
147e83
-	/* check for any 32-byte or 64-byte lumps that are outside of a
147e83
-	   nice 128-byte range.  R8 contains the number of 32-byte
147e83
-	   lumps, so drop this into the CR, and use the SO/EQ bits to help
147e83
-	   handle the 32- or 64- byte lumps.  Then handle the rest with an
147e83
-	   unrolled 128-bytes-at-a-time copy loop. */
147e83
-	mtocrf	1,8
147e83
-	li	6,16	# 16() index
147e83
-	li	7,32	# 32() index
147e83
-	li	8,48	# 48() index
147e83
-
147e83
-L(aligned_32byte):
147e83
-	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
147e83
-	bns	cr7,L(aligned_64byte)
147e83
-	lxvd2x	6,0,11
147e83
-	lxvd2x	7,11,6
147e83
-	addi	11,11,32
147e83
-	stxvd2x	6,0,10
147e83
-	stxvd2x	7,10,6
147e83
-	addi	10,10,32
147e83
-
147e83
-L(aligned_64byte):
147e83
-	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
147e83
-	bne	cr7,L(aligned_128setup)
147e83
-	lxvd2x	6,0,11
147e83
-	lxvd2x	7,11,6
147e83
-	lxvd2x	8,11,7
147e83
-	lxvd2x	9,11,8
147e83
-	addi	11,11,64
147e83
-	stxvd2x	6,0,10
147e83
-	stxvd2x	7,10,6
147e83
-	stxvd2x	8,10,7
147e83
-	stxvd2x	9,10,8
147e83
-	addi	10,10,64
147e83
-
147e83
-L(aligned_128setup):
147e83
-	/* Set up for the 128-byte at a time copy loop.  */
147e83
-	srdi	8,31,7
147e83
-	cmpdi	8,0	# Any 4x lumps left?
147e83
-	beq	3f	# if not, move along.
147e83
-	lxvd2x	6,0,11
147e83
-	lxvd2x	7,11,6
147e83
-	mtctr	8	# otherwise, load the ctr and begin.
147e83
-	li	8,48	# 48() index
147e83
+	li	6,16		/* r6 = 16, index offset of the 2nd quadword.  */
147e83
+	li	7,32		/* r7 = 32, index offset of the 3rd quadword.  */
147e83
+	li	8,48		/* r8 = 48, index offset of the 4th quadword.  */
147e83
+	mtocrf	0x02,cnt	/* CR6 = cnt bits worth 128/64/32/16.  The 64/32/16 bits are tested at aligned_tail.  */
147e83
+	srdi	12,cnt,7	/* r12 = cnt / 128, the number of 128-byte iterations.  */
147e83
+	cmpdi	12,0
147e83
+	beq	L(aligned_tail)	/* Fewer than 128 bytes: go straight to the tail.  */
147e83
+	lxvd2x	6,0,src		/* Preload vs6 with bytes 0~15 for the loop entry.  */
147e83
+	lxvd2x	7,src,6		/* Preload vs7 with bytes 16~31.  */
147e83
+	mtctr	12		/* CTR = 128-byte iteration count, consumed by the bdnz of the loop.  */
147e83
 	b	L(aligned_128loop)
147e83
 
147e83
+	.align  4
147e83
 L(aligned_128head):
147e83
 	/* for the 2nd + iteration of this loop. */
147e83
-	lxvd2x	6,0,11
147e83
-	lxvd2x	7,11,6
147e83
+	lxvd2x	6,0,src
147e83
+	lxvd2x	7,src,6
147e83
 L(aligned_128loop):
147e83
-	lxvd2x	8,11,7
147e83
-	lxvd2x	9,11,8
147e83
-	stxvd2x	6,0,10
147e83
-	addi	11,11,64
147e83
-	stxvd2x	7,10,6
147e83
-	stxvd2x	8,10,7
147e83
-	stxvd2x	9,10,8
147e83
-	lxvd2x	6,0,11
147e83
-	lxvd2x	7,11,6
147e83
-	addi	10,10,64
147e83
-	lxvd2x	8,11,7
147e83
-	lxvd2x	9,11,8
147e83
-	addi	11,11,64
147e83
-	stxvd2x	6,0,10
147e83
-	stxvd2x	7,10,6
147e83
-	stxvd2x	8,10,7
147e83
-	stxvd2x	9,10,8
147e83
-	addi	10,10,64
147e83
+	lxvd2x	8,src,7
147e83
+	lxvd2x	9,src,8
147e83
+	stxvd2x	6,0,dst
147e83
+	addi	src,src,64
147e83
+	stxvd2x	7,dst,6
147e83
+	stxvd2x	8,dst,7
147e83
+	stxvd2x	9,dst,8
147e83
+	lxvd2x	6,0,src
147e83
+	lxvd2x	7,src,6
147e83
+	addi	dst,dst,64
147e83
+	lxvd2x	8,src,7
147e83
+	lxvd2x	9,src,8
147e83
+	addi	src,src,64
147e83
+	stxvd2x	6,0,dst
147e83
+	stxvd2x	7,dst,6
147e83
+	stxvd2x	8,dst,7
147e83
+	stxvd2x	9,dst,8
147e83
+	addi	dst,dst,64
147e83
 	bdnz	L(aligned_128head)
147e83
 
147e83
-3:
147e83
-	/* Check for tail bytes.  */
147e83
-	rldicr  0,31,0,60
147e83
-	mtcrf   0x01,31
147e83
-	beq	cr6,0f
147e83
-
147e83
-.L9:
147e83
-	add	3,3,0
147e83
-	add	12,12,0
147e83
-
147e83
-	/*  At this point we have a tail of 0-7 bytes and we know that the
147e83
-	destination is doubleword-aligned.  */
147e83
-4:	/* Copy 4 bytes.  */
147e83
-	bf	29,2f
147e83
-
147e83
-	lwz     6,0(12)
147e83
-	addi    12,12,4
147e83
-	stw     6,0(3)
147e83
-	addi    3,3,4
147e83
-2:	/* Copy 2 bytes.  */
147e83
-	bf	30,1f
147e83
-
147e83
-	lhz     6,0(12)
147e83
-	addi    12,12,2
147e83
-	sth     6,0(3)
147e83
-	addi    3,3,2
147e83
-1:	/* Copy 1 byte.  */
147e83
-	bf	31,0f
147e83
-
147e83
-	lbz	6,0(12)
147e83
-	stb	6,0(3)
147e83
-0:	/* Return original DST pointer.  */
147e83
-	ld	31,-8(1)
147e83
-	ld	3,-16(1)
147e83
+L(aligned_tail):	/* Copy what is left after the 128-byte loop, i.e. cnt mod 128 bytes.  */
147e83
+	mtocrf	0x01,cnt	/* CR7 = low 4 bits of cnt (the 8/4/2/1-byte flags).  */
147e83
+	bf	25,32f		/* CR6 bit for 64 (set by mtocrf 0x02,cnt) is clear: skip the 64-byte copy.  */
147e83
+	lxvd2x	6,0,src		/* vs6..vs9 = 64 bytes at src, using r6/r7/r8 = 16/32/48 as index offsets.  */
147e83
+	lxvd2x	7,src,6
147e83
+	lxvd2x	8,src,7
147e83
+	lxvd2x	9,src,8
147e83
+	addi	src,src,64
147e83
+	stxvd2x	6,0,dst
147e83
+	stxvd2x	7,dst,6
147e83
+	stxvd2x	8,dst,7
147e83
+	stxvd2x	9,dst,8
147e83
+	addi	dst,dst,64
147e83
+32:
147e83
+	bf	26,16f		/* CR6 bit for 32 is clear: skip the 32-byte copy.  */
147e83
+	lxvd2x	6,0,src
147e83
+	lxvd2x	7,src,6
147e83
+	addi	src,src,32
147e83
+	stxvd2x	6,0,dst
147e83
+	stxvd2x	7,dst,6
147e83
+	addi	dst,dst,32
147e83
+16:
147e83
+	bf	27,8f		/* CR6 bit for 16 is clear: skip the 16-byte copy.  */
147e83
+	lxvd2x	6,0,src
147e83
+	addi	src,src,16
147e83
+	stxvd2x	6,0,dst
147e83
+	addi	dst,dst,16
147e83
+8:
147e83
+	bf	28,4f		/* CR7 bit for 8 is clear: skip the doubleword copy.  */
147e83
+	ld	6,0(src)	/* From here on r6 is a data register, the 16-byte index is no longer needed.  */
147e83
+	addi	src,src,8
147e83
+	std     6,0(dst)
147e83
+	addi	dst,dst,8
147e83
+4:	/* Copies 4~7 bytes.  */
147e83
+	bf	29,L(tail2)	/* No 4-byte piece: 0~3 bytes remain.  */
147e83
+	lwz	6,0(src)
147e83
+	stw     6,0(dst)
147e83
+	bf      30,L(tail5)	/* No 2-byte piece: at most one byte remains, at offset 4.  */
147e83
+	lhz     7,4(src)
147e83
+	sth     7,4(dst)
147e83
+	bflr	31		/* Return now if there is no final odd byte.  */
147e83
+	lbz     8,6(src)
147e83
+	stb     8,6(dst)
147e83
+	/* Return original DST pointer (r3 was never modified, copying went through r11).  */
147e83
 	blr
147e83
 
147e83
-	/* Handle copies of 0~31 bytes.  */
147e83
-	.align  4
147e83
+
147e83
+/* Handle copies of 0~31 bytes.  */
147e83
+	.align	4
147e83
 L(copy_LT_32):
147e83
-	cmpldi  cr6,5,8
147e83
-	mr	12,4
147e83
-	mtcrf   0x01,5
147e83
+	mr	dst,3
147e83
+	cmpldi	cr6,cnt,8
147e83
+	mtocrf	0x01,cnt
147e83
 	ble	cr6,L(copy_LE_8)
147e83
 
147e83
 	/* At least 9 bytes to go.  */
147e83
 	neg	8,4
147e83
-	clrrdi  11,4,2
147e83
-	andi.   0,8,3
147e83
-	cmpldi  cr1,5,16
147e83
-	mr	10,5
147e83
+	andi.	0,8,3
147e83
+	cmpldi	cr1,cnt,16
147e83
 	beq	L(copy_LT_32_aligned)
147e83
 
147e83
-	/* Force 4-bytes alignment for SRC.  */
147e83
-	mtocrf  0x01,0
147e83
-	subf    10,0,5
147e83
-2:	bf	30,1f
147e83
-
147e83
-	lhz	6,0(12)
147e83
-	addi    12,12,2
147e83
-	sth	6,0(3)
147e83
-	addi    3,3,2
147e83
-1:	bf	31,L(end_4bytes_alignment)
147e83
-
147e83
-	lbz	6,0(12)
147e83
-	addi    12,12,1
147e83
-	stb	6,0(3)
147e83
-	addi    3,3,1
147e83
+	/* Force 4-byte alignment for SRC.  */
147e83
+	mtocrf	0x01,0
147e83
+	subf	cnt,0,cnt
147e83
+2:
147e83
+	bf	30,1f
147e83
+	lhz	6,0(src)
147e83
+	addi	src,src,2
147e83
+	sth	6,0(dst)
147e83
+	addi	dst,dst,2
147e83
+1:
147e83
+	bf	31,L(end_4bytes_alignment)
147e83
+	lbz	6,0(src)
147e83
+	addi	src,src,1
147e83
+	stb	6,0(dst)
147e83
+	addi	dst,dst,1
147e83
 
147e83
-	.align  4
147e83
+	.align	4
147e83
 L(end_4bytes_alignment):
147e83
-	cmpldi  cr1,10,16
147e83
-	mtcrf   0x01,10
147e83
+	cmpldi	cr1,cnt,16
147e83
+	mtocrf	0x01,cnt
147e83
 
147e83
 L(copy_LT_32_aligned):
147e83
 	/* At least 6 bytes to go, and SRC is word-aligned.  */
147e83
 	blt	cr1,8f
147e83
 
147e83
 	/* Copy 16 bytes.  */
147e83
-	lwz	6,0(12)
147e83
-	lwz     7,4(12)
147e83
-	stw     6,0(3)
147e83
-	lwz     8,8(12)
147e83
-	stw     7,4(3)
147e83
-	lwz     6,12(12)
147e83
-	addi    12,12,16
147e83
-	stw     8,8(3)
147e83
-	stw     6,12(3)
147e83
-	addi    3,3,16
147e83
+	lwz	6,0(src)
147e83
+	lwz	7,4(src)
147e83
+	stw	6,0(dst)
147e83
+	lwz	8,8(src)
147e83
+	stw	7,4(dst)
147e83
+	lwz	6,12(src)
147e83
+	addi	src,src,16
147e83
+	stw	8,8(dst)
147e83
+	stw	6,12(dst)
147e83
+	addi	dst,dst,16
147e83
 8:	/* Copy 8 bytes.  */
147e83
-	bf	28,4f
147e83
+	bf	28,L(tail4)
147e83
+	lwz	6,0(src)
147e83
+	lwz	7,4(src)
147e83
+	addi	src,src,8
147e83
+	stw	6,0(dst)
147e83
+	stw	7,4(dst)
147e83
+	addi	dst,dst,8
147e83
+
147e83
+	.align	4
147e83
+/* Copies 4~7 bytes.  Expects CR7 to hold the low 4 bits of the remaining
   count, and src/dst to point at the remaining bytes.  */
147e83
+L(tail4):
147e83
+	bf	29,L(tail2)	/* No 4-byte piece: 0~3 bytes remain.  */
147e83
+	lwz	6,0(src)
147e83
+	stw	6,0(dst)
147e83
+	bf	30,L(tail5)	/* No 2-byte piece: at most one byte remains, at offset 4.  */
147e83
+	lhz	7,4(src)
147e83
+	sth	7,4(dst)
147e83
+	bflr	31		/* Return now if there is no final odd byte.  */
147e83
+	lbz	8,6(src)
147e83
+	stb	8,6(dst)
147e83
+	/* Return original DST pointer (r3 was never modified, copying went through r11).  */
147e83
+	blr
147e83
 
147e83
-	lwz     6,0(12)
147e83
-	lwz     7,4(12)
147e83
-	addi    12,12,8
147e83
-	stw     6,0(3)
147e83
-	stw     7,4(3)
147e83
-	addi    3,3,8
147e83
-4:	/* Copy 4 bytes.  */
147e83
-	bf	29,2f
147e83
-
147e83
-	lwz     6,0(12)
147e83
-	addi    12,12,4
147e83
-	stw     6,0(3)
147e83
-	addi    3,3,4
147e83
-2:	/* Copy 2-3 bytes.  */
147e83
+	.align	4
147e83
+/* Copies 2~3 bytes.  */
147e83
+L(tail2):
147e83
 	bf	30,1f
147e83
-
147e83
-	lhz     6,0(12)
147e83
-	sth     6,0(3)
147e83
-	bf      31,0f
147e83
-	lbz     7,2(12)
147e83
-	stb     7,2(3)
147e83
-	ld	3,-16(1)
147e83
+	lhz	6,0(src)
147e83
+	sth	6,0(dst)
147e83
+	bflr	31
147e83
+	lbz	7,2(src)
147e83
+	stb	7,2(dst)
147e83
 	blr
147e83
 
147e83
-	.align  4
147e83
-1:	/* Copy 1 byte.  */
147e83
-	bf	31,0f
147e83
+	.align	4
147e83
+L(tail5):	/* Reached after a 4-byte word was copied at offset 0 with no 2-byte piece to follow.  */
147e83
+	bflr	31		/* Return if CR7 bit 31 is clear, i.e. there is no 5th byte.  */
147e83
+	lbz	6,4(src)	/* Copy the single byte that follows the word.  */
147e83
+	stb	6,4(dst)
147e83
+	blr
147e83
 
147e83
-	lbz	6,0(12)
147e83
-	stb	6,0(3)
147e83
-0:	/* Return original DST pointer.  */
147e83
-	ld	3,-16(1)
147e83
+	.align	4
147e83
+1:
147e83
+	bflr	31
147e83
+	lbz	6,0(src)
147e83
+	stb	6,0(dst)
147e83
+	/* Return original DST pointer.  */
147e83
 	blr
147e83
 
147e83
-	/* Handles copies of 0~8 bytes.  */
147e83
-	.align  4
147e83
+
147e83
+/* Handles copies of 0~8 bytes.  */
147e83
+	.align	4
147e83
 L(copy_LE_8):
147e83
-	bne	cr6,4f
147e83
+	bne	cr6,L(tail4)
147e83
 
147e83
 	/* Though we could've used ld/std here, they are still
147e83
 	slow for unaligned cases.  */
147e83
 
147e83
-	lwz	6,0(4)
147e83
-	lwz     7,4(4)
147e83
-	stw     6,0(3)
147e83
-	stw     7,4(3)
147e83
-	ld      3,-16(1)      /* Return original DST pointers.  */
147e83
+	lwz	6,0(src)
147e83
+	lwz	7,4(src)
147e83
+	stw	6,0(dst)
147e83
+	stw	7,4(dst)
147e83
 	blr
147e83
 
147e83
-	.align  4
147e83
-4:	/* Copies 4~7 bytes.  */
147e83
-	bf	29,2b
147e83
 
147e83
-	lwz	6,0(4)
147e83
-	stw     6,0(3)
147e83
-	bf      30,5f
147e83
-	lhz     7,4(4)
147e83
-	sth     7,4(3)
147e83
-	bf      31,0f
147e83
-	lbz     8,6(4)
147e83
-	stb     8,6(3)
147e83
-	ld	3,-16(1)
147e83
-	blr
147e83
-
147e83
-	.align  4
147e83
-5:	/* Copy 1 byte.  */
147e83
-	bf	31,0f
147e83
-
147e83
-	lbz	6,4(4)
147e83
-	stb	6,4(3)
147e83
-
147e83
-0:	/* Return original DST pointer.  */
147e83
-	ld	3,-16(1)
147e83
-	blr
147e83
-
147e83
-	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
147e83
-	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
147e83
-	the data, allowing for aligned DST stores.  */
147e83
-	.align  4
147e83
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
147e83
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
147e83
+   the data, allowing for aligned DST stores.  */
147e83
+	.align	4
147e83
 L(copy_GE_32_unaligned):
147e83
-	clrldi  0,0,60	      /* Number of bytes until the 1st
147e83
-			      quadword.  */
147e83
-	andi.   11,3,15       /* Check alignment of DST (against
147e83
-			      quadwords).  */
147e83
-	srdi    9,5,4	      /* Number of full quadwords remaining.  */
147e83
+	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
147e83
+#ifndef __LITTLE_ENDIAN__
147e83
+	andi.	10,3,15	      /* Check alignment of DST (against quadwords).  */
147e83
+#endif
147e83
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
147e83
 
147e83
 	beq	L(copy_GE_32_unaligned_cont)
147e83
 
147e83
-	/* SRC is not quadword aligned, get it aligned.  */
147e83
+	/* DST is not quadword aligned, get it aligned.  */
147e83
 
147e83
-	mtcrf   0x01,0
147e83
-	subf    31,0,5
147e83
+	mtocrf	0x01,0
147e83
+	subf	cnt,0,cnt
147e83
 
147e83
 	/* Vector instructions work best when proper alignment (16-bytes)
147e83
 	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
147e83
-1:	/* Copy 1 byte.  */
147e83
+1:
147e83
 	bf	31,2f
147e83
-
147e83
-	lbz	6,0(12)
147e83
-	addi    12,12,1
147e83
-	stb	6,0(3)
147e83
-	addi    3,3,1
147e83
-2:	/* Copy 2 bytes.  */
147e83
+	lbz	6,0(src)
147e83
+	addi	src,src,1
147e83
+	stb	6,0(dst)
147e83
+	addi	dst,dst,1
147e83
+2:
147e83
 	bf	30,4f
147e83
-
147e83
-	lhz     6,0(12)
147e83
-	addi    12,12,2
147e83
-	sth     6,0(3)
147e83
-	addi    3,3,2
147e83
-4:	/* Copy 4 bytes.  */
147e83
+	lhz	6,0(src)
147e83
+	addi	src,src,2
147e83
+	sth	6,0(dst)
147e83
+	addi	dst,dst,2
147e83
+4:
147e83
 	bf	29,8f
147e83
-
147e83
-	lwz     6,0(12)
147e83
-	addi    12,12,4
147e83
-	stw     6,0(3)
147e83
-	addi    3,3,4
147e83
-8:	/* Copy 8 bytes.  */
147e83
+	lwz	6,0(src)
147e83
+	addi	src,src,4
147e83
+	stw	6,0(dst)
147e83
+	addi	dst,dst,4
147e83
+8:
147e83
 	bf	28,0f
147e83
-
147e83
-	ld	6,0(12)
147e83
-	addi    12,12,8
147e83
-	std	6,0(3)
147e83
-	addi    3,3,8
147e83
+	ld	6,0(src)
147e83
+	addi	src,src,8
147e83
+	std	6,0(dst)
147e83
+	addi	dst,dst,8
147e83
 0:
147e83
-	clrldi  10,12,60      /* Check alignment of SRC.  */
147e83
-	srdi    9,31,4	      /* Number of full quadwords remaining.  */
147e83
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
147e83
 
147e83
 	/* The proper alignment is present, it is OK to copy the bytes now.  */
147e83
 L(copy_GE_32_unaligned_cont):
147e83
 
147e83
 	/* Setup two indexes to speed up the indexed vector operations.  */
147e83
-	clrldi  11,31,60
147e83
-	li      6,16	      /* Index for 16-bytes offsets.  */
147e83
+	clrldi	10,cnt,60
147e83
+	li	6,16	      /* Index for 16-bytes offsets.  */
147e83
 	li	7,32	      /* Index for 32-bytes offsets.  */
147e83
-	cmpldi  cr1,11,0
147e83
-	srdi    8,31,5	      /* Setup the loop counter.  */
147e83
-	mr      10,3
147e83
-	mr      11,12
147e83
-	mtcrf   0x01,9
147e83
-	cmpldi  cr6,9,1
147e83
-	lvsl    5,0,12
147e83
-	lvx     3,0,12
147e83
-	bf      31,L(setup_unaligned_loop)
147e83
-
147e83
-	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
147e83
-	lvx     4,12,6
147e83
-	vperm   6,3,4,5
147e83
-	addi    11,12,16
147e83
-	addi    10,3,16
147e83
-	stvx    6,0,3
147e83
+	cmpldi	cr1,10,0	/* cr1: is the tail (r10 = cnt & 15) empty?  Used by beqlr at end_unaligned_loop.  */
147e83
+	srdi	8,cnt,5	      /* Setup the loop counter.  */
147e83
+	mtocrf	0x01,9		/* CR7 = low 4 bits of r9 (quadword count).  Bit 31 set means an odd number of quadwords.  */
147e83
+	cmpldi	cr6,9,1		/* cr6: at most one quadword?  Then the 32-byte loop is skipped.  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	lvsr	5,0,src		/* vr5 = permute control for vperm, derived from the low 4 bits of src.  */
147e83
+#else
147e83
+	lvsl	5,0,src		/* vr5 = permute control for vperm, derived from the low 4 bits of src.  */
147e83
+#endif
147e83
+	lvx	3,0,src		/* vr3 = aligned quadword containing src (lvx ignores the low 4 address bits).  */
147e83
+	li	0,0		/* r0 = 0, it is added to src at end_unaligned_loop.  */
147e83
+	bf	31,L(setup_unaligned_loop)	/* Even quadword count: no extra 16-byte step needed.  */
147e83
+
147e83
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
147e83
+	lvx	4,src,6		/* vr4 = next aligned quadword (r6 = 16).  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm	6,4,3,5		/* Operand order is swapped relative to big-endian, matching lvsr above.  */
147e83
+#else
147e83
+	vperm	6,3,4,5
147e83
+#endif
147e83
+	addi	src,src,16
147e83
+	stvx	6,0,dst
147e83
+	addi	dst,dst,16
147e83
 	vor	3,4,4
147e83
+	clrrdi	0,src,60	/* NOTE(review): clrrdi with 60 clears the low 60 bits, so r0 keeps only the top 4 bits of src, and r0 is later added to src at end_unaligned_loop - confirm this is intended.  */
147e83
 
147e83
 L(setup_unaligned_loop):
147e83
-	mtctr   8
147e83
-	ble     cr6,L(end_unaligned_loop)
147e83
+	mtctr	8
147e83
+	ble	cr6,L(end_unaligned_loop)
147e83
 
147e83
 	/* Copy 32 bytes at a time using vector instructions.  */
147e83
-	.align  4
147e83
+	.align	4
147e83
 L(unaligned_loop):
147e83
 
147e83
 	/* Note: vr6/vr10 may contain data that was already copied,
147e83
@@ -444,63 +385,56 @@
147e83
 	some portions again. This is faster than having unaligned
147e83
 	vector instructions though.  */
147e83
 
147e83
-	lvx	4,11,6	      /* vr4 = r11+16.  */
147e83
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
147e83
-			      of vr3/vr4 into vr6.  */
147e83
-	lvx	3,11,7	      /* vr3 = r11+32.  */
147e83
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
147e83
-			      of vr3/vr4 into vr10.  */
147e83
-	addi    11,11,32
147e83
-	stvx    6,0,10
147e83
-	stvx    10,10,6
147e83
-	addi    10,10,32
147e83
-
147e83
+	lvx	4,src,6
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm	6,4,3,5
147e83
+#else
147e83
+	vperm	6,3,4,5
147e83
+#endif
147e83
+	lvx	3,src,7
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm	10,3,4,5
147e83
+#else
147e83
+	vperm	10,4,3,5
147e83
+#endif
147e83
+	addi	src,src,32
147e83
+	stvx	6,0,dst
147e83
+	stvx	10,dst,6
147e83
+	addi	dst,dst,32
147e83
 	bdnz	L(unaligned_loop)
147e83
 
147e83
-	.align  4
147e83
+	clrrdi	0,src,60
147e83
+
147e83
+	.align	4
147e83
 L(end_unaligned_loop):
147e83
 
147e83
 	/* Check for tail bytes.  */
147e83
-	rldicr  0,31,0,59
147e83
-	mtcrf   0x01,31
147e83
-	beq	cr1,0f
147e83
+	mtocrf	0x01,cnt	/* CR7 = low 4 bits of cnt (the 8/4/2/1-byte tail flags).  */
147e83
+	beqlr	cr1		/* cr1 is from cmpldi cr1,10,0 with r10 = cnt & 15: no tail bytes, return with r3 untouched.  */
147e83
 
147e83
-	add	3,3,0
147e83
-	add	12,12,0
147e83
+	add	src,src,0	/* r0 is 0 (li 0,0) or the result of clrrdi 0,src,60.  NOTE(review): src has already been advanced by the addi instructions above, so this add is only correct when r0 is 0 - confirm.  */
147e83
 
147e83
 	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
147e83
-8:	/* Copy 8 bytes.  */
147e83
+	/* Copy 8 bytes.  */
147e83
 	bf	28,4f
147e83
-
147e83
-	lwz	6,0(12)
147e83
-	lwz	7,4(12)
147e83
-	addi    12,12,8
147e83
-	stw	6,0(3)
147e83
-	stw	7,4(3)
147e83
-	addi    3,3,8
147e83
-4:	/* Copy 4 bytes.  */
147e83
-	bf	29,2f
147e83
-
147e83
-	lwz	6,0(12)
147e83
-	addi    12,12,4
147e83
-	stw	6,0(3)
147e83
-	addi    3,3,4
147e83
-2:	/* Copy 2~3 bytes.  */
147e83
-	bf	30,1f
147e83
-
147e83
-	lhz	6,0(12)
147e83
-	addi    12,12,2
147e83
-	sth	6,0(3)
147e83
-	addi    3,3,2
147e83
-1:	/* Copy 1 byte.  */
147e83
-	bf	31,0f
147e83
-
147e83
-	lbz	6,0(12)
147e83
-	stb	6,0(3)
147e83
-0:	/* Return original DST pointer.  */
147e83
-	ld	31,-8(1)
147e83
-	ld	3,-16(1)
147e83
+	lwz	6,0(src)
147e83
+	lwz	7,4(src)
147e83
+	addi	src,src,8
147e83
+	stw	6,0(dst)
147e83
+	stw	7,4(dst)
147e83
+	addi	dst,dst,8
147e83
+4:	/* Copy 4~7 bytes.  */
147e83
+	bf	29,L(tail2)
147e83
+	lwz	6,0(src)
147e83
+	stw	6,0(dst)
147e83
+	bf	30,L(tail5)
147e83
+	lhz	7,4(src)
147e83
+	sth	7,4(dst)
147e83
+	bflr	31
147e83
+	lbz	8,6(src)
147e83
+	stb	8,6(dst)
147e83
+	/* Return original DST pointer.  */
147e83
 	blr
147e83
 
147e83
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
147e83
+END_GEN_TB (memcpy,TB_TOCLESS)
147e83
 libc_hidden_builtin_def (memcpy)
147e83
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S
147e83
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S	2014-05-29 13:04:56.000000000 -0500
147e83
@@ -367,13 +367,21 @@
147e83
 	mr	11,12
147e83
 	mtcrf	0x01,9
147e83
 	cmpldi	cr6,9,1
147e83
-	lvsl	5,0,12
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	lvsr    5,0,12
147e83
+#else
147e83
+	lvsl    5,0,12
147e83
+#endif
147e83
 	lvx	3,0,12
147e83
 	bf	31,L(setup_unaligned_loop)
147e83
 
147e83
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
147e83
 	lvx	4,12,6
147e83
-	vperm	6,3,4,5
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm   6,4,3,5
147e83
+#else
147e83
+	vperm   6,3,4,5
147e83
+#endif
147e83
 	addi	11,12,16
147e83
 	addi	10,3,16
147e83
 	stvx	6,0,3
147e83
@@ -393,11 +401,17 @@
147e83
 	vector instructions though.  */
147e83
 
147e83
 	lvx	4,11,6	      /* vr4 = r11+16.  */
147e83
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
147e83
-				 of vr3/vr4 into vr6.  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm   6,4,3,5
147e83
+#else
147e83
+	vperm   6,3,4,5
147e83
+#endif
147e83
 	lvx	3,11,7	      /* vr3 = r11+32.  */
147e83
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
147e83
-				 of vr3/vr4 into vr10.  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vperm   10,3,4,5
147e83
+#else
147e83
+	vperm   10,4,3,5
147e83
+#endif
147e83
 	addi	11,11,32
147e83
 	stvx	6,0,10
147e83
 	stvx	10,10,6