Blame SOURCES/glibc-rh1385004-15.patch

147e83
From 71ae86478edc7b21872464f43fb29ff650c1681a Mon Sep 17 00:00:00 2001
147e83
From: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
147e83
Date: Tue, 15 Jul 2014 12:19:09 -0400
147e83
Subject: [PATCH] PowerPC: memset optimization for POWER8/PPC64
147e83
147e83
This patch adds an optimized memset implementation for POWER8.  For
147e83
sizes from 0 to 255 bytes, a word/doubleword algorithm similar to
147e83
the POWER7 optimized one is used.
147e83
147e83
For sizes larger than 255 bytes, two strategies are used:
147e83
147e83
1. If the constant is nonzero, the memory is written with
147e83
   AltiVec vector instructions;
147e83
147e83
2. If the constant is 0, dcbz instructions are used.  The loop is unrolled
147e83
   to clear 512 bytes at a time.
147e83
147e83
Using vector instructions increases throughput considerably, with a
147e83
doubling of performance for sizes larger than 1024 bytes.  The dcbz loop unrolling
147e83
also shows a performance improvement, doubling throughput for sizes
147e83
larger than 8192 bytes.
147e83
---
147e83
 ChangeLog                                          |  15 +
147e83
 benchtests/bench-memset.c                          |   5 +
147e83
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   2 +-
147e83
 sysdeps/powerpc/powerpc64/multiarch/bzero.c        |  11 +-
147e83
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   6 +
147e83
 .../powerpc/powerpc64/multiarch/memset-power8.S    |  43 ++
147e83
 sysdeps/powerpc/powerpc64/multiarch/memset.c       |  11 +-
147e83
 sysdeps/powerpc/powerpc64/power8/memset.S          | 449 +++++++++++++++++++++
147e83
 8 files changed, 533 insertions(+), 9 deletions(-)
147e83
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
147e83
 create mode 100644 sysdeps/powerpc/powerpc64/power8/memset.S
147e83
147e83
diff --git a/ChangeLog b/ChangeLog
147e83
index ddaf70f..dc61c87 100644
147e83
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
147e83
index 0de3804..abc9d2e 100644
147e83
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
147e83
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
147e83
@@ -1,7 +1,8 @@ ifeq ($(subdir),string)
147e83
 ifeq ($(subdir),string)
147e83
 sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
147e83
                   memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \
147e83
-                  memcmp-ppc64 memset-power7 memset-power6 memset-power4 \
147e83
+                  memcmp-ppc64 memset-power8 memset-power7 memset-power6 \
147e83
+                  memset-power4 \
147e83
                   memset-ppc64 bzero-power4 bzero-power6 bzero-power7 \
147e83
                   mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
147e83
                   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
147e83
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
147e83
index ed83541..298cf00 100644
147e83
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
147e83
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
147e83
@@ -26,14 +26,17 @@ extern __typeof (bzero) __bzero_ppc attribute_hidden;
147e83
 extern __typeof (bzero) __bzero_power4 attribute_hidden;
147e83
 extern __typeof (bzero) __bzero_power6 attribute_hidden;
147e83
 extern __typeof (bzero) __bzero_power7 attribute_hidden;
147e83
+extern __typeof (bzero) __bzero_power8 attribute_hidden;
147e83
 
147e83
 libc_ifunc (__bzero,
147e83
-            (hwcap & PPC_FEATURE_HAS_VSX)
147e83
-            ? __bzero_power7 :
147e83
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
147e83
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
147e83
+            ? __bzero_power8 :
147e83
+	      (hwcap & PPC_FEATURE_HAS_VSX)
147e83
+	      ? __bzero_power7 :
147e83
+		(hwcap & PPC_FEATURE_ARCH_2_05)
147e83
 		? __bzero_power6 :
147e83
 		  (hwcap & PPC_FEATURE_POWER4)
147e83
-		? __bzero_power4
147e83
+		  ? __bzero_power4
147e83
             : __bzero_ppc);
147e83
 
147e83
 weak_alias (__bzero, bzero)
147e83
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
147e83
index a574487..06d5be9 100644
147e83
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
147e83
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
147e83
@@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
147e83
 
147e83
   /* Support sysdeps/powerpc/powerpc64/multiarch/memset.c.  */
147e83
   IFUNC_IMPL (i, name, memset,
147e83
+             IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
147e83
+                             __memset_power8)
147e83
              IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
147e83
                              __memset_power7)
147e83
              IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
147e83
@@ -134,6 +138,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
147e83
 
147e83
   /* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c.  */
147e83
   IFUNC_IMPL (i, name, bzero,
147e83
+             IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
147e83
+                             __bzero_power8)
147e83
              IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
147e83
                              __bzero_power7)
147e83
              IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
147e83
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
147e83
new file mode 100644
147e83
index 0000000..e8a604b
147e83
--- /dev/null
147e83
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
147e83
@@ -0,0 +1,43 @@
147e83
+/* Optimized memset implementation for PowerPC64/POWER8.
147e83
+   Copyright (C) 2014 Free Software Foundation, Inc.
147e83
+   This file is part of the GNU C Library.
147e83
+
147e83
+   The GNU C Library is free software; you can redistribute it and/or
147e83
+   modify it under the terms of the GNU Lesser General Public
147e83
+   License as published by the Free Software Foundation; either
147e83
+   version 2.1 of the License, or (at your option) any later version.
147e83
+
147e83
+   The GNU C Library is distributed in the hope that it will be useful,
147e83
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
147e83
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
147e83
+   Lesser General Public License for more details.
147e83
+
147e83
+   You should have received a copy of the GNU Lesser General Public
147e83
+   License along with the GNU C Library; if not, see
147e83
+   <http://www.gnu.org/licenses/>.  */
147e83
+
147e83
+#include <sysdep.h>
147e83
+
147e83
+#undef EALIGN
147e83
+#define EALIGN(name, alignt, words)				\
147e83
+  .section ".text";						\
147e83
+  ENTRY_2(__memset_power8)					\
147e83
+  .align ALIGNARG(alignt);					\
147e83
+  EALIGN_W_##words;						\
147e83
+  BODY_LABEL(__memset_power8):					\
147e83
+  cfi_startproc;						\
147e83
+  LOCALENTRY(__memset_power8)
147e83
+
147e83
+#undef END_GEN_TB
147e83
+#define END_GEN_TB(name, mask)					\
147e83
+  cfi_endproc;							\
147e83
+  TRACEBACK_MASK(__memset_power8,mask)				\
147e83
+  END_2(__memset_power8)
147e83
+
147e83
+#undef libc_hidden_builtin_def
147e83
+#define libc_hidden_builtin_def(name)
147e83
+
147e83
+#undef __bzero
147e83
+#define __bzero __bzero_power8
147e83
+
147e83
+#include <sysdeps/powerpc/powerpc64/power8/memset.S>
147e83
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
147e83
index aa2ae70..9c7ed10 100644
147e83
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
147e83
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
147e83
@@ -32,16 +32,19 @@ extern __typeof (__redirect_memset) __memset_ppc attribute_hidden;
147e83
 extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
147e83
 extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
147e83
 extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
147e83
+extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
147e83
 
147e83
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
147e83
    ifunc symbol properly.  */
147e83
 libc_ifunc (__libc_memset,
147e83
-            (hwcap & PPC_FEATURE_HAS_VSX)
147e83
-            ? __memset_power7 :
147e83
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
147e83
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
147e83
+            ? __memset_power8 :
147e83
+	      (hwcap & PPC_FEATURE_HAS_VSX)
147e83
+	      ? __memset_power7 :
147e83
+		(hwcap & PPC_FEATURE_ARCH_2_05)
147e83
 		? __memset_power6 :
147e83
 		  (hwcap & PPC_FEATURE_POWER4)
147e83
-		? __memset_power4
147e83
+		  ? __memset_power4
147e83
             : __memset_ppc);
147e83
 
147e83
 #undef memset
147e83
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
147e83
new file mode 100644
147e83
index 0000000..191a4df
147e83
--- /dev/null
147e83
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
147e83
@@ -0,0 +1,449 @@
147e83
+/* Optimized memset implementation for PowerPC64/POWER8.
147e83
+   Copyright (C) 2014 Free Software Foundation, Inc.
147e83
+   This file is part of the GNU C Library.
147e83
+
147e83
+   The GNU C Library is free software; you can redistribute it and/or
147e83
+   modify it under the terms of the GNU Lesser General Public
147e83
+   License as published by the Free Software Foundation; either
147e83
+   version 2.1 of the License, or (at your option) any later version.
147e83
+
147e83
+   The GNU C Library is distributed in the hope that it will be useful,
147e83
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
147e83
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
147e83
+   Lesser General Public License for more details.
147e83
+
147e83
+   You should have received a copy of the GNU Lesser General Public
147e83
+   License along with the GNU C Library; if not, see
147e83
+   <http://www.gnu.org/licenses/>.  */
147e83
+
147e83
+#include <sysdep.h>
147e83
+
147e83
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
147e83
+   Returns 's'.  */
147e83
+
147e83
+	.machine power8
147e83
+EALIGN (memset, 5, 0)
147e83
+	CALL_MCOUNT 3
147e83
+
147e83
+L(_memset):
147e83
+	cmpldi	cr7,r5,31
147e83
+	neg	r0,r3
147e83
+	mr	r10,r3
147e83
+
147e83
+	insrdi	r4,r4,8,48
147e83
+	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
147e83
+	ble	cr7,L(write_LT_32)
147e83
+
147e83
+	andi.	r11,r10,15	/* Check alignment of DST.  */
147e83
+	insrdi	r4,r4,32,0	/* Replicate word to double word.  */
147e83
+
147e83
+	beq	L(big_aligned)
147e83
+
147e83
+	mtocrf	0x01,r0
147e83
+	clrldi	r0,r0,60
147e83
+
147e83
+	/* Get DST aligned to 16 bytes.  */
147e83
+1:	bf	31,2f
147e83
+	stb	r4,0(r10)
147e83
+	addi	r10,r10,1
147e83
+
147e83
+2:	bf	30,4f
147e83
+	sth	r4,0(r10)
147e83
+	addi	r10,r10,2
147e83
+
147e83
+4:	bf	29,8f
147e83
+	stw	r4,0(r10)
147e83
+	addi	r10,r10,4
147e83
+
147e83
+8:	bf      28,16f
147e83
+	std     r4,0(r10)
147e83
+	addi    r10,r10,8
147e83
+
147e83
+16:	subf	r5,r0,r5
147e83
+
147e83
+	.align	4
147e83
+L(big_aligned):
147e83
+	/* For sizes larger than 255 two possible paths:
147e83
+	   - if constant is '0', zero full cache lines with dcbz
147e83
+	   - otherwise uses vector instructions.  */
147e83
+	cmpldi	cr5,r5,255
147e83
+	dcbtst	0,r10
147e83
+	cmpldi	cr6,r4,0
147e83
+	crand	27,26,21
147e83
+	bt	27,L(huge_dcbz)
147e83
+	bge	cr5,L(huge_vector)
147e83
+
147e83
+
147e83
+	/* Size between 32 and 255 bytes with constant different than 0, use
147e83
+	   doubleword store instruction to achieve best throughput.  */
147e83
+	srdi    r8,r5,5
147e83
+	clrldi  r11,r5,59
147e83
+	cmpldi  cr6,r11,0
147e83
+	cmpdi	r8,0
147e83
+	beq     L(tail_bytes)
147e83
+	mtctr   r8
147e83
+
147e83
+	/* Main aligned write loop, writes 32-bytes at a time.  */
147e83
+	.align  4
147e83
+L(big_loop):
147e83
+	std     r4,0(r10)
147e83
+	std     r4,8(r10)
147e83
+	std     r4,16(r10)
147e83
+	std     r4,24(r10)
147e83
+	addi    r10,r10,32
147e83
+	bdz     L(tail_bytes)
147e83
+
147e83
+	std     r4,0(r10)
147e83
+	std     r4,8(r10)
147e83
+	std     r4,16(r10)
147e83
+	std     r4,24(r10)
147e83
+	addi    r10,10,32
147e83
+	bdnz    L(big_loop)
147e83
+
147e83
+	b       L(tail_bytes)
147e83
+
147e83
+	/* Write remaining 1~31 bytes.  */
147e83
+	.align  4
147e83
+L(tail_bytes):
147e83
+	beqlr   cr6
147e83
+
147e83
+	srdi    r7,r11,4
147e83
+	clrldi  r8,r11,60
147e83
+	mtocrf  0x01,r7
147e83
+
147e83
+	.align	4
147e83
+	bf	31,8f
147e83
+	std	r4,0(r10)
147e83
+	std	r4,8(r10)
147e83
+	addi	r10,r10,16
147e83
+
147e83
+	.align	4
147e83
+8:	mtocrf	0x1,r8
147e83
+	bf	28,4f
147e83
+	std	r4,0(r10)
147e83
+	addi	r10,r10,8
147e83
+
147e83
+	.align	4
147e83
+4:	bf      29,2f
147e83
+	stw     4,0(10)
147e83
+	addi    10,10,4
147e83
+
147e83
+	.align 	4
147e83
+2:	bf      30,1f
147e83
+	sth     4,0(10)
147e83
+	addi    10,10,2
147e83
+
147e83
+	.align  4
147e83
+1:      bflr    31
147e83
+	stb     4,0(10)
147e83
+	blr
147e83
+
147e83
+	/* Size larger than 255 bytes with constant different than 0, use
147e83
+	   vector instruction to achieve best throughput.  */
147e83
+L(huge_vector):
147e83
+	/* Replicate set byte to quadword in VMX register.  */
147e83
+	mtvsrd	 v1,r4
147e83
+	xxpermdi 32,v0,v1,0
147e83
+	vspltb	 v2,v0,15
147e83
+
147e83
+	/* Main aligned write loop: 128 bytes at a time.  */
147e83
+	li	r6,16
147e83
+	li	r7,32
147e83
+	li	r8,48
147e83
+	mtocrf	0x02,r5
147e83
+	srdi	r12,r5,7
147e83
+	cmpdi	r12,0
147e83
+	beq	L(aligned_tail)
147e83
+	mtctr	r12
147e83
+	b	L(aligned_128loop)
147e83
+
147e83
+	.align  4
147e83
+L(aligned_128loop):
147e83
+	stvx	v2,0,r10
147e83
+	stvx	v2,r10,r6
147e83
+	stvx	v2,r10,r7
147e83
+	stvx	v2,r10,r8
147e83
+	addi	r10,r10,64
147e83
+	stvx	v2,0,r10
147e83
+	stvx	v2,r10,r6
147e83
+	stvx	v2,r10,r7
147e83
+	stvx	v2,r10,r8
147e83
+	addi	r10,r10,64
147e83
+	bdnz	L(aligned_128loop)
147e83
+
147e83
+	/* Write remaining 1~127 bytes.  */
147e83
+L(aligned_tail):
147e83
+	mtocrf	0x01,r5
147e83
+	bf	25,32f
147e83
+	stvx	v2,0,r10
147e83
+	stvx	v2,r10,r6
147e83
+	stvx	v2,r10,r7
147e83
+	stvx	v2,r10,r8
147e83
+	addi	r10,r10,64
147e83
+
147e83
+32:	bf	26,16f
147e83
+	stvx	v2,0,r10
147e83
+	stvx	v2,r10,r6
147e83
+	addi	r10,r10,32
147e83
+
147e83
+16:	bf	27,8f
147e83
+	stvx	v2,0,r10
147e83
+	addi	r10,r10,16
147e83
+
147e83
+8:	bf	28,4f
147e83
+	std     r4,0(r10)
147e83
+	addi	r10,r10,8
147e83
+
147e83
+	/* Copies 4~7 bytes.  */
147e83
+4:	bf	29,L(tail2)
147e83
+	stw     r4,0(r10)
147e83
+	bf      30,L(tail5)
147e83
+	sth     r4,4(r10)
147e83
+	bflr	31
147e83
+	stb     r4,6(r10)
147e83
+	/* Return original DST pointer.  */
147e83
+	blr
147e83
+
147e83
+	/* Special case when value is 0 and we have a long length to deal
147e83
+	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
147e83
+	   Before using dcbz though, we need to get the destination 128-byte
147e83
+	   aligned.  */
147e83
+	.align	4
147e83
+L(huge_dcbz):
147e83
+	andi.	r11,r10,127
147e83
+	neg	r0,r10
147e83
+	beq	L(huge_dcbz_aligned)
147e83
+
147e83
+	clrldi	r0,r0,57
147e83
+	subf	r5,r0,r5
147e83
+	srdi	r0,r0,3
147e83
+	mtocrf	0x01,r0
147e83
+
147e83
+	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
147e83
+8:	bf	28,4f
147e83
+
147e83
+	std	r4,0(r10)
147e83
+	std	r4,8(r10)
147e83
+	std	r4,16(r10)
147e83
+	std	r4,24(r10)
147e83
+	std	r4,32(r10)
147e83
+	std	r4,40(r10)
147e83
+	std	r4,48(r10)
147e83
+	std	r4,56(r10)
147e83
+	addi	r10,r10,64
147e83
+
147e83
+	.align	4
147e83
+4:	bf	29,2f
147e83
+	std	r4,0(r10)
147e83
+	std	r4,8(r10)
147e83
+	std	r4,16(r10)
147e83
+	std	r4,24(r10)
147e83
+	addi	r10,r10,32
147e83
+
147e83
+	.align	4
147e83
+2:	bf	30,1f
147e83
+	std	r4,0(r10)
147e83
+	std	r4,8(r10)
147e83
+	addi	r10,r10,16
147e83
+
147e83
+	.align	4
147e83
+1:	bf	31,L(huge_dcbz_aligned)
147e83
+	std	r4,0(r10)
147e83
+	addi	r10,r10,8
147e83
+
147e83
+L(huge_dcbz_aligned):
147e83
+	/* Setup dcbz unroll offsets and count numbers.  */
147e83
+	srdi	r8,r5,9
147e83
+	clrldi	r11,r5,55
147e83
+	cmpldi	cr6,r11,0
147e83
+	li	r9,128
147e83
+	cmpdi	r8,0
147e83
+	beq     L(huge_tail)
147e83
+	li	r7,256
147e83
+	li	r6,384
147e83
+	mtctr	r8
147e83
+
147e83
+	.align	4
147e83
+L(huge_loop):
147e83
+	/* Sets 512 bytes to zero in each iteration, the loop unrolling shows
147e83
+	   a throughput boost for large sizes (2048 bytes or higher).  */
147e83
+	dcbz	0,r10
147e83
+	dcbz	r9,r10
147e83
+	dcbz	r7,r10
147e83
+	dcbz	r6,r10
147e83
+	addi	r10,r10,512
147e83
+	bdnz	L(huge_loop)
147e83
+
147e83
+	beqlr	cr6
147e83
+
147e83
+L(huge_tail):
147e83
+	srdi    r6,r11,8
147e83
+	srdi    r7,r11,4
147e83
+	clrldi  r8,r11,4
147e83
+	cmpldi  cr6,r8,0
147e83
+	mtocrf  0x01,r6
147e83
+
147e83
+	beq	cr6,L(tail)
147e83
+
147e83
+	/* We have 1~511 bytes remaining.  */
147e83
+	.align	4
147e83
+32:	bf	31,16f
147e83
+	dcbz	0,r10
147e83
+	dcbz	r9,r10
147e83
+	addi	r10,r10,256
147e83
+
147e83
+	.align	4
147e83
+16:	mtocrf  0x01,r7
147e83
+	bf	28,8f
147e83
+	dcbz	0,r10
147e83
+	addi	r10,r10,128
147e83
+
147e83
+	.align 	4
147e83
+8:	bf	29,4f
147e83
+	std	r4,0(r10)
147e83
+	std	r4,8(r10)
147e83
+	std	r4,16(r10)
147e83
+	std	r4,24(r10)
147e83
+	std	r4,32(r10)
147e83
+	std	r4,40(r10)
147e83
+	std	r4,48(r10)
147e83
+	std	r4,56(r10)
147e83
+	addi	r10,r10,64
147e83
+
147e83
+	.align	4
147e83
+4:	bf	30,2f
147e83
+	std	r4,0(r10)
147e83
+	std	r4,8(r10)
147e83
+	std	r4,16(r10)
147e83
+	std	r4,24(r10)
147e83
+	addi	r10,r10,32
147e83
+
147e83
+	.align	4
147e83
+2:	bf	31,L(tail)
147e83
+	std	r4,0(r10)
147e83
+	std	r4,8(r10)
147e83
+	addi	r10,r10,16
147e83
+	.align	4
147e83
+
147e83
+	/* Remaining 1~15 bytes.  */
147e83
+L(tail):
147e83
+	mtocrf  0x01,r8
147e83
+
147e83
+	.align
147e83
+8:	bf	28,4f
147e83
+	std	r4,0(r10)
147e83
+	addi	r10,r10,8
147e83
+
147e83
+	.align	4
147e83
+4:	bf	29,2f
147e83
+	stw	r4,0(r10)
147e83
+	addi	r10,r10,4
147e83
+
147e83
+	.align	4
147e83
+2:	bf	30,1f
147e83
+	sth	r4,0(r10)
147e83
+	addi	r10,r10,2
147e83
+
147e83
+	.align	4
147e83
+1:	bflr	31
147e83
+	stb	r4,0(r10)
147e83
+	blr
147e83
+
147e83
+	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
147e83
+	   by just unrolling all operations.  */
147e83
+	.align	4
147e83
+L(write_LT_32):
147e83
+	cmpldi	cr6,5,8
147e83
+	mtocrf	0x01,r5
147e83
+	ble	cr6,L(write_LE_8)
147e83
+
147e83
+	/* At least 9 bytes to go.  */
147e83
+	neg	r8,r4
147e83
+	andi.	r0,r8,3
147e83
+	cmpldi	cr1,r5,16
147e83
+	beq	L(write_LT_32_aligned)
147e83
+
147e83
+	/* Force 4-byte alignment for SRC.  */
147e83
+	mtocrf	0x01,r0
147e83
+	subf	r5,r0,r5
147e83
+
147e83
+2:	bf	30,1f
147e83
+	sth	r4,0(r10)
147e83
+	addi	r10,r10,2
147e83
+
147e83
+1:	bf	31,L(end_4bytes_alignment)
147e83
+	stb	r4,0(r10)
147e83
+	addi	r10,r10,1
147e83
+
147e83
+	.align	4
147e83
+L(end_4bytes_alignment):
147e83
+	cmpldi	cr1,r5,16
147e83
+	mtocrf	0x01,r5
147e83
+
147e83
+L(write_LT_32_aligned):
147e83
+	blt	cr1,8f
147e83
+
147e83
+	stw	r4,0(r10)
147e83
+	stw	r4,4(r10)
147e83
+	stw	r4,8(r10)
147e83
+	stw	r4,12(r10)
147e83
+	addi	r10,r10,16
147e83
+
147e83
+8:	bf	28,L(tail4)
147e83
+	stw	r4,0(r10)
147e83
+	stw	r4,4(r10)
147e83
+	addi	r10,r10,8
147e83
+
147e83
+	.align	4
147e83
+	/* Copies 4~7 bytes.  */
147e83
+L(tail4):
147e83
+	bf	29,L(tail2)
147e83
+	stw	r4,0(r10)
147e83
+	bf	30,L(tail5)
147e83
+	sth	r4,4(r10)
147e83
+	bflr	31
147e83
+	stb	r4,6(r10)
147e83
+	blr
147e83
+
147e83
+	.align	4
147e83
+	/* Copies 2~3 bytes.  */
147e83
+L(tail2):
147e83
+	bf	30,1f
147e83
+	sth	r4,0(r10)
147e83
+	bflr	31
147e83
+	stb	r4,2(r10)
147e83
+	blr
147e83
+
147e83
+	.align	4
147e83
+L(tail5):
147e83
+	bflr	31
147e83
+	stb	r4,4(r10)
147e83
+	blr
147e83
+
147e83
+	.align	4
147e83
+1: 	bflr	31
147e83
+	stb	r4,0(r10)
147e83
+	blr
147e83
+
147e83
+	/* Handles copies of 0~8 bytes.  */
147e83
+	.align	4
147e83
+L(write_LE_8):
147e83
+	bne	cr6,L(tail4)
147e83
+
147e83
+	stw	r4,0(r10)
147e83
+	stw	r4,4(r10)
147e83
+	blr
147e83
+END_GEN_TB (memset,TB_TOCLESS)
147e83
+libc_hidden_builtin_def (memset)
147e83
+
147e83
+/* Copied from bzero.S to prevent the linker from inserting a stub
147e83
+   between bzero and memset.  */
147e83
+ENTRY (__bzero)
147e83
+	CALL_MCOUNT 3
147e83
+	mr	r5,r4
147e83
+	li	r4,0
147e83
+	b	L(_memset)
147e83
+END (__bzero)
147e83
+#ifndef __bzero
147e83
+weak_alias (__bzero, bzero)
147e83
+#endif
147e83
-- 
147e83
2.1.0
147e83