Blame SOURCES/glibc-rh1385004-12.patch

147e83
From 0d3555b9b4d5cefe116c32bfa38ac70f1d6c25cb Mon Sep 17 00:00:00 2001
147e83
From: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
147e83
Date: Wed, 11 Nov 2015 17:31:28 -0200
147e83
Subject: [PATCH] powerpc: Optimization for strlen for POWER8.
147e83
147e83
This implementation takes advantage of vectorization to improve performance of
147e83
the loop over the current strlen implementation for POWER7.
147e83
147e83
(cherry picked from commit 1b045ee53e0b8bed75745b931b33f27d21c9ed22)
147e83
---
147e83
 ChangeLog                                          |  13 +
147e83
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   2 +-
147e83
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   2 +
147e83
 .../powerpc/powerpc64/multiarch/strlen-power8.S    |  39 +++
147e83
 sysdeps/powerpc/powerpc64/multiarch/strlen.c       |   9 +-
147e83
 sysdeps/powerpc/powerpc64/power8/strlen.S          | 297 +++++++++++++++++++++
147e83
 6 files changed, 358 insertions(+), 4 deletions(-)
147e83
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
147e83
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strlen.S
147e83
147e83
diff --git a/ChangeLog b/ChangeLog
147e83
index f030b68..e7ea58a 100644
147e83
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
147e83
index 7ed56bf..57abe8f 100644
147e83
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
147e83
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
147e83
@@ -20,7 +20,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
147e83
                   strncpy-power8 strncpy-power7 strncpy-ppc64 \
147e83
                   strncat-power7 \
147e83
                   strstr-power7 strstr-ppc64 \
147e83
-                  strspn-power8 strspn-ppc64 \
147e83
+                  strspn-power8 strspn-ppc64 strlen-power8 \
147e83
                   rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
147e83
                   strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \
147e83
                   strncase-power7 strncase_l-power7 \
147e83
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
147e83
index f6c70ba..583885c 100644
147e83
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
147e83
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
147e83
@@ -101,6 +101,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
147e83
 
147e83
   /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
147e83
   IFUNC_IMPL (i, name, strlen,
147e83
+             IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
147e83
+                             __strlen_power8)
147e83
              IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
147e83
                              __strlen_power7)
147e83
              IFUNC_IMPL_ADD (array, i, strlen, 1,
147e83
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
147e83
new file mode 100644
147e83
index 0000000..686dc3d
147e83
--- /dev/null
147e83
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
147e83
@@ -0,0 +1,39 @@
147e83
+/* Optimized strlen implementation for POWER8.
147e83
+   Copyright (C) 2016 Free Software Foundation, Inc.
147e83
+   This file is part of the GNU C Library.
147e83
+
147e83
+   The GNU C Library is free software; you can redistribute it and/or
147e83
+   modify it under the terms of the GNU Lesser General Public
147e83
+   License as published by the Free Software Foundation; either
147e83
+   version 2.1 of the License, or (at your option) any later version.
147e83
+
147e83
+   The GNU C Library is distributed in the hope that it will be useful,
147e83
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
147e83
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
147e83
+   Lesser General Public License for more details.
147e83
+
147e83
+   You should have received a copy of the GNU Lesser General Public
147e83
+   License along with the GNU C Library; if not, see
147e83
+   <http://www.gnu.org/licenses/>.  */
147e83
+
147e83
+#include <sysdep.h>
147e83
+
147e83
+#undef EALIGN
147e83
+#define EALIGN(name, alignt, words)				\
147e83
+  .section ".text";						\
147e83
+  ENTRY_2(__strlen_power8)					\
147e83
+  .align ALIGNARG(alignt);					\
147e83
+  EALIGN_W_##words;						\
147e83
+  BODY_LABEL(__strlen_power8):					\
147e83
+  cfi_startproc;						\
147e83
+  LOCALENTRY(__strlen_power8)
147e83
+#undef END
147e83
+#define END(name)						\
147e83
+  cfi_endproc;							\
147e83
+  TRACEBACK(__strlen_power8)					\
147e83
+  END_2(__strlen_power8)
147e83
+
147e83
+#undef libc_hidden_builtin_def
147e83
+#define libc_hidden_builtin_def(name)
147e83
+
147e83
+#include <sysdeps/powerpc/powerpc64/power8/strlen.S>
147e83
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
147e83
index 79a53d9..4b400a5 100644
147e83
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
147e83
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
147e83
@@ -29,11 +29,14 @@ extern __typeof (__redirect_strlen) __libc_strlen;
147e83
 
147e83
 extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
147e83
 extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
147e83
+extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
147e83
 
147e83
 libc_ifunc (__libc_strlen,
147e83
-            (hwcap & PPC_FEATURE_HAS_VSX)
147e83
-            ? __strlen_power7
147e83
-            : __strlen_ppc);
147e83
+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
147e83
+	    ? __strlen_power8 :
147e83
+	      (hwcap & PPC_FEATURE_HAS_VSX)
147e83
+	      ? __strlen_power7
147e83
+	      : __strlen_ppc);
147e83
 
147e83
 #undef strlen
147e83
 strong_alias (__libc_strlen, strlen)
147e83
diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S
147e83
new file mode 100644
147e83
index 0000000..0142747
147e83
--- /dev/null
147e83
+++ b/sysdeps/powerpc/powerpc64/power8/strlen.S
147e83
@@ -0,0 +1,297 @@
147e83
+/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
147e83
+   loop.
147e83
+   Copyright (C) 2016 Free Software Foundation, Inc.
147e83
+   This file is part of the GNU C Library.
147e83
+
147e83
+   The GNU C Library is free software; you can redistribute it and/or
147e83
+   modify it under the terms of the GNU Lesser General Public
147e83
+   License as published by the Free Software Foundation; either
147e83
+   version 2.1 of the License, or (at your option) any later version.
147e83
+
147e83
+   The GNU C Library is distributed in the hope that it will be useful,
147e83
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
147e83
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
147e83
+   Lesser General Public License for more details.
147e83
+
147e83
+   You should have received a copy of the GNU Lesser General Public
147e83
+   License along with the GNU C Library; if not, see
147e83
+   <http://www.gnu.org/licenses/>.  */
147e83
+
147e83
+#include <sysdep.h>
147e83
+
147e83
+/* TODO: change these to the actual instructions when the minimum required
147e83
+   binutils allows it.  */
147e83
+#define MFVRD(r,v)	.long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
147e83
+#define VBPERMQ(t,a,b)	.long (0x1000054c \
147e83
+			       | ((t)<<(32-11))	\
147e83
+			       | ((a)<<(32-16))	\
147e83
+			       | ((b)<<(32-21)) )
147e83
+
147e83
+/* size_t [r3] strlen (const char *s [r3])  */
147e83
+
147e83
+/* TODO: change this to .machine power8 when the minimum required binutils
147e83
+   allows it.  */
147e83
+	.machine  power7
147e83
+EALIGN (strlen, 4, 0)
147e83
+	CALL_MCOUNT 1
147e83
+	dcbt	0,r3
147e83
+	clrrdi	r4,r3,3	      /* Align the address to doubleword boundary.  */
147e83
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
147e83
+	li	r0,0	      /* Doubleword with null chars to use
147e83
+				 with cmpb.  */
147e83
+	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
147e83
+	ld	r12,0(r4)     /* Load doubleword from memory.  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	sld	r5,r5,r6
147e83
+#else
147e83
+	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
147e83
+#endif
147e83
+	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
147e83
+	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
147e83
+	cmpdi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
147e83
+	bne	cr7,L(done)
147e83
+
147e83
+	/* For shorter strings (< 64 bytes), we will not use vector registers,
147e83
+	   as the overhead isn't worth it.  So, let's use GPRs instead.  This
147e83
+	   will be done the same way as we do in the POWER7 implementation.
147e83
+	   Let's see if we are aligned to a quadword boundary.  If so, we can
147e83
+	   jump to the first (non-vectorized) loop.  Otherwise, we have to
147e83
+	   handle the next DWORD first.  */
147e83
+	mtcrf	0x01,r4
147e83
+	mr	r9,r4
147e83
+	addi	r9,r9,8
147e83
+	bt	28,L(align64)
147e83
+
147e83
+	/* Handle the next 8 bytes so we are aligned to a quadword
147e83
+	   boundary.  */
147e83
+	ldu	r5,8(r4)
147e83
+	cmpb	r10,r5,r0
147e83
+	cmpdi	cr7,r10,0
147e83
+	addi	r9,r9,8
147e83
+	bne	cr7,L(done)
147e83
+
147e83
+L(align64):
147e83
+	/* Proceed to the old (POWER7) implementation, checking two doublewords
147e83
+	   per iteration.  For the first 56 bytes, we will just check for null
147e83
+	   characters.  After that, we will also check if we are 64-byte aligned
147e83
+	   so we can jump to the vectorized implementation.  We will unroll
147e83
+	   these loops to avoid excessive branching.  */
147e83
+	ld	r6,8(r4)
147e83
+	ldu	r5,16(r4)
147e83
+	cmpb	r10,r6,r0
147e83
+	cmpb	r11,r5,r0
147e83
+	or	r5,r10,r11
147e83
+	cmpdi	cr7,r5,0
147e83
+	addi	r9,r9,16
147e83
+	bne	cr7,L(dword_zero)
147e83
+
147e83
+	ld	r6,8(r4)
147e83
+	ldu	r5,16(r4)
147e83
+	cmpb	r10,r6,r0
147e83
+	cmpb	r11,r5,r0
147e83
+	or	r5,r10,r11
147e83
+	cmpdi	cr7,r5,0
147e83
+	addi	r9,r9,16
147e83
+	bne	cr7,L(dword_zero)
147e83
+
147e83
+	ld	r6,8(r4)
147e83
+	ldu	r5,16(r4)
147e83
+	cmpb	r10,r6,r0
147e83
+	cmpb	r11,r5,r0
147e83
+	or	r5,r10,r11
147e83
+	cmpdi	cr7,r5,0
147e83
+	addi	r9,r9,16
147e83
+	bne	cr7,L(dword_zero)
147e83
+
147e83
+	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
147e83
+	   Note: aligning to 64-byte will necessarily slow down performance for
147e83
+	   strings around 64 bytes in length due to the extra comparisons
147e83
+	   required to check alignment for the vectorized loop.  This is a
147e83
+	   necessary tradeoff we are willing to take in order to speed up the
147e83
+	   calculation for larger strings.  */
147e83
+	andi.	r10,r9,63
147e83
+	beq	cr0,L(preloop)
147e83
+	ld	r6,8(r4)
147e83
+	ldu	r5,16(r4)
147e83
+	cmpb	r10,r6,r0
147e83
+	cmpb	r11,r5,r0
147e83
+	or	r5,r10,r11
147e83
+	cmpdi	cr7,r5,0
147e83
+	addi	r9,r9,16
147e83
+	bne	cr7,L(dword_zero)
147e83
+
147e83
+	andi.	r10,r9,63
147e83
+	beq	cr0,L(preloop)
147e83
+	ld	r6,8(r4)
147e83
+	ldu	r5,16(r4)
147e83
+	cmpb	r10,r6,r0
147e83
+	cmpb	r11,r5,r0
147e83
+	or	r5,r10,r11
147e83
+	cmpdi	cr7,r5,0
147e83
+	addi	r9,r9,16
147e83
+	bne	cr7,L(dword_zero)
147e83
+
147e83
+	andi.	r10,r9,63
147e83
+	beq	cr0,L(preloop)
147e83
+	ld	r6,8(r4)
147e83
+	ldu	r5,16(r4)
147e83
+	cmpb	r10,r6,r0
147e83
+	cmpb	r11,r5,r0
147e83
+	or	r5,r10,r11
147e83
+	cmpdi	cr7,r5,0
147e83
+	addi	r9,r9,16
147e83
+	bne	cr7,L(dword_zero)
147e83
+
147e83
+	andi.	r10,r9,63
147e83
+	beq	cr0,L(preloop)
147e83
+	ld	r6,8(r4)
147e83
+	ldu	r5,16(r4)
147e83
+	cmpb	r10,r6,r0
147e83
+	cmpb	r11,r5,r0
147e83
+	or	r5,r10,r11
147e83
+	cmpdi	cr7,r5,0
147e83
+	addi	r9,r9,16
147e83
+
147e83
+	/* At this point, we are necessarily 64-byte aligned.  If no zeroes were
147e83
+	   found, jump to the vectorized loop.  */
147e83
+	beq	cr7,L(preloop)
147e83
+
147e83
+L(dword_zero):
147e83
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
147e83
+	   the first doubleword and decrement the address in case the first
147e83
+	   doubleword really contains a null byte.  */
147e83
+
147e83
+	cmpdi	cr6,r10,0
147e83
+	addi	r4,r4,-8
147e83
+	bne	cr6,L(done)
147e83
+
147e83
+	/* The null byte must be in the second doubleword.  Adjust the address
147e83
+	   again and move the result of cmpb to r10 so we can calculate the
147e83
+	   length.  */
147e83
+
147e83
+	mr	r10,r11
147e83
+	addi	r4,r4,8
147e83
+
147e83
+	/* If the null byte was found in the non-vectorized code, compute the
147e83
+	   final length.  r10 has the output of the cmpb instruction, that is,
147e83
+	   it contains 0xff in the same position as the null byte in the
147e83
+	   original doubleword from the string.  Use that to calculate the
147e83
+	   length.  */
147e83
+L(done):
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
147e83
+	andc	r9, r9,r10
147e83
+	popcntd	r0, r9	      /* Count the bits in the mask.  */
147e83
+#else
147e83
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
147e83
+#endif
147e83
+	subf	r5,r3,r4
147e83
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
147e83
+	add	r3,r5,r0      /* Compute final length.  */
147e83
+	blr
147e83
+
147e83
+	/* Vectorized implementation starts here.  */
147e83
+	.p2align  4
147e83
+L(preloop):
147e83
+	/* Set up for the loop.  */
147e83
+	mr	r4,r9
147e83
+	li	r7, 16	      /* Load required offsets.  */
147e83
+	li	r8, 32
147e83
+	li	r9, 48
147e83
+	li	r12, 8
147e83
+	vxor	v0,v0,v0      /* VR with null chars to use with
147e83
+				 vcmpequb.  */
147e83
+
147e83
+	/* Main loop to look for the end of the string.  We will read in
147e83
+	   64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
147e83
+	   leverage the icache performance.  */
147e83
+	.p2align  5
147e83
+L(loop):
147e83
+	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
147e83
+	lvx	  v2,r4,r7
147e83
+	lvx	  v3,r4,r8
147e83
+	lvx	  v4,r4,r9
147e83
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
147e83
+	vminub	  v6,v3,v4
147e83
+	vminub	  v7,v5,v6
147e83
+	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
147e83
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
147e83
+	bne	  cr6,L(vmx_zero)
147e83
+
147e83
+	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
147e83
+	lvx	  v2,r4,r7
147e83
+	lvx	  v3,r4,r8
147e83
+	lvx	  v4,r4,r9
147e83
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
147e83
+	vminub	  v6,v3,v4
147e83
+	vminub	  v7,v5,v6
147e83
+	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
147e83
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
147e83
+	bne	  cr6,L(vmx_zero)
147e83
+
147e83
+	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
147e83
+	lvx	  v2,r4,r7
147e83
+	lvx	  v3,r4,r8
147e83
+	lvx	  v4,r4,r9
147e83
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
147e83
+	vminub	  v6,v3,v4
147e83
+	vminub	  v7,v5,v6
147e83
+	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
147e83
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
147e83
+	beq	  cr6,L(loop)
147e83
+
147e83
+L(vmx_zero):
147e83
+	/* OK, we found a null byte.  Let's look for it in the current 64-byte
147e83
+	   block and mark it in its corresponding VR.  */
147e83
+	vcmpequb  v1,v1,v0
147e83
+	vcmpequb  v2,v2,v0
147e83
+	vcmpequb  v3,v3,v0
147e83
+	vcmpequb  v4,v4,v0
147e83
+
147e83
+	/* We will now 'compress' the result into a single doubleword, so it
147e83
+	   can be moved to a GPR for the final calculation.  First, we
147e83
+	   generate an appropriate mask for vbpermq, so we can permute bits into
147e83
+	   the first halfword.  */
147e83
+	vspltisb  v10,3
147e83
+	lvsl	  v11,r0,r0
147e83
+	vslb	  v10,v11,v10
147e83
+
147e83
+	/* Permute the first bit of each byte into bits 48-63.  */
147e83
+	VBPERMQ(v1,v1,v10)
147e83
+	VBPERMQ(v2,v2,v10)
147e83
+	VBPERMQ(v3,v3,v10)
147e83
+	VBPERMQ(v4,v4,v10)
147e83
+
147e83
+	/* Shift each component into its correct position for merging.  */
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	vsldoi  v2,v2,v2,2
147e83
+	vsldoi  v3,v3,v3,4
147e83
+	vsldoi  v4,v4,v4,6
147e83
+#else
147e83
+	vsldoi	v1,v1,v1,6
147e83
+	vsldoi	v2,v2,v2,4
147e83
+	vsldoi	v3,v3,v3,2
147e83
+#endif
147e83
+
147e83
+	/* Merge the results and move to a GPR.  */
147e83
+	vor	v1,v2,v1
147e83
+	vor	v2,v3,v4
147e83
+	vor	v4,v1,v2
147e83
+	MFVRD(r10,v4)
147e83
+
147e83
+	 /* Adjust address to the beginning of the current 64-byte block.  */
147e83
+	addi	r4,r4,-64
147e83
+
147e83
+#ifdef __LITTLE_ENDIAN__
147e83
+	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
147e83
+	andc	r9, r9,r10
147e83
+	popcntd	r0, r9	      /* Count the bits in the mask.  */
147e83
+#else
147e83
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
147e83
+#endif
147e83
+	subf	r5,r3,r4
147e83
+	add	r3,r5,r0      /* Compute final length.  */
147e83
+	blr
147e83
+
147e83
+END (strlen)
147e83
+libc_hidden_builtin_def (strlen)
147e83
-- 
147e83
2.1.0
147e83