Blame SOURCES/glibc-rh1385004-16.patch

147e83
From 2ffa8b8660a7a17572ae5a398171c8be59985eb3 Mon Sep 17 00:00:00 2001
147e83
From: "Gabriel F. T. Gomes" <gftg@linux.vnet.ibm.com>
147e83
Date: Mon, 25 Jan 2016 10:50:34 -0500
147e83
Subject: [PATCH] powerpc: Zero pad using memset in strncpy/stpncpy
147e83
147e83
Call __memset_power8 to pad, with zeros, the remaining bytes in the
147e83
dest string on __strncpy_power8 and __stpncpy_power8.  This improves
147e83
performance when n is larger than the input string, giving ~30% gain for
147e83
larger strings without much impact on shorter strings.
147e83
147e83
(cherry picked from commit 72c11b353ede72931cc474c9071d143d9a05c0d7)
147e83
---
147e83
 ChangeLog                                  |   5 ++
147e83
 sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
147e83
 2 files changed, 61 insertions(+), 67 deletions(-)
147e83
147e83
diff --git a/ChangeLog b/ChangeLog
147e83
index 5537fc6..8d0e296 100644
147e83
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
147e83
index 5fda953..80136cc 100644
147e83
--- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
147e83
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
147e83
@@ -24,6 +24,8 @@
147e83
 # define FUNC_NAME strncpy
147e83
 #endif
147e83
 
147e83
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
147e83
+
147e83
 /* Implements the function
147e83
 
147e83
    char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
147e83
@@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
147e83
 	addi	r10,r4,16
147e83
 	rlwinm	r9,r4,0,19,19
147e83
 
147e83
-	/* Since it is a leaf function, save some non-volatile registers on the
147e83
-	   protected/red zone.  */
147e83
+	/* Save some non-volatile registers on the stack.  */
147e83
 	std	r26,-48(r1)
147e83
 	std	r27,-40(r1)
147e83
 
147e83
@@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
147e83
 	std	r30,-16(r1)
147e83
 	std	r31,-8(r1)
147e83
 
147e83
+	/* Update CFI.  */
147e83
+	cfi_offset(r26, -48)
147e83
+	cfi_offset(r27, -40)
147e83
+	cfi_offset(r28, -32)
147e83
+	cfi_offset(r29, -24)
147e83
+	cfi_offset(r30, -16)
147e83
+	cfi_offset(r31, -8)
147e83
+
147e83
 	beq	cr7,L(unaligned_lt_16)
147e83
 	rldicl	r9,r4,0,61
147e83
 	subfic	r8,r9,8
147e83
@@ -144,74 +153,58 @@ L(short_path_loop_end):
147e83
 	ld	r31,-8(r1)
147e83
 	blr
147e83
 
147e83
-	/* This code pads the remainder dest with NULL bytes.  The algorithm
147e83
-	   calculate the remanining size and issues a doubleword unrolled
147e83
-	   loops followed by a byte a byte set.  */
147e83
+	/* This code pads the remainder of dest with NULL bytes.  The algorithm
147e83
+	   calculates the remaining size and calls memset.  */
147e83
 	.align	4
147e83
 L(zero_pad_start):
147e83
 	mr	r5,r10
147e83
 	mr	r9,r6
147e83
 L(zero_pad_start_1):
147e83
-	srdi.	r8,r5,r3
147e83
-	mr	r10,r9
147e83
-#ifdef USE_AS_STPNCPY
147e83
-	mr	r3,r9
147e83
+	/* At this point:
147e83
+	     - r5 holds the number of bytes that still have to be written to
147e83
+	       dest.
147e83
+	     - r9 points to the position, in dest, where the first null byte
147e83
+	       will be written.
147e83
+	   The above statements are true both when control reaches this label
147e83
+	   from a branch or when falling through the previous lines.  */
147e83
+#ifndef USE_AS_STPNCPY
147e83
+	mr	r30,r3       /* Save the return value of strncpy.  */
147e83
+#endif
147e83
+	/* Prepare the call to memset.  */
147e83
+	mr	r3,r9        /* Pointer to the area to be zero-filled.  */
147e83
+	li	r4,0         /* Byte to be written (zero).  */
147e83
+
147e83
+	/* We delayed the creation of the stack frame, as well as the saving of
147e83
+	   the link register, because only at this point, we are sure that
147e83
+	   doing so is actually needed.  */
147e83
+
147e83
+	/* Save the link register.  */
147e83
+	mflr	r0
147e83
+	std	r0,16(r1)
147e83
+	cfi_offset(lr, 16)
147e83
+
147e83
+	/* Create the stack frame.  */
147e83
+	stdu	r1,-FRAMESIZE(r1)
147e83
+	cfi_adjust_cfa_offset(FRAMESIZE)
147e83
+
147e83
+	bl	__memset_power8
147e83
+	nop
147e83
+
147e83
+	/* Restore the stack frame.  */
147e83
+	addi	r1,r1,FRAMESIZE
147e83
+	cfi_adjust_cfa_offset(-FRAMESIZE)
147e83
+	/* Restore the link register.  */
147e83
+	ld	r0,16(r1)
147e83
+	mtlr	r0
147e83
+
147e83
+#ifndef USE_AS_STPNCPY
147e83
+	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
147e83
+				dest.  For stpncpy, the return value is the
147e83
+				same as return value of memset.  */
147e83
 #endif
147e83
-	beq-	cr0,L(zero_pad_loop_b_start)
147e83
-	cmpldi	cr7,r8,1
147e83
-	li	cr7,0
147e83
-	std	r7,0(r9)
147e83
-	beq	cr7,L(zero_pad_loop_b_prepare)
147e83
-	addic.	r8,r8,-2
147e83
-	addi	r10,r9,r16
147e83
-	std	r7,8(r9)
147e83
-	beq	cr0,L(zero_pad_loop_dw_2)
147e83
-	std	r7,16(r9)
147e83
-	li	r9,0
147e83
-	b	L(zero_pad_loop_dw_1)
147e83
-
147e83
-	.align	4
147e83
-L(zero_pad_loop_dw):
147e83
-	addi	r10,r10,16
147e83
-	std	r9,-8(r10)
147e83
-	beq	cr0,L(zero_pad_loop_dw_2)
147e83
-	std	r9,0(r10)
147e83
-L(zero_pad_loop_dw_1):
147e83
-	cmpldi	cr7,r8,1
147e83
-	std	r9,0(r10)
147e83
-	addic.	r8,r8,-2
147e83
-	bne	cr7,L(zero_pad_loop_dw)
147e83
-	addi	r10,r10,8
147e83
-L(zero_pad_loop_dw_2):
147e83
-	rldicl	r5,r5,0,61
147e83
-L(zero_pad_loop_b_start):
147e83
-	cmpdi	cr7,r5,0
147e83
-	addi	r5,r5,-1
147e83
-	addi	r9,r10,-1
147e83
-	add	r10,r10,5
147e83
-	subf	r10,r9,r10
147e83
-	li	r8,0
147e83
-	beq-	cr7,L(short_path_loop_end)
147e83
-
147e83
-	/* Write remaining 1-8 bytes.  */
147e83
-        .align  4
147e83
-	addi	r9,r9,1
147e83
-	mtocrf	0x1,r10
147e83
-	bf	29,4f
147e83
-        stw     r8,0(r9)
147e83
-        addi	r9,r9,4
147e83
-
147e83
-        .align  4
147e83
-4:      bf      30,2f
147e83
-        sth     r8,0(r9)
147e83
-        addi	r9,r9,2
147e83
-
147e83
-        .align  4
147e83
-2:      bf	31,1f
147e83
-        stb	r8,0(r9)
147e83
 
147e83
-	/* Restore non-volatile registers.  */
147e83
-1:	ld	r26,-48(r1)
147e83
+	/* Restore non-volatile registers and return.  */
147e83
+	ld	r26,-48(r1)
147e83
 	ld	r27,-40(r1)
147e83
 	ld	r28,-32(r1)
147e83
 	ld	r29,-24(r1)
147e83
@@ -407,10 +400,6 @@ L(short_path_prepare_2_3):
147e83
 	mr	r4,r28
147e83
 	mr	r9,r29
147e83
 	b	L(short_path_2)
147e83
-L(zero_pad_loop_b_prepare):
147e83
-	addi	r10,r9,8
147e83
-	rldicl	r5,r5,0,61
147e83
-	b	L(zero_pad_loop_b_start)
147e83
 L(zero_pad_start_prepare_1):
147e83
 	mr	r5,r6
147e83
 	mr	r9,r8
147e83
-- 
147e83
2.1.0
147e83