Blame SOURCES/glibc-rh1335286.patch

147e83
From 143ce75a4203a78d79549b00e570a5bb429c44cf Mon Sep 17 00:00:00 2001
147e83
From: Ondrej Bilka <neleai@seznam.cz>
147e83
Date: Mon, 20 May 2013 08:26:00 +0200
147e83
Subject: [PATCH] Faster memset on x64
147e83
147e83
This implementation speeds up memset in several ways. The first is
147e83
avoiding an expensive computed jump. The second is using the fact that the
147e83
arguments of memset are most of the time aligned to 8 bytes.
147e83
147e83
Benchmark results on:
147e83
147e83
kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_result27_04_13.tar.bz2
147e83
147e83
(cherry picked from commit b2b671b677d92429a3d41bf451668f476aa267ed)
147e83
---
147e83
 sysdeps/x86_64/memset.S | 1406 +++--------------------------------------------
147e83
 1 file changed, 91 insertions(+), 1315 deletions(-)
147e83
147e83
Index: glibc-2.17-c758a686/sysdeps/x86_64/memset.S
147e83
===================================================================
147e83
--- glibc-2.17-c758a686.orig/sysdeps/x86_64/memset.S
147e83
+++ glibc-2.17-c758a686/sysdeps/x86_64/memset.S
147e83
@@ -19,17 +19,31 @@
147e83
 
147e83
 #include <sysdep.h>
147e83
 
147e83
-#define __STOS_LOWER_BOUNDARY	$8192
147e83
-#define __STOS_UPPER_BOUNDARY	$65536
147e83
+#ifndef ALIGN
147e83
+# define ALIGN(n) .p2align n
147e83
+#endif
147e83
 
147e83
 	.text
147e83
 #if IS_IN (libc) && !defined USE_MULTIARCH
147e83
ENTRY(__bzero)
147e83
-	mov	%rsi,%rdx	/* Adjust parameter.  */
147e83
-	xorl	%esi,%esi	/* Fill with 0s.  */
147e83
-	jmp	L(memset_entry)
147e83
+	movq	%rdi, %rax /* Set return value.  */
147e83
+	movq	%rsi, %rdx /* Set n.  */
147e83
+	pxor	%xmm8, %xmm8 /* Zero all 16 bytes of %xmm8 (bzero fills with 0s).  */
147e83
+	jmp	L(entry_from_bzero) /* Continue at the entry label also used by __memset_tail.  */
147e83
 END(__bzero)
147e83
 weak_alias (__bzero, bzero)
147e83
+
147e83
+/* Like memset, but takes an additional parameter (in %rcx) holding the value to return.  */
147e83
+ENTRY(__memset_tail)
147e83
+	movq	%rcx, %rax /* Set return value.  */
147e83
+
147e83
+	movd	%esi, %xmm8 /* Low dword of %xmm8 = %esi (the fill value).  */
147e83
+	punpcklbw	%xmm8, %xmm8 /* Pair each low byte with itself: fill byte now covers the low word.  */
147e83
+	punpcklwd	%xmm8, %xmm8 /* Pair each low word with itself: fill byte now covers the low dword.  */
147e83
+	pshufd	$0, %xmm8, %xmm8 /* Broadcast the low dword to all four dwords of %xmm8.  */
147e83
+
147e83
+	jmp	L(entry_from_bzero) /* Continue at the entry label also used by __bzero.  */
147e83
+END(__memset_tail)
147e83
 #endif
147e83
 
147e83
 #if defined PIC && IS_IN (libc)
147e83
@@ -38,1318 +52,80 @@ ENTRY_CHK (__memset_chk)
147e83
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
147e83
 END_CHK (__memset_chk)
147e83
 #endif
147e83
-ENTRY (memset)
147e83
-L(memset_entry):
147e83
-	cmp    $0x1,%rdx
147e83
-	mov    %rdi,%rax	/* memset returns the dest address.  */
147e83
-	jne    L(ck2)
147e83
-	mov    %sil,(%rdi)
147e83
-	retq
147e83
-L(ck2):
147e83
-	mov    $0x101010101010101,%r9
147e83
-	mov    %rdx,%r8
147e83
-	movzbq %sil,%rdx
147e83
-	imul   %r9,%rdx
147e83
-L(now_dw_aligned):
147e83
-	cmp    $0x90,%r8
147e83
-	ja     L(ck_mem_ops_method)
147e83
-L(now_dw_aligned_small):
147e83
-	add    %r8,%rdi
147e83
-#ifndef PIC
147e83
-	lea    L(setPxQx)(%rip),%r11
147e83
-	jmpq   *(%r11,%r8,8)
147e83
-#else
147e83
-	lea    L(Got0)(%rip),%r11
147e83
-	lea    L(setPxQx)(%rip),%rcx
147e83
-	movswq (%rcx,%r8,2),%rcx
147e83
-	lea    (%rcx,%r11,1),%r11
147e83
-	jmpq   *%r11
147e83
-#endif
147e83
-
147e83
-L(Got0):
147e83
-	retq
147e83
-
147e83
-	.pushsection .rodata
147e83
-	.balign     16
147e83
-#ifndef PIC
147e83
-L(setPxQx):
147e83
-	.quad       L(Got0), L(P1Q0), L(P2Q0), L(P3Q0)
147e83
-	.quad       L(P4Q0), L(P5Q0), L(P6Q0), L(P7Q0)
147e83
-	.quad       L(P0Q1), L(P1Q1), L(P2Q1), L(P3Q1)
147e83
-	.quad       L(P4Q1), L(P5Q1), L(P6Q1), L(P7Q1)
147e83
-	.quad       L(P0Q2), L(P1Q2), L(P2Q2), L(P3Q2)
147e83
-	.quad       L(P4Q2), L(P5Q2), L(P6Q2), L(P7Q2)
147e83
-	.quad       L(P0Q3), L(P1Q3), L(P2Q3), L(P3Q3)
147e83
-	.quad       L(P4Q3), L(P5Q3), L(P6Q3), L(P7Q3)
147e83
-	.quad       L(P0Q4), L(P1Q4), L(P2Q4), L(P3Q4)
147e83
-	.quad       L(P4Q4), L(P5Q4), L(P6Q4), L(P7Q4)
147e83
-	.quad       L(P0Q5), L(P1Q5), L(P2Q5), L(P3Q5)
147e83
-	.quad       L(P4Q5), L(P5Q5), L(P6Q5), L(P7Q5)
147e83
-	.quad       L(P0Q6), L(P1Q6), L(P2Q6), L(P3Q6)
147e83
-	.quad       L(P4Q6), L(P5Q6), L(P6Q6), L(P7Q6)
147e83
-	.quad       L(P0Q7), L(P1Q7), L(P2Q7), L(P3Q7)
147e83
-	.quad       L(P4Q7), L(P5Q7), L(P6Q7), L(P7Q7)
147e83
-	.quad       L(P0Q8), L(P1Q8), L(P2Q8), L(P3Q8)
147e83
-	.quad       L(P4Q8), L(P5Q8), L(P6Q8), L(P7Q8)
147e83
-	.quad       L(P0Q9), L(P1Q9), L(P2Q9), L(P3Q9)
147e83
-	.quad       L(P4Q9), L(P5Q9), L(P6Q9), L(P7Q9)
147e83
-	.quad       L(P0QA), L(P1QA), L(P2QA), L(P3QA)
147e83
-	.quad       L(P4QA), L(P5QA), L(P6QA), L(P7QA)
147e83
-	.quad       L(P0QB), L(P1QB), L(P2QB), L(P3QB)
147e83
-	.quad       L(P4QB), L(P5QB), L(P6QB), L(P7QB)
147e83
-	.quad       L(P0QC), L(P1QC), L(P2QC), L(P3QC)
147e83
-	.quad       L(P4QC), L(P5QC), L(P6QC), L(P7QC)
147e83
-	.quad       L(P0QD), L(P1QD), L(P2QD), L(P3QD)
147e83
-	.quad       L(P4QD), L(P5QD), L(P6QD), L(P7QD)
147e83
-	.quad       L(P0QE), L(P1QE), L(P2QE), L(P3QE)
147e83
-	.quad       L(P4QE), L(P5QE), L(P6QE), L(P7QE)
147e83
-	.quad       L(P0QF), L(P1QF), L(P2QF), L(P3QF)
147e83
-	.quad       L(P4QF), L(P5QF), L(P6QF), L(P7QF)
147e83
-	.quad       L(P0QG), L(P1QG), L(P2QG), L(P3QG)
147e83
-	.quad       L(P4QG), L(P5QG), L(P6QG), L(P7QG)
147e83
-	.quad       L(P0QH), L(P1QH), L(P2QH), L(P3QH)
147e83
-	.quad       L(P4QH), L(P5QH), L(P6QH), L(P7QH)
147e83
-	.quad       L(P0QI)
147e83
-# ifdef USE_EXTRA_TABLE
147e83
-	.quad       L(P1QI), L(P2QI), L(P3QI), L(P4QI)
147e83
-	.quad       L(P5QI), L(P6QI), L(P7QI)
147e83
-# endif
147e83
-#else
147e83
-L(setPxQx):
147e83
-	.short     L(Got0)-L(Got0)
147e83
-	.short     L(P1Q0)-L(Got0)
147e83
-	.short     L(P2Q0)-L(Got0)
147e83
-	.short     L(P3Q0)-L(Got0)
147e83
-	.short     L(P4Q0)-L(Got0)
147e83
-	.short     L(P5Q0)-L(Got0)
147e83
-	.short     L(P6Q0)-L(Got0)
147e83
-	.short     L(P7Q0)-L(Got0)
147e83
-
147e83
-	.short     L(P0Q1)-L(Got0)
147e83
-	.short     L(P1Q1)-L(Got0)
147e83
-	.short     L(P2Q1)-L(Got0)
147e83
-	.short     L(P3Q1)-L(Got0)
147e83
-	.short     L(P4Q1)-L(Got0)
147e83
-	.short     L(P5Q1)-L(Got0)
147e83
-	.short     L(P6Q1)-L(Got0)
147e83
-	.short     L(P7Q1)-L(Got0)
147e83
-
147e83
-	.short     L(P0Q2)-L(Got0)
147e83
-	.short     L(P1Q2)-L(Got0)
147e83
-	.short     L(P2Q2)-L(Got0)
147e83
-	.short     L(P3Q2)-L(Got0)
147e83
-	.short     L(P4Q2)-L(Got0)
147e83
-	.short     L(P5Q2)-L(Got0)
147e83
-	.short     L(P6Q2)-L(Got0)
147e83
-	.short     L(P7Q2)-L(Got0)
147e83
-
147e83
-	.short     L(P0Q3)-L(Got0)
147e83
-	.short     L(P1Q3)-L(Got0)
147e83
-	.short     L(P2Q3)-L(Got0)
147e83
-	.short     L(P3Q3)-L(Got0)
147e83
-	.short     L(P4Q3)-L(Got0)
147e83
-	.short     L(P5Q3)-L(Got0)
147e83
-	.short     L(P6Q3)-L(Got0)
147e83
-	.short     L(P7Q3)-L(Got0)
147e83
-
147e83
-	.short     L(P0Q4)-L(Got0)
147e83
-	.short     L(P1Q4)-L(Got0)
147e83
-	.short     L(P2Q4)-L(Got0)
147e83
-	.short     L(P3Q4)-L(Got0)
147e83
-	.short     L(P4Q4)-L(Got0)
147e83
-	.short     L(P5Q4)-L(Got0)
147e83
-	.short     L(P6Q4)-L(Got0)
147e83
-	.short     L(P7Q4)-L(Got0)
147e83
-
147e83
-	.short     L(P0Q5)-L(Got0)
147e83
-	.short     L(P1Q5)-L(Got0)
147e83
-	.short     L(P2Q5)-L(Got0)
147e83
-	.short     L(P3Q5)-L(Got0)
147e83
-	.short     L(P4Q5)-L(Got0)
147e83
-	.short     L(P5Q5)-L(Got0)
147e83
-	.short     L(P6Q5)-L(Got0)
147e83
-	.short     L(P7Q5)-L(Got0)
147e83
-
147e83
-	.short     L(P0Q6)-L(Got0)
147e83
-	.short     L(P1Q6)-L(Got0)
147e83
-	.short     L(P2Q6)-L(Got0)
147e83
-	.short     L(P3Q6)-L(Got0)
147e83
-	.short     L(P4Q6)-L(Got0)
147e83
-	.short     L(P5Q6)-L(Got0)
147e83
-	.short     L(P6Q6)-L(Got0)
147e83
-	.short     L(P7Q6)-L(Got0)
147e83
-
147e83
-	.short     L(P0Q7)-L(Got0)
147e83
-	.short     L(P1Q7)-L(Got0)
147e83
-	.short     L(P2Q7)-L(Got0)
147e83
-	.short     L(P3Q7)-L(Got0)
147e83
-	.short     L(P4Q7)-L(Got0)
147e83
-	.short     L(P5Q7)-L(Got0)
147e83
-	.short     L(P6Q7)-L(Got0)
147e83
-	.short     L(P7Q7)-L(Got0)
147e83
-
147e83
-	.short     L(P0Q8)-L(Got0)
147e83
-	.short     L(P1Q8)-L(Got0)
147e83
-	.short     L(P2Q8)-L(Got0)
147e83
-	.short     L(P3Q8)-L(Got0)
147e83
-	.short     L(P4Q8)-L(Got0)
147e83
-	.short     L(P5Q8)-L(Got0)
147e83
-	.short     L(P6Q8)-L(Got0)
147e83
-	.short     L(P7Q8)-L(Got0)
147e83
-
147e83
-	.short     L(P0Q9)-L(Got0)
147e83
-	.short     L(P1Q9)-L(Got0)
147e83
-	.short     L(P2Q9)-L(Got0)
147e83
-	.short     L(P3Q9)-L(Got0)
147e83
-	.short     L(P4Q9)-L(Got0)
147e83
-	.short     L(P5Q9)-L(Got0)
147e83
-	.short     L(P6Q9)-L(Got0)
147e83
-	.short     L(P7Q9)-L(Got0)
147e83
-
147e83
-	.short     L(P0QA)-L(Got0)
147e83
-	.short     L(P1QA)-L(Got0)
147e83
-	.short     L(P2QA)-L(Got0)
147e83
-	.short     L(P3QA)-L(Got0)
147e83
-	.short     L(P4QA)-L(Got0)
147e83
-	.short     L(P5QA)-L(Got0)
147e83
-	.short     L(P6QA)-L(Got0)
147e83
-	.short     L(P7QA)-L(Got0)
147e83
-
147e83
-	.short     L(P0QB)-L(Got0)
147e83
-	.short     L(P1QB)-L(Got0)
147e83
-	.short     L(P2QB)-L(Got0)
147e83
-	.short     L(P3QB)-L(Got0)
147e83
-	.short     L(P4QB)-L(Got0)
147e83
-	.short     L(P5QB)-L(Got0)
147e83
-	.short     L(P6QB)-L(Got0)
147e83
-	.short     L(P7QB)-L(Got0)
147e83
-
147e83
-	.short     L(P0QC)-L(Got0)
147e83
-	.short     L(P1QC)-L(Got0)
147e83
-	.short     L(P2QC)-L(Got0)
147e83
-	.short     L(P3QC)-L(Got0)
147e83
-	.short     L(P4QC)-L(Got0)
147e83
-	.short     L(P5QC)-L(Got0)
147e83
-	.short     L(P6QC)-L(Got0)
147e83
-	.short     L(P7QC)-L(Got0)
147e83
-
147e83
-	.short     L(P0QD)-L(Got0)
147e83
-	.short     L(P1QD)-L(Got0)
147e83
-	.short     L(P2QD)-L(Got0)
147e83
-	.short     L(P3QD)-L(Got0)
147e83
-	.short     L(P4QD)-L(Got0)
147e83
-	.short     L(P5QD)-L(Got0)
147e83
-	.short     L(P6QD)-L(Got0)
147e83
-	.short     L(P7QD)-L(Got0)
147e83
-
147e83
-	.short     L(P0QE)-L(Got0)
147e83
-	.short     L(P1QE)-L(Got0)
147e83
-	.short     L(P2QE)-L(Got0)
147e83
-	.short     L(P3QE)-L(Got0)
147e83
-	.short     L(P4QE)-L(Got0)
147e83
-	.short     L(P5QE)-L(Got0)
147e83
-	.short     L(P6QE)-L(Got0)
147e83
-	.short     L(P7QE)-L(Got0)
147e83
-
147e83
-	.short     L(P0QF)-L(Got0)
147e83
-	.short     L(P1QF)-L(Got0)
147e83
-	.short     L(P2QF)-L(Got0)
147e83
-	.short     L(P3QF)-L(Got0)
147e83
-	.short     L(P4QF)-L(Got0)
147e83
-	.short     L(P5QF)-L(Got0)
147e83
-	.short     L(P6QF)-L(Got0)
147e83
-	.short     L(P7QF)-L(Got0)
147e83
-
147e83
-	.short     L(P0QG)-L(Got0)
147e83
-	.short     L(P1QG)-L(Got0)
147e83
-	.short     L(P2QG)-L(Got0)
147e83
-	.short     L(P3QG)-L(Got0)
147e83
-	.short     L(P4QG)-L(Got0)
147e83
-	.short     L(P5QG)-L(Got0)
147e83
-	.short     L(P6QG)-L(Got0)
147e83
-	.short     L(P7QG)-L(Got0)
147e83
-
147e83
-	.short     L(P0QH)-L(Got0)
147e83
-	.short     L(P1QH)-L(Got0)
147e83
-	.short     L(P2QH)-L(Got0)
147e83
-	.short     L(P3QH)-L(Got0)
147e83
-	.short     L(P4QH)-L(Got0)
147e83
-	.short     L(P5QH)-L(Got0)
147e83
-	.short     L(P6QH)-L(Got0)
147e83
-	.short     L(P7QH)-L(Got0)
147e83
-
147e83
-	.short     L(P0QI)-L(Got0)
147e83
-# ifdef USE_EXTRA_TABLE
147e83
-	.short     L(P1QI)-L(Got0)
147e83
-	.short     L(P2QI)-L(Got0)
147e83
-	.short     L(P3QI)-L(Got0)
147e83
-	.short     L(P4QI)-L(Got0)
147e83
-	.short     L(P5QI)-L(Got0)
147e83
-	.short     L(P6QI)-L(Got0)
147e83
-	.short     L(P7QI)-L(Got0)
147e83
-# endif
147e83
-#endif
147e83
-	.popsection
147e83
-
147e83
-	.balign     16
147e83
-#ifdef USE_EXTRA_TABLE
147e83
-L(P1QI): mov    %rdx,-0x91(%rdi)
147e83
-#endif
147e83
-L(P1QH): mov    %rdx,-0x89(%rdi)
147e83
-L(P1QG): mov    %rdx,-0x81(%rdi)
147e83
-#		   .balign     16
147e83
-L(P1QF): mov    %rdx,-0x79(%rdi)
147e83
-L(P1QE): mov    %rdx,-0x71(%rdi)
147e83
-L(P1QD): mov    %rdx,-0x69(%rdi)
147e83
-L(P1QC): mov    %rdx,-0x61(%rdi)
147e83
-L(P1QB): mov    %rdx,-0x59(%rdi)
147e83
-L(P1QA): mov    %rdx,-0x51(%rdi)
147e83
-L(P1Q9): mov    %rdx,-0x49(%rdi)
147e83
-L(P1Q8): mov    %rdx,-0x41(%rdi)
147e83
-L(P1Q7): mov    %rdx,-0x39(%rdi)
147e83
-L(P1Q6): mov    %rdx,-0x31(%rdi)
147e83
-L(P1Q5): mov    %rdx,-0x29(%rdi)
147e83
-L(P1Q4): mov    %rdx,-0x21(%rdi)
147e83
-L(P1Q3): mov    %rdx,-0x19(%rdi)
147e83
-L(P1Q2): mov    %rdx,-0x11(%rdi)
147e83
-L(P1Q1): mov    %rdx,-0x9(%rdi)
147e83
-L(P1Q0): mov    %dl,-0x1(%rdi)
147e83
-		retq
147e83
-
147e83
-	.balign     16
147e83
-L(P0QI): mov    %rdx,-0x90(%rdi)
147e83
-L(P0QH): mov    %rdx,-0x88(%rdi)
147e83
-#		   .balign     16
147e83
-L(P0QG): mov    %rdx,-0x80(%rdi)
147e83
-L(P0QF): mov    %rdx,-0x78(%rdi)
147e83
-L(P0QE): mov    %rdx,-0x70(%rdi)
147e83
-L(P0QD): mov    %rdx,-0x68(%rdi)
147e83
-L(P0QC): mov    %rdx,-0x60(%rdi)
147e83
-L(P0QB): mov    %rdx,-0x58(%rdi)
147e83
-L(P0QA): mov    %rdx,-0x50(%rdi)
147e83
-L(P0Q9): mov    %rdx,-0x48(%rdi)
147e83
-L(P0Q8): mov    %rdx,-0x40(%rdi)
147e83
-L(P0Q7): mov    %rdx,-0x38(%rdi)
147e83
-L(P0Q6): mov    %rdx,-0x30(%rdi)
147e83
-L(P0Q5): mov    %rdx,-0x28(%rdi)
147e83
-L(P0Q4): mov    %rdx,-0x20(%rdi)
147e83
-L(P0Q3): mov    %rdx,-0x18(%rdi)
147e83
-L(P0Q2): mov    %rdx,-0x10(%rdi)
147e83
-L(P0Q1): mov    %rdx,-0x8(%rdi)
147e83
-L(P0Q0): retq
147e83
-
147e83
-
147e83
-	.balign     16
147e83
-#ifdef USE_EXTRA_TABLE
147e83
-L(P2QI): mov    %rdx,-0x92(%rdi)
147e83
-#endif
147e83
-L(P2QH): mov    %rdx,-0x8a(%rdi)
147e83
-L(P2QG): mov    %rdx,-0x82(%rdi)
147e83
-#		   .balign     16
147e83
-L(P2QF): mov    %rdx,-0x7a(%rdi)
147e83
-L(P2QE): mov    %rdx,-0x72(%rdi)
147e83
-L(P2QD): mov    %rdx,-0x6a(%rdi)
147e83
-L(P2QC): mov    %rdx,-0x62(%rdi)
147e83
-L(P2QB): mov    %rdx,-0x5a(%rdi)
147e83
-L(P2QA): mov    %rdx,-0x52(%rdi)
147e83
-L(P2Q9): mov    %rdx,-0x4a(%rdi)
147e83
-L(P2Q8): mov    %rdx,-0x42(%rdi)
147e83
-L(P2Q7): mov    %rdx,-0x3a(%rdi)
147e83
-L(P2Q6): mov    %rdx,-0x32(%rdi)
147e83
-L(P2Q5): mov    %rdx,-0x2a(%rdi)
147e83
-L(P2Q4): mov    %rdx,-0x22(%rdi)
147e83
-L(P2Q3): mov    %rdx,-0x1a(%rdi)
147e83
-L(P2Q2): mov    %rdx,-0x12(%rdi)
147e83
-L(P2Q1): mov    %rdx,-0xa(%rdi)
147e83
-L(P2Q0): mov    %dx,-0x2(%rdi)
147e83
-		retq
147e83
-
147e83
-	.balign     16
147e83
-#ifdef USE_EXTRA_TABLE
147e83
-L(P3QI): mov    %rdx,-0x93(%rdi)
147e83
-#endif
147e83
-L(P3QH): mov    %rdx,-0x8b(%rdi)
147e83
-L(P3QG): mov    %rdx,-0x83(%rdi)
147e83
-#		   .balign     16
147e83
-L(P3QF): mov    %rdx,-0x7b(%rdi)
147e83
-L(P3QE): mov    %rdx,-0x73(%rdi)
147e83
-L(P3QD): mov    %rdx,-0x6b(%rdi)
147e83
-L(P3QC): mov    %rdx,-0x63(%rdi)
147e83
-L(P3QB): mov    %rdx,-0x5b(%rdi)
147e83
-L(P3QA): mov    %rdx,-0x53(%rdi)
147e83
-L(P3Q9): mov    %rdx,-0x4b(%rdi)
147e83
-L(P3Q8): mov    %rdx,-0x43(%rdi)
147e83
-L(P3Q7): mov    %rdx,-0x3b(%rdi)
147e83
-L(P3Q6): mov    %rdx,-0x33(%rdi)
147e83
-L(P3Q5): mov    %rdx,-0x2b(%rdi)
147e83
-L(P3Q4): mov    %rdx,-0x23(%rdi)
147e83
-L(P3Q3): mov    %rdx,-0x1b(%rdi)
147e83
-L(P3Q2): mov    %rdx,-0x13(%rdi)
147e83
-L(P3Q1): mov    %rdx,-0xb(%rdi)
147e83
-L(P3Q0): mov    %dx,-0x3(%rdi)
147e83
-		mov    %dl,-0x1(%rdi)
147e83
-		retq
147e83
-
147e83
-	.balign     16
147e83
-#ifdef USE_EXTRA_TABLE
147e83
-L(P4QI): mov    %rdx,-0x94(%rdi)
147e83
-#endif
147e83
-L(P4QH): mov    %rdx,-0x8c(%rdi)
147e83
-L(P4QG): mov    %rdx,-0x84(%rdi)
147e83
-#		   .balign     16
147e83
-L(P4QF): mov    %rdx,-0x7c(%rdi)
147e83
-L(P4QE): mov    %rdx,-0x74(%rdi)
147e83
-L(P4QD): mov    %rdx,-0x6c(%rdi)
147e83
-L(P4QC): mov    %rdx,-0x64(%rdi)
147e83
-L(P4QB): mov    %rdx,-0x5c(%rdi)
147e83
-L(P4QA): mov    %rdx,-0x54(%rdi)
147e83
-L(P4Q9): mov    %rdx,-0x4c(%rdi)
147e83
-L(P4Q8): mov    %rdx,-0x44(%rdi)
147e83
-L(P4Q7): mov    %rdx,-0x3c(%rdi)
147e83
-L(P4Q6): mov    %rdx,-0x34(%rdi)
147e83
-L(P4Q5): mov    %rdx,-0x2c(%rdi)
147e83
-L(P4Q4): mov    %rdx,-0x24(%rdi)
147e83
-L(P4Q3): mov    %rdx,-0x1c(%rdi)
147e83
-L(P4Q2): mov    %rdx,-0x14(%rdi)
147e83
-L(P4Q1): mov    %rdx,-0xc(%rdi)
147e83
-L(P4Q0): mov    %edx,-0x4(%rdi)
147e83
-		retq
147e83
-
147e83
-	.balign     16
147e83
-#ifdef USE_EXTRA_TABLE
147e83
-L(P5QI): mov    %rdx,-0x95(%rdi)
147e83
-#endif
147e83
-L(P5QH): mov    %rdx,-0x8d(%rdi)
147e83
-L(P5QG): mov    %rdx,-0x85(%rdi)
147e83
-#		   .balign     16
147e83
-L(P5QF): mov    %rdx,-0x7d(%rdi)
147e83
-L(P5QE): mov    %rdx,-0x75(%rdi)
147e83
-L(P5QD): mov    %rdx,-0x6d(%rdi)
147e83
-L(P5QC): mov    %rdx,-0x65(%rdi)
147e83
-L(P5QB): mov    %rdx,-0x5d(%rdi)
147e83
-L(P5QA): mov    %rdx,-0x55(%rdi)
147e83
-L(P5Q9): mov    %rdx,-0x4d(%rdi)
147e83
-L(P5Q8): mov    %rdx,-0x45(%rdi)
147e83
-L(P5Q7): mov    %rdx,-0x3d(%rdi)
147e83
-L(P5Q6): mov    %rdx,-0x35(%rdi)
147e83
-L(P5Q5): mov    %rdx,-0x2d(%rdi)
147e83
-L(P5Q4): mov    %rdx,-0x25(%rdi)
147e83
-L(P5Q3): mov    %rdx,-0x1d(%rdi)
147e83
-L(P5Q2): mov    %rdx,-0x15(%rdi)
147e83
-L(P5Q1): mov    %rdx,-0xd(%rdi)
147e83
-L(P5Q0): mov    %edx,-0x5(%rdi)
147e83
-		mov    %dl,-0x1(%rdi)
147e83
-		retq
147e83
-
147e83
-	.balign     16
147e83
-#ifdef USE_EXTRA_TABLE
147e83
-L(P6QI): mov    %rdx,-0x96(%rdi)
147e83
-#endif
147e83
-L(P6QH): mov    %rdx,-0x8e(%rdi)
147e83
-L(P6QG): mov    %rdx,-0x86(%rdi)
147e83
-#		   .balign     16
147e83
-L(P6QF): mov    %rdx,-0x7e(%rdi)
147e83
-L(P6QE): mov    %rdx,-0x76(%rdi)
147e83
-L(P6QD): mov    %rdx,-0x6e(%rdi)
147e83
-L(P6QC): mov    %rdx,-0x66(%rdi)
147e83
-L(P6QB): mov    %rdx,-0x5e(%rdi)
147e83
-L(P6QA): mov    %rdx,-0x56(%rdi)
147e83
-L(P6Q9): mov    %rdx,-0x4e(%rdi)
147e83
-L(P6Q8): mov    %rdx,-0x46(%rdi)
147e83
-L(P6Q7): mov    %rdx,-0x3e(%rdi)
147e83
-L(P6Q6): mov    %rdx,-0x36(%rdi)
147e83
-L(P6Q5): mov    %rdx,-0x2e(%rdi)
147e83
-L(P6Q4): mov    %rdx,-0x26(%rdi)
147e83
-L(P6Q3): mov    %rdx,-0x1e(%rdi)
147e83
-L(P6Q2): mov    %rdx,-0x16(%rdi)
147e83
-L(P6Q1): mov    %rdx,-0xe(%rdi)
147e83
-L(P6Q0): mov    %edx,-0x6(%rdi)
147e83
-		mov    %dx,-0x2(%rdi)
147e83
-		retq
147e83
-
147e83
-	.balign     16
147e83
-#ifdef USE_EXTRA_TABLE
147e83
-L(P7QI): mov    %rdx,-0x97(%rdi)
147e83
-#endif
147e83
-L(P7QH): mov    %rdx,-0x8f(%rdi)
147e83
-L(P7QG): mov    %rdx,-0x87(%rdi)
147e83
-#		   .balign     16
147e83
-L(P7QF): mov    %rdx,-0x7f(%rdi)
147e83
-L(P7QE): mov    %rdx,-0x77(%rdi)
147e83
-L(P7QD): mov    %rdx,-0x6f(%rdi)
147e83
-L(P7QC): mov    %rdx,-0x67(%rdi)
147e83
-L(P7QB): mov    %rdx,-0x5f(%rdi)
147e83
-L(P7QA): mov    %rdx,-0x57(%rdi)
147e83
-L(P7Q9): mov    %rdx,-0x4f(%rdi)
147e83
-L(P7Q8): mov    %rdx,-0x47(%rdi)
147e83
-L(P7Q7): mov    %rdx,-0x3f(%rdi)
147e83
-L(P7Q6): mov    %rdx,-0x37(%rdi)
147e83
-L(P7Q5): mov    %rdx,-0x2f(%rdi)
147e83
-L(P7Q4): mov    %rdx,-0x27(%rdi)
147e83
-L(P7Q3): mov    %rdx,-0x1f(%rdi)
147e83
-L(P7Q2): mov    %rdx,-0x17(%rdi)
147e83
-L(P7Q1): mov    %rdx,-0xf(%rdi)
147e83
-L(P7Q0): mov    %edx,-0x7(%rdi)
147e83
-		mov    %dx,-0x3(%rdi)
147e83
-		mov    %dl,-0x1(%rdi)
147e83
-		retq
147e83
-
147e83
-	.balign     16
147e83
-L(ck_mem_ops_method):
147e83
-
147e83
-# align to 16 byte boundary first
147e83
-	#test $0xf,%rdi
147e83
-	#jz L(aligned_now)
147e83
-	mov    $0x10,%r10
147e83
-	mov    %rdi,%r9
147e83
-	and    $0xf,%r9
147e83
-	sub    %r9,%r10
147e83
-	and    $0xf,%r10
147e83
-	add    %r10,%rdi
147e83
-	sub    %r10,%r8
147e83
-#ifndef PIC
147e83
-	lea    L(AliPxQx)(%rip),%r11
147e83
-	jmpq   *(%r11,%r10,8)
147e83
-#else
147e83
-	lea    L(aligned_now)(%rip), %r11
147e83
-	lea    L(AliPxQx)(%rip),%rcx
147e83
-	movswq (%rcx,%r10,2),%rcx
147e83
-	lea    (%rcx,%r11,1),%r11
147e83
-	jmpq   *%r11
147e83
-#endif
147e83
-
147e83
-	.pushsection .rodata
147e83
-	.balign     16
147e83
-#ifndef PIC
147e83
-L(AliPxQx):
147e83
-	.quad       L(aligned_now), L(A1Q0), L(A2Q0), L(A3Q0)
147e83
-	.quad	    L(A4Q0), L(A5Q0), L(A6Q0), L(A7Q0)
147e83
-	.quad       L(A0Q1), L(A1Q1), L(A2Q1), L(A3Q1)
147e83
-	.quad       L(A4Q1), L(A5Q1), L(A6Q1), L(A7Q1)
147e83
-#else
147e83
-L(AliPxQx):
147e83
-	.short     L(aligned_now)-L(aligned_now)
147e83
-	.short     L(A1Q0)-L(aligned_now)
147e83
-	.short     L(A2Q0)-L(aligned_now)
147e83
-	.short     L(A3Q0)-L(aligned_now)
147e83
-	.short     L(A4Q0)-L(aligned_now)
147e83
-	.short     L(A5Q0)-L(aligned_now)
147e83
-	.short     L(A6Q0)-L(aligned_now)
147e83
-	.short     L(A7Q0)-L(aligned_now)
147e83
-
147e83
-	.short     L(A0Q1)-L(aligned_now)
147e83
-	.short     L(A1Q1)-L(aligned_now)
147e83
-	.short     L(A2Q1)-L(aligned_now)
147e83
-	.short     L(A3Q1)-L(aligned_now)
147e83
-	.short     L(A4Q1)-L(aligned_now)
147e83
-	.short     L(A5Q1)-L(aligned_now)
147e83
-	.short     L(A6Q1)-L(aligned_now)
147e83
-	.short     L(A7Q1)-L(aligned_now)
147e83
-#endif
147e83
-	.popsection
147e83
-
147e83
-	.balign     16
147e83
-L(A5Q1):    mov    %dl,-0xd(%rdi)
147e83
-L(A4Q1):    mov    %edx,-0xc(%rdi)
147e83
-L(A0Q1):    mov    %rdx,-0x8(%rdi)
147e83
-L(A0Q0):    jmp     L(aligned_now)
147e83
-
147e83
-	.balign     16
147e83
-L(A1Q1):   mov    %dl,-0x9(%rdi)
147e83
-	mov    %rdx,-0x8(%rdi)
147e83
-	jmp    L(aligned_now)
147e83
-
147e83
-	.balign     16
147e83
-L(A1Q0):   mov    %dl,-0x1(%rdi)
147e83
-	jmp    L(aligned_now)
147e83
-
147e83
-	.balign     16
147e83
-L(A3Q1):    mov    %dl,-0xb(%rdi)
147e83
-L(A2Q1):    mov    %dx,-0xa(%rdi)
147e83
-	mov    %rdx,-0x8(%rdi)
147e83
-	jmp    L(aligned_now)
147e83
-
147e83
-	.balign     16
147e83
-L(A3Q0):    mov    %dl,-0x3(%rdi)
147e83
-L(A2Q0):    mov    %dx,-0x2(%rdi)
147e83
-	jmp    L(aligned_now)
147e83
-
147e83
-	.balign     16
147e83
-L(A5Q0):    mov    %dl,-0x5(%rdi)
147e83
-L(A4Q0):    mov    %edx,-0x4(%rdi)
147e83
-	jmp    L(aligned_now)
147e83
-
147e83
-	.balign     16
147e83
-L(A7Q1):    mov    %dl,-0xf(%rdi)
147e83
-L(A6Q1):    mov    %dx,-0xe(%rdi)
147e83
-	mov    %edx,-0xc(%rdi)
147e83
-	mov    %rdx,-0x8(%rdi)
147e83
-	jmp    L(aligned_now)
147e83
-
147e83
-	.balign     16
147e83
-L(A7Q0):    mov    %dl,-0x7(%rdi)
147e83
-L(A6Q0):    mov    %dx,-0x6(%rdi)
147e83
-	mov    %edx,-0x4(%rdi)
147e83
-
147e83
-#ifndef USE_MULTIARCH
147e83
-	jmp    L(aligned_now)
147e83
-
147e83
-L(SSE_pre):
147e83
-#else
147e83
-L(aligned_now):
147e83
-#endif
147e83
-#if !defined USE_MULTIARCH || defined USE_SSE2
147e83
-	 # fill RegXMM0 with the pattern
147e83
-	 movd   %rdx,%xmm0
147e83
-	 punpcklqdq %xmm0,%xmm0
147e83
-
147e83
-	 cmp    $0xb0,%r8 # 176
147e83
-	 jae    L(byte32sse2_pre)
147e83
-
147e83
-	 add    %r8,%rdi
147e83
-# ifndef PIC
147e83
-	 lea    L(SSExDx)(%rip),%r9
147e83
-	 jmpq   *(%r9,%r8,8)
147e83
-# else
147e83
-	 lea    L(SSE0Q0)(%rip),%r9
147e83
-	 lea    L(SSExDx)(%rip),%rcx
147e83
-	 movswq (%rcx,%r8,2),%rcx
147e83
-	 lea    (%rcx,%r9,1),%r9
147e83
-	 jmpq   *%r9
147e83
-# endif
147e83
-
147e83
-L(SSE0QB):  movdqa %xmm0,-0xb0(%rdi)
147e83
-L(SSE0QA):  movdqa %xmm0,-0xa0(%rdi)
147e83
-L(SSE0Q9):  movdqa %xmm0,-0x90(%rdi)
147e83
-L(SSE0Q8):  movdqa %xmm0,-0x80(%rdi)
147e83
-L(SSE0Q7):  movdqa %xmm0,-0x70(%rdi)
147e83
-L(SSE0Q6):  movdqa %xmm0,-0x60(%rdi)
147e83
-L(SSE0Q5):  movdqa %xmm0,-0x50(%rdi)
147e83
-L(SSE0Q4):  movdqa %xmm0,-0x40(%rdi)
147e83
-L(SSE0Q3):  movdqa %xmm0,-0x30(%rdi)
147e83
-L(SSE0Q2):  movdqa %xmm0,-0x20(%rdi)
147e83
-L(SSE0Q1):  movdqa %xmm0,-0x10(%rdi)
147e83
-L(SSE0Q0):  retq
147e83
-
147e83
-L(SSE1QB):  movdqa %xmm0,-0xb1(%rdi)
147e83
-L(SSE1QA):  movdqa %xmm0,-0xa1(%rdi)
147e83
-L(SSE1Q9):  movdqa %xmm0,-0x91(%rdi)
147e83
-L(SSE1Q8):  movdqa %xmm0,-0x81(%rdi)
147e83
-L(SSE1Q7):  movdqa %xmm0,-0x71(%rdi)
147e83
-L(SSE1Q6):  movdqa %xmm0,-0x61(%rdi)
147e83
-L(SSE1Q5):  movdqa %xmm0,-0x51(%rdi)
147e83
-L(SSE1Q4):  movdqa %xmm0,-0x41(%rdi)
147e83
-L(SSE1Q3):  movdqa %xmm0,-0x31(%rdi)
147e83
-L(SSE1Q2):  movdqa %xmm0,-0x21(%rdi)
147e83
-L(SSE1Q1):  movdqa %xmm0,-0x11(%rdi)
147e83
-L(SSE1Q0):  mov    %dl,-0x1(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE2QB):  movdqa %xmm0,-0xb2(%rdi)
147e83
-L(SSE2QA):  movdqa %xmm0,-0xa2(%rdi)
147e83
-L(SSE2Q9):  movdqa %xmm0,-0x92(%rdi)
147e83
-L(SSE2Q8):  movdqa %xmm0,-0x82(%rdi)
147e83
-L(SSE2Q7):  movdqa %xmm0,-0x72(%rdi)
147e83
-L(SSE2Q6):  movdqa %xmm0,-0x62(%rdi)
147e83
-L(SSE2Q5):  movdqa %xmm0,-0x52(%rdi)
147e83
-L(SSE2Q4):  movdqa %xmm0,-0x42(%rdi)
147e83
-L(SSE2Q3):  movdqa %xmm0,-0x32(%rdi)
147e83
-L(SSE2Q2):  movdqa %xmm0,-0x22(%rdi)
147e83
-L(SSE2Q1):  movdqa %xmm0,-0x12(%rdi)
147e83
-L(SSE2Q0):  mov    %dx,-0x2(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE3QB):  movdqa %xmm0,-0xb3(%rdi)
147e83
-L(SSE3QA):  movdqa %xmm0,-0xa3(%rdi)
147e83
-L(SSE3Q9):  movdqa %xmm0,-0x93(%rdi)
147e83
-L(SSE3Q8):  movdqa %xmm0,-0x83(%rdi)
147e83
-L(SSE3Q7):  movdqa %xmm0,-0x73(%rdi)
147e83
-L(SSE3Q6):  movdqa %xmm0,-0x63(%rdi)
147e83
-L(SSE3Q5):  movdqa %xmm0,-0x53(%rdi)
147e83
-L(SSE3Q4):  movdqa %xmm0,-0x43(%rdi)
147e83
-L(SSE3Q3):  movdqa %xmm0,-0x33(%rdi)
147e83
-L(SSE3Q2):  movdqa %xmm0,-0x23(%rdi)
147e83
-L(SSE3Q1):  movdqa %xmm0,-0x13(%rdi)
147e83
-L(SSE3Q0):  mov    %dx,-0x3(%rdi)
147e83
-	mov    %dl,-0x1(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE4QB):  movdqa %xmm0,-0xb4(%rdi)
147e83
-L(SSE4QA):  movdqa %xmm0,-0xa4(%rdi)
147e83
-L(SSE4Q9):  movdqa %xmm0,-0x94(%rdi)
147e83
-L(SSE4Q8):  movdqa %xmm0,-0x84(%rdi)
147e83
-L(SSE4Q7):  movdqa %xmm0,-0x74(%rdi)
147e83
-L(SSE4Q6):  movdqa %xmm0,-0x64(%rdi)
147e83
-L(SSE4Q5):  movdqa %xmm0,-0x54(%rdi)
147e83
-L(SSE4Q4):  movdqa %xmm0,-0x44(%rdi)
147e83
-L(SSE4Q3):  movdqa %xmm0,-0x34(%rdi)
147e83
-L(SSE4Q2):  movdqa %xmm0,-0x24(%rdi)
147e83
-L(SSE4Q1):  movdqa %xmm0,-0x14(%rdi)
147e83
-L(SSE4Q0):  mov    %edx,-0x4(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE5QB):  movdqa %xmm0,-0xb5(%rdi)
147e83
-L(SSE5QA):  movdqa %xmm0,-0xa5(%rdi)
147e83
-L(SSE5Q9):  movdqa %xmm0,-0x95(%rdi)
147e83
-L(SSE5Q8):  movdqa %xmm0,-0x85(%rdi)
147e83
-L(SSE5Q7):  movdqa %xmm0,-0x75(%rdi)
147e83
-L(SSE5Q6):  movdqa %xmm0,-0x65(%rdi)
147e83
-L(SSE5Q5):  movdqa %xmm0,-0x55(%rdi)
147e83
-L(SSE5Q4):  movdqa %xmm0,-0x45(%rdi)
147e83
-L(SSE5Q3):  movdqa %xmm0,-0x35(%rdi)
147e83
-L(SSE5Q2):  movdqa %xmm0,-0x25(%rdi)
147e83
-L(SSE5Q1):  movdqa %xmm0,-0x15(%rdi)
147e83
-L(SSE5Q0):  mov    %edx,-0x5(%rdi)
147e83
-	mov    %dl,-0x1(%rdi)
147e83
-	retq
147e83
-
147e83
-
147e83
-L(SSE6QB):  movdqa %xmm0,-0xb6(%rdi)
147e83
-L(SSE6QA):  movdqa %xmm0,-0xa6(%rdi)
147e83
-L(SSE6Q9):  movdqa %xmm0,-0x96(%rdi)
147e83
-L(SSE6Q8):  movdqa %xmm0,-0x86(%rdi)
147e83
-L(SSE6Q7):  movdqa %xmm0,-0x76(%rdi)
147e83
-L(SSE6Q6):  movdqa %xmm0,-0x66(%rdi)
147e83
-L(SSE6Q5):  movdqa %xmm0,-0x56(%rdi)
147e83
-L(SSE6Q4):  movdqa %xmm0,-0x46(%rdi)
147e83
-L(SSE6Q3):  movdqa %xmm0,-0x36(%rdi)
147e83
-L(SSE6Q2):  movdqa %xmm0,-0x26(%rdi)
147e83
-L(SSE6Q1):  movdqa %xmm0,-0x16(%rdi)
147e83
-L(SSE6Q0):  mov    %edx,-0x6(%rdi)
147e83
-	mov    %dx,-0x2(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE7QB):  movdqa %xmm0,-0xb7(%rdi)
147e83
-L(SSE7QA):  movdqa %xmm0,-0xa7(%rdi)
147e83
-L(SSE7Q9):  movdqa %xmm0,-0x97(%rdi)
147e83
-L(SSE7Q8):  movdqa %xmm0,-0x87(%rdi)
147e83
-L(SSE7Q7):  movdqa %xmm0,-0x77(%rdi)
147e83
-L(SSE7Q6):  movdqa %xmm0,-0x67(%rdi)
147e83
-L(SSE7Q5):  movdqa %xmm0,-0x57(%rdi)
147e83
-L(SSE7Q4):  movdqa %xmm0,-0x47(%rdi)
147e83
-L(SSE7Q3):  movdqa %xmm0,-0x37(%rdi)
147e83
-L(SSE7Q2):  movdqa %xmm0,-0x27(%rdi)
147e83
-L(SSE7Q1):  movdqa %xmm0,-0x17(%rdi)
147e83
-L(SSE7Q0):  mov    %edx,-0x7(%rdi)
147e83
-	mov    %dx,-0x3(%rdi)
147e83
-	mov    %dl,-0x1(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE8QB):  movdqa %xmm0,-0xb8(%rdi)
147e83
-L(SSE8QA):  movdqa %xmm0,-0xa8(%rdi)
147e83
-L(SSE8Q9):  movdqa %xmm0,-0x98(%rdi)
147e83
-L(SSE8Q8):  movdqa %xmm0,-0x88(%rdi)
147e83
-L(SSE8Q7):  movdqa %xmm0,-0x78(%rdi)
147e83
-L(SSE8Q6):  movdqa %xmm0,-0x68(%rdi)
147e83
-L(SSE8Q5):  movdqa %xmm0,-0x58(%rdi)
147e83
-L(SSE8Q4):  movdqa %xmm0,-0x48(%rdi)
147e83
-L(SSE8Q3):  movdqa %xmm0,-0x38(%rdi)
147e83
-L(SSE8Q2):  movdqa %xmm0,-0x28(%rdi)
147e83
-L(SSE8Q1):  movdqa %xmm0,-0x18(%rdi)
147e83
-L(SSE8Q0):  mov    %rdx,-0x8(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE9QB):  movdqa %xmm0,-0xb9(%rdi)
147e83
-L(SSE9QA):  movdqa %xmm0,-0xa9(%rdi)
147e83
-L(SSE9Q9):  movdqa %xmm0,-0x99(%rdi)
147e83
-L(SSE9Q8):  movdqa %xmm0,-0x89(%rdi)
147e83
-L(SSE9Q7):  movdqa %xmm0,-0x79(%rdi)
147e83
-L(SSE9Q6):  movdqa %xmm0,-0x69(%rdi)
147e83
-L(SSE9Q5):  movdqa %xmm0,-0x59(%rdi)
147e83
-L(SSE9Q4):  movdqa %xmm0,-0x49(%rdi)
147e83
-L(SSE9Q3):  movdqa %xmm0,-0x39(%rdi)
147e83
-L(SSE9Q2):  movdqa %xmm0,-0x29(%rdi)
147e83
-L(SSE9Q1):  movdqa %xmm0,-0x19(%rdi)
147e83
-L(SSE9Q0):  mov    %rdx,-0x9(%rdi)
147e83
-	mov    %dl,-0x1(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE10QB): movdqa %xmm0,-0xba(%rdi)
147e83
-L(SSE10QA): movdqa %xmm0,-0xaa(%rdi)
147e83
-L(SSE10Q9): movdqa %xmm0,-0x9a(%rdi)
147e83
-L(SSE10Q8): movdqa %xmm0,-0x8a(%rdi)
147e83
-L(SSE10Q7): movdqa %xmm0,-0x7a(%rdi)
147e83
-L(SSE10Q6): movdqa %xmm0,-0x6a(%rdi)
147e83
-L(SSE10Q5): movdqa %xmm0,-0x5a(%rdi)
147e83
-L(SSE10Q4): movdqa %xmm0,-0x4a(%rdi)
147e83
-L(SSE10Q3): movdqa %xmm0,-0x3a(%rdi)
147e83
-L(SSE10Q2): movdqa %xmm0,-0x2a(%rdi)
147e83
-L(SSE10Q1): movdqa %xmm0,-0x1a(%rdi)
147e83
-L(SSE10Q0): mov    %rdx,-0xa(%rdi)
147e83
-	mov    %dx,-0x2(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE11QB): movdqa %xmm0,-0xbb(%rdi)
147e83
-L(SSE11QA): movdqa %xmm0,-0xab(%rdi)
147e83
-L(SSE11Q9): movdqa %xmm0,-0x9b(%rdi)
147e83
-L(SSE11Q8): movdqa %xmm0,-0x8b(%rdi)
147e83
-L(SSE11Q7): movdqa %xmm0,-0x7b(%rdi)
147e83
-L(SSE11Q6): movdqa %xmm0,-0x6b(%rdi)
147e83
-L(SSE11Q5): movdqa %xmm0,-0x5b(%rdi)
147e83
-L(SSE11Q4): movdqa %xmm0,-0x4b(%rdi)
147e83
-L(SSE11Q3): movdqa %xmm0,-0x3b(%rdi)
147e83
-L(SSE11Q2): movdqa %xmm0,-0x2b(%rdi)
147e83
-L(SSE11Q1): movdqa %xmm0,-0x1b(%rdi)
147e83
-L(SSE11Q0): mov    %rdx,-0xb(%rdi)
147e83
-	mov    %dx,-0x3(%rdi)
147e83
-	mov    %dl,-0x1(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE12QB): movdqa %xmm0,-0xbc(%rdi)
147e83
-L(SSE12QA): movdqa %xmm0,-0xac(%rdi)
147e83
-L(SSE12Q9): movdqa %xmm0,-0x9c(%rdi)
147e83
-L(SSE12Q8): movdqa %xmm0,-0x8c(%rdi)
147e83
-L(SSE12Q7): movdqa %xmm0,-0x7c(%rdi)
147e83
-L(SSE12Q6): movdqa %xmm0,-0x6c(%rdi)
147e83
-L(SSE12Q5): movdqa %xmm0,-0x5c(%rdi)
147e83
-L(SSE12Q4): movdqa %xmm0,-0x4c(%rdi)
147e83
-L(SSE12Q3): movdqa %xmm0,-0x3c(%rdi)
147e83
-L(SSE12Q2): movdqa %xmm0,-0x2c(%rdi)
147e83
-L(SSE12Q1): movdqa %xmm0,-0x1c(%rdi)
147e83
-L(SSE12Q0): mov    %rdx,-0xc(%rdi)
147e83
-	mov    %edx,-0x4(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE13QB): movdqa %xmm0,-0xbd(%rdi)
147e83
-L(SSE13QA): movdqa %xmm0,-0xad(%rdi)
147e83
-L(SSE13Q9): movdqa %xmm0,-0x9d(%rdi)
147e83
-L(SSE13Q8): movdqa %xmm0,-0x8d(%rdi)
147e83
-L(SSE13Q7): movdqa %xmm0,-0x7d(%rdi)
147e83
-L(SSE13Q6): movdqa %xmm0,-0x6d(%rdi)
147e83
-L(SSE13Q5): movdqa %xmm0,-0x5d(%rdi)
147e83
-L(SSE13Q4): movdqa %xmm0,-0x4d(%rdi)
147e83
-L(SSE13Q3): movdqa %xmm0,-0x3d(%rdi)
147e83
-L(SSE13Q2): movdqa %xmm0,-0x2d(%rdi)
147e83
-L(SSE13Q1): movdqa %xmm0,-0x1d(%rdi)
147e83
-L(SSE13Q0): mov    %rdx,-0xd(%rdi)
147e83
-	mov    %edx,-0x5(%rdi)
147e83
-	mov    %dl,-0x1(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE14QB): movdqa %xmm0,-0xbe(%rdi)
147e83
-L(SSE14QA): movdqa %xmm0,-0xae(%rdi)
147e83
-L(SSE14Q9): movdqa %xmm0,-0x9e(%rdi)
147e83
-L(SSE14Q8): movdqa %xmm0,-0x8e(%rdi)
147e83
-L(SSE14Q7): movdqa %xmm0,-0x7e(%rdi)
147e83
-L(SSE14Q6): movdqa %xmm0,-0x6e(%rdi)
147e83
-L(SSE14Q5): movdqa %xmm0,-0x5e(%rdi)
147e83
-L(SSE14Q4): movdqa %xmm0,-0x4e(%rdi)
147e83
-L(SSE14Q3): movdqa %xmm0,-0x3e(%rdi)
147e83
-L(SSE14Q2): movdqa %xmm0,-0x2e(%rdi)
147e83
-L(SSE14Q1): movdqa %xmm0,-0x1e(%rdi)
147e83
-L(SSE14Q0): mov    %rdx,-0xe(%rdi)
147e83
-	mov    %edx,-0x6(%rdi)
147e83
-	mov    %dx,-0x2(%rdi)
147e83
-	retq
147e83
-
147e83
-L(SSE15QB): movdqa %xmm0,-0xbf(%rdi)
147e83
-L(SSE15QA): movdqa %xmm0,-0xaf(%rdi)
147e83
-L(SSE15Q9): movdqa %xmm0,-0x9f(%rdi)
147e83
-L(SSE15Q8): movdqa %xmm0,-0x8f(%rdi)
147e83
-L(SSE15Q7): movdqa %xmm0,-0x7f(%rdi)
147e83
-L(SSE15Q6): movdqa %xmm0,-0x6f(%rdi)
147e83
-L(SSE15Q5): movdqa %xmm0,-0x5f(%rdi)
147e83
-L(SSE15Q4): movdqa %xmm0,-0x4f(%rdi)
147e83
-L(SSE15Q3): movdqa %xmm0,-0x3f(%rdi)
147e83
-L(SSE15Q2): movdqa %xmm0,-0x2f(%rdi)
147e83
-L(SSE15Q1): movdqa %xmm0,-0x1f(%rdi)
147e83
-L(SSE15Q0): mov    %rdx,-0xf(%rdi)
147e83
-	mov    %edx,-0x7(%rdi)
147e83
-	mov    %dx,-0x3(%rdi)
147e83
-	mov    %dl,-0x1(%rdi)
147e83
-	retq
147e83
-
147e83
-	.balign     16
147e83
-L(byte32sse2_pre):
147e83
-
147e83
-	mov    __x86_64_shared_cache_size(%rip),%r9d  # The largest cache size
147e83
-	cmp    %r9,%r8
147e83
-	ja     L(sse2_nt_move_pre)
147e83
-	#jmp    L(byte32sse2)
147e83
-	.balign     16
147e83
-L(byte32sse2):
147e83
-	lea    -0x80(%r8),%r8 # 128
147e83
-	cmp    $0x80,%r8   # 128
147e83
-	movdqa %xmm0,(%rdi)
147e83
-	movdqa %xmm0,0x10(%rdi)
147e83
-	movdqa %xmm0,0x20(%rdi)
147e83
-	movdqa %xmm0,0x30(%rdi)
147e83
-	movdqa %xmm0,0x40(%rdi)
147e83
-	movdqa %xmm0,0x50(%rdi)
147e83
-	movdqa %xmm0,0x60(%rdi)
147e83
-	movdqa %xmm0,0x70(%rdi)
147e83
-
147e83
-	lea    0x80(%rdi),%rdi
147e83
-	jae    L(byte32sse2)
147e83
-	add    %r8,%rdi
147e83
-# ifndef PIC
147e83
-	lea    L(SSExDx)(%rip),%r11
147e83
-	jmpq   *(%r11,%r8,8)
147e83
-# else
147e83
-	lea    L(SSE0Q0)(%rip),%r11
147e83
-	lea    L(SSExDx)(%rip),%rcx
147e83
-	movswq (%rcx,%r8,2),%rcx
147e83
-	lea    (%rcx,%r11,1),%r11
147e83
-	jmpq   *%r11
147e83
-# endif
147e83
-
147e83
-	.balign     16
147e83
-L(sse2_nt_move_pre):
147e83
-	cmp    $0x0,%r9
147e83
-	je     L(byte32sse2)
147e83
-	jmp    L(sse2_nt_move)
147e83
-
147e83
-	.balign     16
147e83
-L(sse2_nt_move):
147e83
-	lea    -0x80(%r8),%r8
147e83
-	cmp    $0x80,%r8
147e83
-
147e83
-	movntdq %xmm0,(%rdi)
147e83
-	movntdq %xmm0,0x10(%rdi)
147e83
-	movntdq %xmm0,0x20(%rdi)
147e83
-	movntdq %xmm0,0x30(%rdi)
147e83
-	movntdq %xmm0,0x40(%rdi)
147e83
-	movntdq %xmm0,0x50(%rdi)
147e83
-	movntdq %xmm0,0x60(%rdi)
147e83
-	movntdq %xmm0,0x70(%rdi)
147e83
-
147e83
-	lea    0x80(%rdi),%rdi
147e83
-	jae    L(sse2_nt_move)
147e83
-	sfence
147e83
-	add    %r8,%rdi
147e83
-# ifndef PIC
147e83
-	lea    L(SSExDx)(%rip),%r11
147e83
-	jmpq   *(%r11,%r8,8)
147e83
-# else
147e83
-	lea    L(SSE0Q0)(%rip),%r11
147e83
-	lea    L(SSExDx)(%rip),%rcx
147e83
-	movswq (%rcx,%r8,2),%rcx
147e83
-	lea   (%rcx,%r11,1),%r11
147e83
-	jmpq   *%r11
147e83
-# endif
147e83
-
147e83
-	.pushsection .rodata
147e83
-	.balign     16
147e83
-# ifndef PIC
147e83
-L(SSExDx):
147e83
-	.quad       L(SSE0Q0), L(SSE1Q0), L(SSE2Q0), L(SSE3Q0)
147e83
-	.quad       L(SSE4Q0), L(SSE5Q0), L(SSE6Q0), L(SSE7Q0)
147e83
-	.quad       L(SSE8Q0), L(SSE9Q0), L(SSE10Q0), L(SSE11Q0)
147e83
-	.quad       L(SSE12Q0), L(SSE13Q0), L(SSE14Q0), L(SSE15Q0)
147e83
-	.quad       L(SSE0Q1), L(SSE1Q1), L(SSE2Q1), L(SSE3Q1)
147e83
-	.quad       L(SSE4Q1), L(SSE5Q1), L(SSE6Q1), L(SSE7Q1)
147e83
-	.quad       L(SSE8Q1), L(SSE9Q1), L(SSE10Q1), L(SSE11Q1)
147e83
-	.quad       L(SSE12Q1), L(SSE13Q1), L(SSE14Q1), L(SSE15Q1)
147e83
-	.quad       L(SSE0Q2), L(SSE1Q2), L(SSE2Q2), L(SSE3Q2)
147e83
-	.quad       L(SSE4Q2), L(SSE5Q2), L(SSE6Q2), L(SSE7Q2)
147e83
-	.quad       L(SSE8Q2), L(SSE9Q2), L(SSE10Q2), L(SSE11Q2)
147e83
-	.quad       L(SSE12Q2), L(SSE13Q2), L(SSE14Q2), L(SSE15Q2)
147e83
-	.quad       L(SSE0Q3), L(SSE1Q3), L(SSE2Q3), L(SSE3Q3)
147e83
-	.quad       L(SSE4Q3), L(SSE5Q3), L(SSE6Q3), L(SSE7Q3)
147e83
-	.quad       L(SSE8Q3), L(SSE9Q3), L(SSE10Q3), L(SSE11Q3)
147e83
-	.quad       L(SSE12Q3), L(SSE13Q3), L(SSE14Q3), L(SSE15Q3)
147e83
-	.quad       L(SSE0Q4), L(SSE1Q4), L(SSE2Q4), L(SSE3Q4)
147e83
-	.quad       L(SSE4Q4), L(SSE5Q4), L(SSE6Q4), L(SSE7Q4)
147e83
-	.quad       L(SSE8Q4), L(SSE9Q4), L(SSE10Q4), L(SSE11Q4)
147e83
-	.quad       L(SSE12Q4), L(SSE13Q4), L(SSE14Q4), L(SSE15Q4)
147e83
-	.quad       L(SSE0Q5), L(SSE1Q5), L(SSE2Q5), L(SSE3Q5)
147e83
-	.quad       L(SSE4Q5), L(SSE5Q5), L(SSE6Q5), L(SSE7Q5)
147e83
-	.quad       L(SSE8Q5), L(SSE9Q5), L(SSE10Q5), L(SSE11Q5)
147e83
-	.quad       L(SSE12Q5), L(SSE13Q5), L(SSE14Q5), L(SSE15Q5)
147e83
-	.quad       L(SSE0Q6), L(SSE1Q6), L(SSE2Q6), L(SSE3Q6)
147e83
-	.quad       L(SSE4Q6), L(SSE5Q6), L(SSE6Q6), L(SSE7Q6)
147e83
-	.quad       L(SSE8Q6), L(SSE9Q6), L(SSE10Q6), L(SSE11Q6)
147e83
-	.quad       L(SSE12Q6), L(SSE13Q6), L(SSE14Q6), L(SSE15Q6)
147e83
-	.quad       L(SSE0Q7), L(SSE1Q7), L(SSE2Q7), L(SSE3Q7)
147e83
-	.quad       L(SSE4Q7), L(SSE5Q7), L(SSE6Q7), L(SSE7Q7)
147e83
-	.quad       L(SSE8Q7), L(SSE9Q7), L(SSE10Q7), L(SSE11Q7)
147e83
-	.quad       L(SSE12Q7), L(SSE13Q7), L(SSE14Q7), L(SSE15Q7)
147e83
-	.quad       L(SSE0Q8), L(SSE1Q8), L(SSE2Q8), L(SSE3Q8)
147e83
-	.quad       L(SSE4Q8), L(SSE5Q8), L(SSE6Q8), L(SSE7Q8)
147e83
-	.quad       L(SSE8Q8), L(SSE9Q8), L(SSE10Q8), L(SSE11Q8)
147e83
-	.quad       L(SSE12Q8), L(SSE13Q8), L(SSE14Q8), L(SSE15Q8)
147e83
-	.quad       L(SSE0Q9), L(SSE1Q9), L(SSE2Q9), L(SSE3Q9)
147e83
-	.quad       L(SSE4Q9), L(SSE5Q9), L(SSE6Q9), L(SSE7Q9)
147e83
-	.quad       L(SSE8Q9), L(SSE9Q9), L(SSE10Q9), L(SSE11Q9)
147e83
-	.quad       L(SSE12Q9), L(SSE13Q9), L(SSE14Q9), L(SSE15Q9)
147e83
-	.quad       L(SSE0QA), L(SSE1QA), L(SSE2QA), L(SSE3QA)
147e83
-	.quad       L(SSE4QA), L(SSE5QA), L(SSE6QA), L(SSE7QA)
147e83
-	.quad       L(SSE8QA), L(SSE9QA), L(SSE10QA), L(SSE11QA)
147e83
-	.quad       L(SSE12QA), L(SSE13QA), L(SSE14QA), L(SSE15QA)
147e83
-	.quad       L(SSE0QB), L(SSE1QB), L(SSE2QB), L(SSE3QB)
147e83
-	.quad       L(SSE4QB), L(SSE5QB), L(SSE6QB), L(SSE7QB)
147e83
-	.quad       L(SSE8QB), L(SSE9QB), L(SSE10QB), L(SSE11QB)
147e83
-	.quad       L(SSE12QB), L(SSE13QB), L(SSE14QB), L(SSE15QB)
147e83
-# else
147e83
-L(SSExDx):
147e83
-	.short     L(SSE0Q0) -L(SSE0Q0)
147e83
-	.short     L(SSE1Q0) -L(SSE0Q0)
147e83
-	.short     L(SSE2Q0) -L(SSE0Q0)
147e83
-	.short     L(SSE3Q0) -L(SSE0Q0)
147e83
-	.short     L(SSE4Q0) -L(SSE0Q0)
147e83
-	.short     L(SSE5Q0) -L(SSE0Q0)
147e83
-	.short     L(SSE6Q0) -L(SSE0Q0)
147e83
-	.short     L(SSE7Q0) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8Q0) -L(SSE0Q0)
147e83
-	.short     L(SSE9Q0) -L(SSE0Q0)
147e83
-	.short     L(SSE10Q0)-L(SSE0Q0)
147e83
-	.short     L(SSE11Q0)-L(SSE0Q0)
147e83
-	.short     L(SSE12Q0)-L(SSE0Q0)
147e83
-	.short     L(SSE13Q0)-L(SSE0Q0)
147e83
-	.short     L(SSE14Q0)-L(SSE0Q0)
147e83
-	.short     L(SSE15Q0)-L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE0Q1) -L(SSE0Q0)
147e83
-	.short     L(SSE1Q1) -L(SSE0Q0)
147e83
-	.short     L(SSE2Q1) -L(SSE0Q0)
147e83
-	.short     L(SSE3Q1) -L(SSE0Q0)
147e83
-	.short     L(SSE4Q1) -L(SSE0Q0)
147e83
-	.short     L(SSE5Q1) -L(SSE0Q0)
147e83
-	.short     L(SSE6Q1) -L(SSE0Q0)
147e83
-	.short     L(SSE7Q1) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8Q1) -L(SSE0Q0)
147e83
-	.short     L(SSE9Q1) -L(SSE0Q0)
147e83
-	.short     L(SSE10Q1)-L(SSE0Q0)
147e83
-	.short     L(SSE11Q1)-L(SSE0Q0)
147e83
-	.short     L(SSE12Q1)-L(SSE0Q0)
147e83
-	.short     L(SSE13Q1)-L(SSE0Q0)
147e83
-	.short     L(SSE14Q1)-L(SSE0Q0)
147e83
-	.short     L(SSE15Q1)-L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE0Q2) -L(SSE0Q0)
147e83
-	.short     L(SSE1Q2) -L(SSE0Q0)
147e83
-	.short     L(SSE2Q2) -L(SSE0Q0)
147e83
-	.short     L(SSE3Q2) -L(SSE0Q0)
147e83
-	.short     L(SSE4Q2) -L(SSE0Q0)
147e83
-	.short     L(SSE5Q2) -L(SSE0Q0)
147e83
-	.short     L(SSE6Q2) -L(SSE0Q0)
147e83
-	.short     L(SSE7Q2) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8Q2) -L(SSE0Q0)
147e83
-	.short     L(SSE9Q2) -L(SSE0Q0)
147e83
-	.short     L(SSE10Q2)-L(SSE0Q0)
147e83
-	.short     L(SSE11Q2)-L(SSE0Q0)
147e83
-	.short     L(SSE12Q2)-L(SSE0Q0)
147e83
-	.short     L(SSE13Q2)-L(SSE0Q0)
147e83
-	.short     L(SSE14Q2)-L(SSE0Q0)
147e83
-	.short     L(SSE15Q2)-L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE0Q3) -L(SSE0Q0)
147e83
-	.short     L(SSE1Q3) -L(SSE0Q0)
147e83
-	.short     L(SSE2Q3) -L(SSE0Q0)
147e83
-	.short     L(SSE3Q3) -L(SSE0Q0)
147e83
-	.short     L(SSE4Q3) -L(SSE0Q0)
147e83
-	.short     L(SSE5Q3) -L(SSE0Q0)
147e83
-	.short     L(SSE6Q3) -L(SSE0Q0)
147e83
-	.short     L(SSE7Q3) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8Q3) -L(SSE0Q0)
147e83
-	.short     L(SSE9Q3) -L(SSE0Q0)
147e83
-	.short     L(SSE10Q3)-L(SSE0Q0)
147e83
-	.short     L(SSE11Q3)-L(SSE0Q0)
147e83
-	.short     L(SSE12Q3)-L(SSE0Q0)
147e83
-	.short     L(SSE13Q3)-L(SSE0Q0)
147e83
-	.short     L(SSE14Q3)-L(SSE0Q0)
147e83
-	.short     L(SSE15Q3)-L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE0Q4) -L(SSE0Q0)
147e83
-	.short     L(SSE1Q4) -L(SSE0Q0)
147e83
-	.short     L(SSE2Q4) -L(SSE0Q0)
147e83
-	.short     L(SSE3Q4) -L(SSE0Q0)
147e83
-	.short     L(SSE4Q4) -L(SSE0Q0)
147e83
-	.short     L(SSE5Q4) -L(SSE0Q0)
147e83
-	.short     L(SSE6Q4) -L(SSE0Q0)
147e83
-	.short     L(SSE7Q4) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8Q4) -L(SSE0Q0)
147e83
-	.short     L(SSE9Q4) -L(SSE0Q0)
147e83
-	.short     L(SSE10Q4)-L(SSE0Q0)
147e83
-	.short     L(SSE11Q4)-L(SSE0Q0)
147e83
-	.short     L(SSE12Q4)-L(SSE0Q0)
147e83
-	.short     L(SSE13Q4)-L(SSE0Q0)
147e83
-	.short     L(SSE14Q4)-L(SSE0Q0)
147e83
-	.short     L(SSE15Q4)-L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE0Q5) -L(SSE0Q0)
147e83
-	.short     L(SSE1Q5) -L(SSE0Q0)
147e83
-	.short     L(SSE2Q5) -L(SSE0Q0)
147e83
-	.short     L(SSE3Q5) -L(SSE0Q0)
147e83
-	.short     L(SSE4Q5) -L(SSE0Q0)
147e83
-	.short     L(SSE5Q5) -L(SSE0Q0)
147e83
-	.short     L(SSE6Q5) -L(SSE0Q0)
147e83
-	.short     L(SSE7Q5) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8Q5) -L(SSE0Q0)
147e83
-	.short     L(SSE9Q5) -L(SSE0Q0)
147e83
-	.short     L(SSE10Q5)-L(SSE0Q0)
147e83
-	.short     L(SSE11Q5)-L(SSE0Q0)
147e83
-	.short     L(SSE12Q5)-L(SSE0Q0)
147e83
-	.short     L(SSE13Q5)-L(SSE0Q0)
147e83
-	.short     L(SSE14Q5)-L(SSE0Q0)
147e83
-	.short     L(SSE15Q5)-L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE0Q6) -L(SSE0Q0)
147e83
-	.short     L(SSE1Q6) -L(SSE0Q0)
147e83
-	.short     L(SSE2Q6) -L(SSE0Q0)
147e83
-	.short     L(SSE3Q6) -L(SSE0Q0)
147e83
-	.short     L(SSE4Q6) -L(SSE0Q0)
147e83
-	.short     L(SSE5Q6) -L(SSE0Q0)
147e83
-	.short     L(SSE6Q6) -L(SSE0Q0)
147e83
-	.short     L(SSE7Q6) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8Q6) -L(SSE0Q0)
147e83
-	.short     L(SSE9Q6) -L(SSE0Q0)
147e83
-	.short     L(SSE10Q6)-L(SSE0Q0)
147e83
-	.short     L(SSE11Q6)-L(SSE0Q0)
147e83
-	.short     L(SSE12Q6)-L(SSE0Q0)
147e83
-	.short     L(SSE13Q6)-L(SSE0Q0)
147e83
-	.short     L(SSE14Q6)-L(SSE0Q0)
147e83
-	.short     L(SSE15Q6)-L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE0Q7) -L(SSE0Q0)
147e83
-	.short     L(SSE1Q7) -L(SSE0Q0)
147e83
-	.short     L(SSE2Q7) -L(SSE0Q0)
147e83
-	.short     L(SSE3Q7) -L(SSE0Q0)
147e83
-	.short     L(SSE4Q7) -L(SSE0Q0)
147e83
-	.short     L(SSE5Q7) -L(SSE0Q0)
147e83
-	.short     L(SSE6Q7) -L(SSE0Q0)
147e83
-	.short     L(SSE7Q7) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8Q7) -L(SSE0Q0)
147e83
-	.short     L(SSE9Q7) -L(SSE0Q0)
147e83
-	.short     L(SSE10Q7)-L(SSE0Q0)
147e83
-	.short     L(SSE11Q7)-L(SSE0Q0)
147e83
-	.short     L(SSE12Q7)-L(SSE0Q0)
147e83
-	.short     L(SSE13Q7)-L(SSE0Q0)
147e83
-	.short     L(SSE14Q7)-L(SSE0Q0)
147e83
-	.short     L(SSE15Q7)-L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE0Q8) -L(SSE0Q0)
147e83
-	.short     L(SSE1Q8) -L(SSE0Q0)
147e83
-	.short     L(SSE2Q8) -L(SSE0Q0)
147e83
-	.short     L(SSE3Q8) -L(SSE0Q0)
147e83
-	.short     L(SSE4Q8) -L(SSE0Q0)
147e83
-	.short     L(SSE5Q8) -L(SSE0Q0)
147e83
-	.short     L(SSE6Q8) -L(SSE0Q0)
147e83
-	.short     L(SSE7Q8) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8Q8) -L(SSE0Q0)
147e83
-	.short     L(SSE9Q8) -L(SSE0Q0)
147e83
-	.short     L(SSE10Q8)-L(SSE0Q0)
147e83
-	.short     L(SSE11Q8)-L(SSE0Q0)
147e83
-	.short     L(SSE12Q8)-L(SSE0Q0)
147e83
-	.short     L(SSE13Q8)-L(SSE0Q0)
147e83
-	.short     L(SSE14Q8)-L(SSE0Q0)
147e83
-	.short     L(SSE15Q8)-L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE0Q9) -L(SSE0Q0)
147e83
-	.short     L(SSE1Q9) -L(SSE0Q0)
147e83
-	.short     L(SSE2Q9) -L(SSE0Q0)
147e83
-	.short     L(SSE3Q9) -L(SSE0Q0)
147e83
-	.short     L(SSE4Q9) -L(SSE0Q0)
147e83
-	.short     L(SSE5Q9) -L(SSE0Q0)
147e83
-	.short     L(SSE6Q9) -L(SSE0Q0)
147e83
-	.short     L(SSE7Q9) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8Q9) -L(SSE0Q0)
147e83
-	.short     L(SSE9Q9) -L(SSE0Q0)
147e83
-	.short     L(SSE10Q9)-L(SSE0Q0)
147e83
-	.short     L(SSE11Q9)-L(SSE0Q0)
147e83
-	.short     L(SSE12Q9)-L(SSE0Q0)
147e83
-	.short     L(SSE13Q9)-L(SSE0Q0)
147e83
-	.short     L(SSE14Q9)-L(SSE0Q0)
147e83
-	.short     L(SSE15Q9)-L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE0QA) -L(SSE0Q0)
147e83
-	.short     L(SSE1QA) -L(SSE0Q0)
147e83
-	.short     L(SSE2QA) -L(SSE0Q0)
147e83
-	.short     L(SSE3QA) -L(SSE0Q0)
147e83
-	.short     L(SSE4QA) -L(SSE0Q0)
147e83
-	.short     L(SSE5QA) -L(SSE0Q0)
147e83
-	.short     L(SSE6QA) -L(SSE0Q0)
147e83
-	.short     L(SSE7QA) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8QA) -L(SSE0Q0)
147e83
-	.short     L(SSE9QA) -L(SSE0Q0)
147e83
-	.short     L(SSE10QA)-L(SSE0Q0)
147e83
-	.short     L(SSE11QA)-L(SSE0Q0)
147e83
-	.short     L(SSE12QA)-L(SSE0Q0)
147e83
-	.short     L(SSE13QA)-L(SSE0Q0)
147e83
-	.short     L(SSE14QA)-L(SSE0Q0)
147e83
-	.short     L(SSE15QA)-L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE0QB) -L(SSE0Q0)
147e83
-	.short     L(SSE1QB) -L(SSE0Q0)
147e83
-	.short     L(SSE2QB) -L(SSE0Q0)
147e83
-	.short     L(SSE3QB) -L(SSE0Q0)
147e83
-	.short     L(SSE4QB) -L(SSE0Q0)
147e83
-	.short     L(SSE5QB) -L(SSE0Q0)
147e83
-	.short     L(SSE6QB) -L(SSE0Q0)
147e83
-	.short     L(SSE7QB) -L(SSE0Q0)
147e83
-
147e83
-	.short     L(SSE8QB) -L(SSE0Q0)
147e83
-	.short     L(SSE9QB) -L(SSE0Q0)
147e83
-	.short     L(SSE10QB)-L(SSE0Q0)
147e83
-	.short     L(SSE11QB)-L(SSE0Q0)
147e83
-	.short     L(SSE12QB)-L(SSE0Q0)
147e83
-	.short     L(SSE13QB)-L(SSE0Q0)
147e83
-	.short     L(SSE14QB)-L(SSE0Q0)
147e83
-	.short     L(SSE15QB)-L(SSE0Q0)
147e83
-# endif
147e83
-	.popsection
147e83
-#endif /* !defined USE_MULTIARCH || defined USE_SSE2  */
147e83
-
147e83
-	.balign     16
147e83
-#ifndef USE_MULTIARCH
147e83
-L(aligned_now):
147e83
-
147e83
-	 cmpl   $0x1,__x86_64_preferred_memory_instruction(%rip)
147e83
-	 jg     L(SSE_pre)
147e83
-#endif /* USE_MULTIARCH */
147e83
-
147e83
-L(8byte_move_try):
147e83
-	cmpq	__STOS_LOWER_BOUNDARY,%r8
147e83
-	jae	L(8byte_stos_try)
147e83
-
147e83
-	.balign     16
147e83
-L(8byte_move):
147e83
-	movq	%r8,%rcx
147e83
-	shrq	$7,%rcx
147e83
-	jz	L(8byte_move_skip)
147e83
-
147e83
-	.p2align 4
147e83
-
147e83
-L(8byte_move_loop):
147e83
-	decq	%rcx
147e83
-
147e83
-	movq	%rdx,    (%rdi)
147e83
-	movq	%rdx,  8 (%rdi)
147e83
-	movq	%rdx, 16 (%rdi)
147e83
-	movq	%rdx, 24 (%rdi)
147e83
-	movq	%rdx, 32 (%rdi)
147e83
-	movq	%rdx, 40 (%rdi)
147e83
-	movq	%rdx, 48 (%rdi)
147e83
-	movq	%rdx, 56 (%rdi)
147e83
-	movq	%rdx, 64 (%rdi)
147e83
-	movq	%rdx, 72 (%rdi)
147e83
-	movq	%rdx, 80 (%rdi)
147e83
-	movq	%rdx, 88 (%rdi)
147e83
-	movq	%rdx, 96 (%rdi)
147e83
-	movq	%rdx, 104 (%rdi)
147e83
-	movq	%rdx, 112 (%rdi)
147e83
-	movq	%rdx, 120 (%rdi)
147e83
-
147e83
-	leaq	128 (%rdi),%rdi
147e83
-
147e83
-	jnz     L(8byte_move_loop)
147e83
-
147e83
-L(8byte_move_skip):
147e83
-	andl	$127,%r8d
147e83
-	lea	(%rdi,%r8,1),%rdi
147e83
-
147e83
-#ifndef PIC
147e83
-	lea	L(setPxQx)(%rip),%r11
147e83
-	jmpq	*(%r11,%r8,8) # old scheme remained for nonPIC
147e83
-#else
147e83
-	lea	L(Got0)(%rip),%r11
147e83
-	lea	L(setPxQx)(%rip),%rcx
147e83
-	movswq	(%rcx,%r8,2),%rcx
147e83
-	lea	(%rcx,%r11,1),%r11
147e83
-	jmpq	*%r11
147e83
-#endif
147e83
-
147e83
-	.balign     16
147e83
-L(8byte_stos_try):
147e83
-	mov    __x86_64_shared_cache_size(%rip),%r9d // ck largest cache size
147e83
-	cmpq	%r8,%r9		// calculate the lesser of remaining
147e83
-	cmovaq	%r8,%r9		// bytes and largest cache size
147e83
-	jbe	L(8byte_stos)
147e83
-
147e83
-L(8byte_move_reuse_try):
147e83
-	cmp	__STOS_UPPER_BOUNDARY,%r8
147e83
-	jae	L(8byte_move)
147e83
-
147e83
-	.balign     16
147e83
-L(8byte_stos):
147e83
-	movq	%r9,%rcx
147e83
-	andq	$-8,%r9
147e83
-
147e83
-	shrq	$3,%rcx
147e83
-	jz	L(8byte_stos_skip)
147e83
-
147e83
-	xchgq	%rax,%rdx
147e83
 
147e83
+ENTRY (memset)	/* In: %rdi = dest, %esi = fill byte, %rdx = n.  Out: %rax = dest.  */
147e83
+	movd	%esi, %xmm8	/* Fill value into the low dword of %xmm8.  */
147e83
+	movq	%rdi, %rax	/* Return value is the destination.  */
147e83
+	punpcklbw	%xmm8, %xmm8	/* Double the low byte into a word.  */
147e83
+	punpcklwd	%xmm8, %xmm8	/* Double that word into a dword.  */
147e83
+	pshufd	$0, %xmm8, %xmm8	/* Broadcast dword 0 to all 16 bytes.  */
147e83
+L(entry_from_bzero):	/* __bzero jumps here with %xmm8 zeroed and %rdx = n.  */
147e83
+	cmpq	$64, %rdx
147e83
+	ja	L(loop_start)	/* n > 64.  */
147e83
+	cmpq	$16, %rdx
147e83
+	jbe	L(less_16_bytes)	/* n <= 16.  */
147e83
+	cmpq	$32, %rdx	/* Consumed by the ja below; movdqu does not touch flags.  */
147e83
+	movdqu	%xmm8, (%rdi)	/* Bytes [0, 16).  */
147e83
+	movdqu	%xmm8, -16(%rdi,%rdx)	/* Bytes [n-16, n); may overlap the store above.  */
147e83
+	ja	L(between_32_64_bytes)	/* 32 < n <= 64.  */
147e83
+L(return):
147e83
 	rep
147e83
-	stosq
147e83
-
147e83
-	xchgq	%rax,%rdx
147e83
-
147e83
-L(8byte_stos_skip):
147e83
-	subq	%r9,%r8
147e83
-	ja	L(8byte_nt_move)
147e83
-
147e83
-	andl	$7,%r8d
147e83
-	lea	(%rdi,%r8,1),%rdi
147e83
-#ifndef PIC
147e83
-	lea	L(setPxQx)(%rip),%r11
147e83
-	jmpq	*(%r11,%r8,8) # old scheme remained for nonPIC
147e83
-#else
147e83
-	lea	L(Got0)(%rip),%r11
147e83
-	lea     L(setPxQx)(%rip),%rcx
147e83
-	movswq	(%rcx,%r8,2),%rcx
147e83
-	lea	(%rcx,%r11,1),%r11
147e83
-	jmpq	*%r11
147e83
-#endif
147e83
-
147e83
-	.balign     16
147e83
-L(8byte_nt_move):
147e83
-	movq	%r8,%rcx
147e83
-	shrq	$7,%rcx
147e83
-	jz      L(8byte_nt_move_skip)
147e83
-
147e83
-	.balign     16
147e83
-L(8byte_nt_move_loop):
147e83
-	decq	%rcx
147e83
+	ret	/* Pairs with the rep above as "rep ret"; presumably the AMD branch-target workaround -- confirm.  */
147e83
 
147e83
-	movntiq	%rdx,     (%rdi)
147e83
-	movntiq	%rdx,   8 (%rdi)
147e83
-	movntiq	%rdx,  16 (%rdi)
147e83
-	movntiq	%rdx,  24 (%rdi)
147e83
-	movntiq	%rdx,  32 (%rdi)
147e83
-	movntiq	%rdx,  40 (%rdi)
147e83
-	movntiq	%rdx,  48 (%rdi)
147e83
-	movntiq	%rdx,  56 (%rdi)
147e83
-	movntiq	%rdx,  64 (%rdi)
147e83
-	movntiq	%rdx,  72 (%rdi)
147e83
-	movntiq	%rdx,  80 (%rdi)
147e83
-	movntiq	%rdx,  88 (%rdi)
147e83
-	movntiq	%rdx,  96 (%rdi)
147e83
-	movntiq	%rdx, 104 (%rdi)
147e83
-	movntiq	%rdx, 112 (%rdi)
147e83
-	movntiq	%rdx, 120 (%rdi)
147e83
-
147e83
-	leaq	128 (%rdi),%rdi
147e83
-
147e83
-	jnz     L(8byte_nt_move_loop)
147e83
-
147e83
-	sfence
147e83
-
147e83
-L(8byte_nt_move_skip):
147e83
-	andl	$127,%r8d
147e83
-
147e83
-	lea	(%rdi,%r8,1),%rdi
147e83
-#ifndef PIC
147e83
-	lea	L(setPxQx)(%rip),%r11
147e83
-	jmpq	*(%r11,%r8,8) # old scheme remained for nonPIC
147e83
-#else
147e83
-	lea	L(Got0)(%rip),%r11
147e83
-	lea     L(setPxQx)(%rip),%rcx
147e83
-	movswq	(%rcx,%r8,2),%rcx
147e83
-	lea	(%rcx,%r11,1),%r11
147e83
-	jmpq	*%r11
147e83
-#endif
147e83
+	ALIGN (4)
147e83
+L(between_32_64_bytes):	/* 32 < n <= 64; [0, 16) and [n-16, n) are already stored.  */
147e83
+	movdqu	%xmm8, 16(%rdi)	/* Bytes [16, 32).  */
147e83
+	movdqu	%xmm8, -32(%rdi,%rdx)	/* Bytes [n-32, n-16).  */
147e83
+	ret
147e83
+	ALIGN (4)
147e83
+L(loop_start):	/* n > 64: unaligned 64-byte head and tail, aligned loop between.  */
147e83
+	leaq	64(%rdi), %rcx	/* With the andq below: %rcx = first 64-byte boundary above dest.  */
147e83
+	movdqu	%xmm8, (%rdi)
147e83
+	andq	$-64, %rcx
147e83
+	movdqu	%xmm8, -16(%rdi,%rdx)
147e83
+	movdqu	%xmm8, 16(%rdi)
147e83
+	movdqu	%xmm8, -32(%rdi,%rdx)
147e83
+	movdqu	%xmm8, 32(%rdi)
147e83
+	movdqu	%xmm8, -48(%rdi,%rdx)
147e83
+	movdqu	%xmm8, 48(%rdi)
147e83
+	movdqu	%xmm8, -64(%rdi,%rdx)	/* Head [0, 64) and tail [n-64, n) are now filled.  */
147e83
+	addq	%rdi, %rdx	/* %rdx = dest + n (end pointer).  */
147e83
+	andq	$-64, %rdx	/* Round the end down to a 64-byte boundary.  */
147e83
+	cmpq	%rdx, %rcx
147e83
+	je	L(return)	/* No aligned 64-byte block left between head and tail.  */
147e83
+	ALIGN (4)
147e83
+L(loop):	/* Fill [%rcx, %rdx) in 64-byte steps; %rcx is 64-byte aligned.  */
147e83
+	movdqa	%xmm8, (%rcx)
147e83
+	movdqa	%xmm8, 16(%rcx)
147e83
+	movdqa	%xmm8, 32(%rcx)
147e83
+	movdqa	%xmm8, 48(%rcx)
147e83
+	addq	$64, %rcx
147e83
+	cmpq	%rcx, %rdx
147e83
+	jne	L(loop)
147e83
+	rep
147e83
+	ret
147e83
+L(less_16_bytes):	/* n <= 16: use general-purpose stores.  */
147e83
+	movq %xmm8, %rcx	/* Low 8 bytes of the fill pattern.  */
147e83
+	testb	$24, %dl	/* Bit 3 or 4 of n set: 8 <= n <= 16.  */
147e83
+	jne	L(between8_16bytes)
147e83
+	testb	$4, %dl	/* Bit 2 of n set: 4 <= n <= 7.  */
147e83
+	jne	L(between4_7bytes)
147e83
+	testb	$1, %dl	/* n is now 0..3.  */
147e83
+	je	L(odd_byte)	/* Even n: skip the single-byte store.  */
147e83
+	movb	%cl, (%rdi)	/* n is 1 or 3: store byte 0.  */
147e83
+L(odd_byte):	/* Reached for every n in 0..3, with byte 0 already stored when n is odd.  */
147e83
+	testb	$2, %dl
147e83
+	je	L(return)	/* n is 0 or 1: nothing more to store.  */
147e83
+	movw	%cx, -2(%rdi,%rdx)	/* n is 2 or 3: last two bytes.  Based on %rdi like every other store, so it stays correct when %rax is not dest (__memset_tail loads %rax from %rcx).  */
147e83
+	ret
147e83
+L(between4_7bytes):
147e83
+	movl	%ecx, (%rdi)	/* Bytes [0, 4).  */
147e83
+	movl	%ecx, -4(%rdi,%rdx)	/* Bytes [n-4, n); overlaps the store above when n < 8.  */
147e83
+	ret
147e83
+L(between8_16bytes):
147e83
+	movq	%rcx, (%rdi)	/* Bytes [0, 8).  */
147e83
+	movq	%rcx, -8(%rdi,%rdx)	/* Bytes [n-8, n); overlaps the store above when n < 16.  */
147e83
+	ret
147e83
 
147e83
 END (memset)
147e83
 libc_hidden_builtin_def (memset)