diff --git a/ecc-256.c b/ecc-256.c
index 571cf73..07841b1 100644
--- a/ecc-256.c
+++ b/ecc-256.c
@@ -108,7 +108,10 @@ ecc_256_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
       u0 -= t;
       t = (u1 < cy);
       u1 -= cy;
-      u1 += cnd_add_n (t, rp + n - 4, ecc->p, 3);
+
+      cy = cnd_add_n (t, rp + n - 4, ecc->p, 2);
+      u0 += cy;
+      u1 += (u0 < cy);
       u1 -= (-t) & 0xffffffff;
     }
   rp[2] = u0;
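
The modp fix above changes where the carry from the conditional add goes:
instead of adding three limbs of p in memory and folding the carry straight
into u1, only the two limbs actually held in memory are added, and the carry
then ripples through the cached limb u0 before it can reach u1. Below is a
minimal self-contained sketch of the idiom, assuming 64-bit limbs and a
simplified stand-in for Nettle's cnd_add_n (the real routine is constant-time
and lives in Nettle's backends):

#include <assert.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Hypothetical stand-in for Nettle's cnd_add_n: if cnd is nonzero,
   add np[0..n-1] into rp[0..n-1], returning the carry out. */
static mp_limb_t
cnd_add_n (mp_limb_t cnd, mp_limb_t *rp, const mp_limb_t *np, int n)
{
  mp_limb_t mask = -(mp_limb_t) (cnd != 0);
  mp_limb_t cy = 0;
  for (int i = 0; i < n; i++)
    {
      mp_limb_t a = np[i] & mask;
      mp_limb_t s = rp[i] + cy;
      cy = (s < cy);
      rp[i] = s + a;
      cy += (rp[i] < a);
    }
  return cy;
}

int
main (void)
{
  /* Two limbs in memory, two more cached in "registers" u0, u1. */
  mp_limb_t rp[2] = { ~(mp_limb_t) 0, ~(mp_limb_t) 0 };
  mp_limb_t p[2] = { 1, 0 };
  mp_limb_t u0 = ~(mp_limb_t) 0, u1 = 0;

  /* As in the patched ecc_256_modp: add only the limbs that live in
     memory, then let the carry ripple through the cached limbs. */
  mp_limb_t cy = cnd_add_n (1, rp, p, 2);
  u0 += cy;
  u1 += (u0 < cy);

  assert (cy == 1 && u0 == 0 && u1 == 1);
  return 0;
}
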
@@ -195,7 +198,7 @@ ecc_256_modq (const struct ecc_curve *ecc, mp_limb_t *rp)
 
       /* Conditional add of p */
       u1 += t;
-      u2 += (t<<32) + (u0 < t);
+      u2 += (t<<32) + (u1 < t);
 
       t = cnd_add_n (t, rp + n - 4, ecc->q, 2);
       u1 += t;
diff --git a/x86_64/ecc-384-modp.asm b/x86_64/ecc-384-modp.asm
index 698838f..31b739e 100644
--- a/x86_64/ecc-384-modp.asm
+++ b/x86_64/ecc-384-modp.asm
@@ -20,7 +20,7 @@ C MA 02111-1301, USA.
 	.file "ecc-384-modp.asm"
 
 define(<RP>, <%rsi>)
-define(<D4>, <%rax>)
+define(<D5>, <%rax>)
 define(<T0>, <%rbx>)
 define(<T1>, <%rcx>)
 define(<T2>, <%rdx>)
@@ -35,8 +35,8 @@ define(<H4>, <%r13>)
 define(<H5>, <%r14>)
 define(<C2>, <%r15>)
 define(<C0>, H5)	C Overlap
-define(<D0>, RP)	C Overlap
-define(<TMP>, H4)	C Overlap
+define(<TMP>, RP)	C Overlap
+
 
 PROLOGUE(nettle_ecc_384_modp)
 	W64_ENTRY(2, 0)
@@ -48,34 +48,38 @@ PROLOGUE(nettle_ecc_384_modp)
 	push	%r14
 	push	%r15
 
-	C First get top 2 limbs, which need folding twice
+	C First get top 2 limbs, which need folding twice.
+	C B^10 = B^6 + B^4 + 2^32 (B-1) B^4 (mod p).
+	C We handle the terms as follows:
 	C
-	C   H5 H4
-	C     -H5
-	C  ------
-	C   H0 D4
+	C B^6: Folded immediately.
 	C
-	C Then shift right, (H1,H0,D4)  <--  (H0,D4) << 32
-	C and add
+	C B^4: Delayed, added in at the next folding.
 	C
-	C     H5 H4
-	C     H1 H0
-	C ----------
-	C  C2 H1 H0
-
-	mov	80(RP), D4
-	mov	88(RP), H0
-	mov	D4, H4
-	mov	H0, H5
-	sub	H0, D4
-	sbb	$0, H0
-
-	mov	D4, T2
-	mov	H0, H1
-	shl	$32, H0
-	shr	$32, T2
+	C 2^32(B-1) B^4: Low half limb delayed until the next
+	C folding. Top 1.5 limbs subtracted and shifted now, resulting
+	C in 2.5 limbs. The low limb is saved in D5, and the high 1.5
+	C limbs are added in.
+
+	mov	80(RP), H4
+	mov	88(RP), H5
+	C Shift right 32 bits, into H1, H0
+	mov	H4, H0
+	mov	H5, H1
+	mov	H5, D5
 	shr	$32, H1
-	or	T2, H0
+	shl	$32, D5
+	shr	$32, H0
+	or	D5, H0
+
+	C	H1 H0
+	C       -  H1 H0
+	C       --------
+	C       H1 H0 D5
+	mov	H0, D5
+	neg	D5
+	sbb	H1, H0
+	sbb	$0, H1
 
 	xor	C2, C2
 	add	H4, H0
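
The folding identity quoted in the comment above is easy to sanity-check
outside the assembly. Here is a throwaway GMP program (Nettle already links
against GMP; compile with -lgmp) verifying B^10 == B^6 + B^4 + 2^32 (B-1) B^4
(mod p) for the P-384 prime p = 2^384 - 2^128 - 2^96 + 2^32 - 1 and limb base
B = 2^64; it is a sketch for checking the math, not part of the patch:

#include <assert.h>
#include <gmp.h>

int
main (void)
{
  mpz_t p, B, lhs, rhs, t;
  mpz_inits (p, B, lhs, rhs, t, NULL);

  /* p = 2^384 - 2^128 - 2^96 + 2^32 - 1 */
  mpz_ui_pow_ui (p, 2, 384);
  mpz_ui_pow_ui (t, 2, 128);
  mpz_sub (p, p, t);
  mpz_ui_pow_ui (t, 2, 96);
  mpz_sub (p, p, t);
  mpz_ui_pow_ui (t, 2, 32);
  mpz_add (p, p, t);
  mpz_sub_ui (p, p, 1);

  mpz_ui_pow_ui (B, 2, 64);	/* limb base */

  mpz_pow_ui (lhs, B, 10);	/* B^10 */
  mpz_mod (lhs, lhs, p);

  mpz_pow_ui (rhs, B, 6);	/* B^6 */
  mpz_pow_ui (t, B, 4);
  mpz_add (rhs, rhs, t);	/* + B^4 */
  mpz_sub_ui (t, B, 1);
  mpz_mul_2exp (t, t, 32);	/* 2^32 (B-1) */
  mpz_pow_ui (B, B, 4);		/* reuse B as B^4 */
  mpz_mul (t, t, B);		/* 2^32 (B-1) B^4 */
  mpz_add (rhs, rhs, t);
  mpz_mod (rhs, rhs, p);

  assert (mpz_cmp (lhs, rhs) == 0);
  mpz_clears (p, B, lhs, rhs, t, NULL);
  return 0;
}
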
@@ -114,118 +118,95 @@ PROLOGUE(nettle_ecc_384_modp)
 	adc	H3, T5
 	adc	$0, C0
 
-	C   H3 H2 H1 H0  0
-	C - H4 H3 H2 H1 H0
-	C  ---------------
-	C   H3 H2 H1 H0 D0
-
-	mov	XREG(D4), XREG(D4)
-	mov	H0, D0
-	neg	D0
-	sbb	H1, H0
-	sbb	H2, H1
-	sbb	H3, H2
-	sbb	H4, H3
-	sbb	$0, D4
-
-	C Shift right. High bits are sign, to be added to C0.
-	mov	D4, TMP
-	sar	$32, TMP
-	shl	$32, D4
-	add	TMP, C0
-
+	C Shift left, including low half of H4
 	mov	H3, TMP
+	shl	$32, H4
 	shr	$32, TMP
-	shl	$32, H3
-	or	TMP, D4
+	or	TMP, H4
 
 	mov	H2, TMP
+	shl	$32, H3
 	shr	$32, TMP
-	shl	$32, H2
 	or	TMP, H3
 
 	mov	H1, TMP
+	shl	$32, H2
 	shr	$32, TMP
-	shl	$32, H1
 	or	TMP, H2
 
 	mov	H0, TMP
+	shl	$32, H1
 	shr	$32, TMP
-	shl	$32, H0
 	or	TMP, H1
 
-	mov	D0, TMP
-	shr	$32, TMP
-	shl	$32, D0
-	or	TMP, H0
+	shl	$32, H0
+
+	C    H4 H3 H2 H1 H0  0
+	C  -    H4 H3 H2 H1 H0
+	C  --------------------
+	C    H4 H3 H2 H1 H0 TMP
 
-	add	D0, T0
+	mov	H0, TMP
+	neg	TMP
+	sbb	H1, H0
+	sbb	H2, H1
+	sbb	H3, H2
+	sbb	H4, H3
+	sbb	$0, H4
+
+	add	TMP, T0
 	adc	H0, T1
 	adc	H1, T2
 	adc	H2, T3
 	adc	H3, T4
-	adc	D4, T5
+	adc	H4, T5
 	adc	$0, C0
 
 	C Remains to add in C2 and C0
-	C                         C0  C0<<32  (-2^32+1)C0
-	C    C2  C2<<32  (-2^32+1)C2
-	C where C2 is always positive, while C0 may be -1.
+	C Set H1, H0 = (2^96 - 2^32 + 1) C0
 	mov	C0, H0
 	mov	C0, H1
-	mov	C0, H2
-	sar	$63, C0		C Get sign
 	shl	$32, H1
-	sub	H1, H0		C Gives borrow iff C0 > 0
+	sub	H1, H0
 	sbb	$0, H1
-	add	C0, H2
 
+	C Set H3, H2 = (2^96 - 2^32 + 1) C2
+	mov	C2, H2
+	mov	C2, H3
+	shl	$32, H3
+	sub	H3, H2
+	sbb	$0, H3
+	add	C0, H2		C No carry. Could use lea trick
+
+	xor	C0, C0
 	add	H0, T0
 	adc	H1, T1
-	adc	$0, H2
-	adc	$0, C0
-
-	C Set (H1 H0)  <-- C2 << 96 - C2 << 32 + 1
-	mov	C2, H0
-	mov	C2, H1
-	shl	$32, H1
-	sub	H1, H0
-	sbb	$0, H1
-
-	add	H2, H0
-	adc	C0, H1
-	adc	C2, C0
-	mov	C0, H2
-	sar	$63, C0
-	add	H0, T2
-	adc	H1, T3
-	adc	H2, T4
-	adc	C0, T5
-	sbb	C0, C0
+	adc	H2, T2
+	adc	H3, T3
+	adc	C2, T4
+	adc	D5, T5		C Value delayed from initial folding
+	adc	$0, C0		C Use sbb and switch sign?
 
 	C Final unlikely carry
 	mov	C0, H0
 	mov	C0, H1
-	mov	C0, H2
-	sar	$63, C0
 	shl	$32, H1
 	sub	H1, H0
 	sbb	$0, H1
-	add	C0, H2
 
 	pop	RP
 
-	sub	H0, T0
+	add	H0, T0
 	mov	T0, (RP)
-	sbb	H1, T1
+	adc	H1, T1
 	mov	T1, 8(RP)
-	sbb	H2, T2
+	adc	C0, T2
 	mov	T2, 16(RP)
-	sbb	C0, T3
+	adc	$0, T3
 	mov	T3, 24(RP)
-	sbb	C0, T4
+	adc	$0, T4
 	mov	T4, 32(RP)
-	sbb	C0, T5
+	adc	$0, T5
 	mov	T5, 40(RP)
 
 	pop	%r15
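
Two of the techniques in the final hunks deserve a note. First, the
interleaved shl/shr/or runs implement a multi-limb left shift by 32 bits:
each limb keeps its own low half shifted up and takes over the high half of
the limb below. A sketch of the same cascade in C, with a hypothetical
helper name and 64-bit limbs:

#include <assert.h>
#include <stdint.h>

/* In-place left shift of an n-limb number by 32 bits, mirroring the
   shl/shr/or pattern in the assembly. Returns the 32 bits shifted
   out of the top limb. */
static uint64_t
shl32 (uint64_t *x, int n)
{
  uint64_t out = x[n - 1] >> 32;
  for (int i = n - 1; i > 0; i--)
    x[i] = (x[i] << 32) | (x[i - 1] >> 32);
  x[0] <<= 32;
  return out;
}

int
main (void)
{
  uint64_t x[2] = { 0x1111111122222222ULL, 0x3333333344444444ULL };
  uint64_t hi = shl32 (x, 2);
  assert (hi == 0x33333333ULL);
  assert (x[1] == 0x4444444411111111ULL);
  assert (x[0] == 0x2222222200000000ULL);
  return 0;
}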
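
Second, the mov/shl/sub/sbb sequences that set H1, H0 = (2^96 - 2^32 + 1) C0
(and likewise for C2) replace the multiplication by that constant with one
shift, one subtract and a borrow. A C sketch of the trick, with a
hypothetical helper name, assuming the factor fits in 32 bits; GCC/Clang's
unsigned __int128 is used only to verify the result:

#include <assert.h>
#include <stdint.h>

/* h1:h0 = (2^96 - 2^32 + 1) * c without a multiply, as in the
   assembly: for c < 2^32, c * 2^96 has (c << 32) as its high limb
   and 0 as its low limb. */
static void
mul_fold (uint64_t c, uint64_t *h1, uint64_t *h0)
{
  uint64_t hi = c << 32;
  *h0 = c - hi;			/* sub H1, H0 */
  *h1 = hi - (c < hi);		/* sbb $0, H1 */
}

int
main (void)
{
  unsigned __int128 k =
    ((unsigned __int128) 1 << 96) - ((unsigned __int128) 1 << 32) + 1;
  uint64_t h1, h0;
  for (uint64_t c = 0; c < 1000; c++)
    {
      mul_fold (c, &h1, &h0);
      assert ((((unsigned __int128) h1 << 64) | h0) == k * c);
    }
  return 0;
}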