aboutsummaryrefslogtreecommitdiff
path: root/openssl-1.1.0h/crypto/poly1305/asm
diff options
context:
space:
mode:
Diffstat (limited to 'openssl-1.1.0h/crypto/poly1305/asm')
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-armv4.pl1252
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-armv8.pl943
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-c64xplus.pl331
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-mips.pl425
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-ppc.pl644
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-ppcfp.pl739
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-s390x.pl227
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-sparcv9.pl1120
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-x86.pl1814
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-x86_64.pl2268
10 files changed, 9763 insertions, 0 deletions
diff --git a/openssl-1.1.0h/crypto/poly1305/asm/poly1305-armv4.pl b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-armv4.pl
new file mode 100755
index 0000000..fc899ce
--- /dev/null
+++ b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-armv4.pl
@@ -0,0 +1,1252 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# IALU(*)/gcc-4.4 NEON
+#
+# ARM11xx(ARMv6) 7.78/+100% -
+# Cortex-A5 6.35/+130% 3.00
+# Cortex-A8 6.25/+115% 2.36
+# Cortex-A9 5.10/+95% 2.55
+# Cortex-A15 3.85/+85% 1.25(**)
+# Snapdragon S4 5.70/+100% 1.48(**)
+#
+# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
+# (**) these are trade-off results, they can be improved by ~8% but at
+# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
+# to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.globl poly1305_emit
+.globl poly1305_blocks
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+.Lpoly1305_init:
+ stmdb sp!,{r4-r11}
+
+ eor r3,r3,r3
+ cmp $inp,#0
+ str r3,[$ctx,#0] @ zero hash value
+ str r3,[$ctx,#4]
+ str r3,[$ctx,#8]
+ str r3,[$ctx,#12]
+ str r3,[$ctx,#16]
+ str r3,[$ctx,#36] @ is_base2_26
+ add $ctx,$ctx,#20
+
+#ifdef __thumb2__
+ it eq
+#endif
+ moveq r0,#0
+ beq .Lno_key
+
+#if __ARM_MAX_ARCH__>=7
+ adr r11,.Lpoly1305_init
+ ldr r12,.LOPENSSL_armcap
+#endif
+ ldrb r4,[$inp,#0]
+ mov r10,#0x0fffffff
+ ldrb r5,[$inp,#1]
+ and r3,r10,#-4 @ 0x0ffffffc
+ ldrb r6,[$inp,#2]
+ ldrb r7,[$inp,#3]
+ orr r4,r4,r5,lsl#8
+ ldrb r5,[$inp,#4]
+ orr r4,r4,r6,lsl#16
+ ldrb r6,[$inp,#5]
+ orr r4,r4,r7,lsl#24
+ ldrb r7,[$inp,#6]
+ and r4,r4,r10
+
+#if __ARM_MAX_ARCH__>=7
+ ldr r12,[r11,r12] @ OPENSSL_armcap_P
+# ifdef __APPLE__
+ ldr r12,[r12]
+# endif
+#endif
+ ldrb r8,[$inp,#7]
+ orr r5,r5,r6,lsl#8
+ ldrb r6,[$inp,#8]
+ orr r5,r5,r7,lsl#16
+ ldrb r7,[$inp,#9]
+ orr r5,r5,r8,lsl#24
+ ldrb r8,[$inp,#10]
+ and r5,r5,r3
+
+#if __ARM_MAX_ARCH__>=7
+ tst r12,#ARMV7_NEON @ check for NEON
+# ifdef __APPLE__
+ adr r9,poly1305_blocks_neon
+ adr r11,poly1305_blocks
+# ifdef __thumb2__
+ it ne
+# endif
+ movne r11,r9
+ adr r12,poly1305_emit
+ adr r10,poly1305_emit_neon
+# ifdef __thumb2__
+ it ne
+# endif
+ movne r12,r10
+# else
+# ifdef __thumb2__
+ itete eq
+# endif
+ addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
+ addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
+ addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
+# endif
+# ifdef __thumb2__
+ orr r12,r12,#1 @ thumb-ify address
+ orr r11,r11,#1
+# endif
+#endif
+ ldrb r9,[$inp,#11]
+ orr r6,r6,r7,lsl#8
+ ldrb r7,[$inp,#12]
+ orr r6,r6,r8,lsl#16
+ ldrb r8,[$inp,#13]
+ orr r6,r6,r9,lsl#24
+ ldrb r9,[$inp,#14]
+ and r6,r6,r3
+
+ ldrb r10,[$inp,#15]
+ orr r7,r7,r8,lsl#8
+ str r4,[$ctx,#0]
+ orr r7,r7,r9,lsl#16
+ str r5,[$ctx,#4]
+ orr r7,r7,r10,lsl#24
+ str r6,[$ctx,#8]
+ and r7,r7,r3
+ str r7,[$ctx,#12]
+#if __ARM_MAX_ARCH__>=7
+ stmia r2,{r11,r12} @ fill functions table
+ mov r0,#1
+#else
+ mov r0,#0
+#endif
+.Lno_key:
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
+my ($s1,$s2,$s3)=($r1,$r2,$r3);
+
+$code.=<<___;
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+ stmdb sp!,{r3-r11,lr}
+
+ ands $len,$len,#-16
+ beq .Lno_data
+
+ cmp $padbit,#0
+ add $len,$len,$inp @ end pointer
+ sub sp,sp,#32
+
+ ldmia $ctx,{$h0-$r3} @ load context
+
+ str $ctx,[sp,#12] @ offload stuff
+ mov lr,$inp
+ str $len,[sp,#16]
+ str $r1,[sp,#20]
+ str $r2,[sp,#24]
+ str $r3,[sp,#28]
+ b .Loop
+
+.Loop:
+#if __ARM_ARCH__<7
+ ldrb r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi $h4,$h4,#1 @ 1<<128
+ ldrb r1,[lr,#-15]
+ ldrb r2,[lr,#-14]
+ ldrb r3,[lr,#-13]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-12]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-11]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-10]
+ adds $h0,$h0,r3 @ accumulate input
+
+ ldrb r3,[lr,#-9]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-8]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-7]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-6]
+ adcs $h1,$h1,r3
+
+ ldrb r3,[lr,#-5]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-4]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-3]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-2]
+ adcs $h2,$h2,r3
+
+ ldrb r3,[lr,#-1]
+ orr r1,r0,r1,lsl#8
+ str lr,[sp,#8] @ offload input pointer
+ orr r2,r1,r2,lsl#16
+ add $s1,$r1,$r1,lsr#2
+ orr r3,r2,r3,lsl#24
+#else
+ ldr r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi $h4,$h4,#1 @ padbit
+ ldr r1,[lr,#-12]
+ ldr r2,[lr,#-8]
+ ldr r3,[lr,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ adds $h0,$h0,r0 @ accumulate input
+ str lr,[sp,#8] @ offload input pointer
+ adcs $h1,$h1,r1
+ add $s1,$r1,$r1,lsr#2
+ adcs $h2,$h2,r2
+#endif
+ add $s2,$r2,$r2,lsr#2
+ adcs $h3,$h3,r3
+ add $s3,$r3,$r3,lsr#2
+
+ umull r2,r3,$h1,$r0
+ adc $h4,$h4,#0
+ umull r0,r1,$h0,$r0
+ umlal r2,r3,$h4,$s1
+ umlal r0,r1,$h3,$s1
+ ldr $r1,[sp,#20] @ reload $r1
+ umlal r2,r3,$h2,$s3
+ umlal r0,r1,$h1,$s3
+ umlal r2,r3,$h3,$s2
+ umlal r0,r1,$h2,$s2
+ umlal r2,r3,$h0,$r1
+ str r0,[sp,#0] @ future $h0
+ mul r0,$s2,$h4
+ ldr $r2,[sp,#24] @ reload $r2
+ adds r2,r2,r1 @ d1+=d0>>32
+ eor r1,r1,r1
+ adc lr,r3,#0 @ future $h2
+ str r2,[sp,#4] @ future $h1
+
+ mul r2,$s3,$h4
+ eor r3,r3,r3
+ umlal r0,r1,$h3,$s3
+ ldr $r3,[sp,#28] @ reload $r3
+ umlal r2,r3,$h3,$r0
+ umlal r0,r1,$h2,$r0
+ umlal r2,r3,$h2,$r1
+ umlal r0,r1,$h1,$r1
+ umlal r2,r3,$h1,$r2
+ umlal r0,r1,$h0,$r2
+ umlal r2,r3,$h0,$r3
+ ldr $h0,[sp,#0]
+ mul $h4,$r0,$h4
+ ldr $h1,[sp,#4]
+
+ adds $h2,lr,r0 @ d2+=d1>>32
+ ldr lr,[sp,#8] @ reload input pointer
+ adc r1,r1,#0
+ adds $h3,r2,r1 @ d3+=d2>>32
+ ldr r0,[sp,#16] @ reload end pointer
+ adc r3,r3,#0
+ add $h4,$h4,r3 @ h4+=d3>>32
+
+ and r1,$h4,#-4
+ and $h4,$h4,#3
+ add r1,r1,r1,lsr#2 @ *=5
+ adds $h0,$h0,r1
+ adcs $h1,$h1,#0
+ adcs $h2,$h2,#0
+ adcs $h3,$h3,#0
+ adc $h4,$h4,#0
+
+ cmp r0,lr @ done yet?
+ bhi .Loop
+
+ ldr $ctx,[sp,#12]
+ add sp,sp,#32
+ stmia $ctx,{$h0-$h4} @ store the result
+
+.Lno_data:
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r3-r11,pc}
+#else
+ ldmia sp!,{r3-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce)=map("r$_",(0..2));
+my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
+my $g4=$h4;
+
+$code.=<<___;
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+ stmdb sp!,{r4-r11}
+.Lpoly1305_emit_enter:
+
+ ldmia $ctx,{$h0-$h4}
+ adds $g0,$h0,#5 @ compare to modulus
+ adcs $g1,$h1,#0
+ adcs $g2,$h2,#0
+ adcs $g3,$h3,#0
+ adc $g4,$h4,#0
+ tst $g4,#4 @ did it carry/borrow?
+
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h0,$g0
+ ldr $g0,[$nonce,#0]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h1,$g1
+ ldr $g1,[$nonce,#4]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h2,$g2
+ ldr $g2,[$nonce,#8]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h3,$g3
+ ldr $g3,[$nonce,#12]
+
+ adds $h0,$h0,$g0
+ adcs $h1,$h1,$g1
+ adcs $h2,$h2,$g2
+ adc $h3,$h3,$g3
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+ rev $h0,$h0
+ rev $h1,$h1
+ rev $h2,$h2
+ rev $h3,$h3
+# endif
+ str $h0,[$mac,#0]
+ str $h1,[$mac,#4]
+ str $h2,[$mac,#8]
+ str $h3,[$mac,#12]
+#else
+ strb $h0,[$mac,#0]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#4]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#8]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#12]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#1]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#5]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#9]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#13]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#2]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#6]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#10]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#14]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#3]
+ strb $h1,[$mac,#7]
+ strb $h2,[$mac,#11]
+ strb $h3,[$mac,#15]
+#endif
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_emit,.-poly1305_emit
+___
+{
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
+my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
+my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
+
+my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.fpu neon
+
+.type poly1305_init_neon,%function
+.align 5
+poly1305_init_neon:
+ ldr r4,[$ctx,#20] @ load key base 2^32
+ ldr r5,[$ctx,#24]
+ ldr r6,[$ctx,#28]
+ ldr r7,[$ctx,#32]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ and r3,r3,#0x03ffffff
+ and r4,r4,#0x03ffffff
+ and r5,r5,#0x03ffffff
+
+ vdup.32 $R0,r2 @ r^1 in both lanes
+ add r2,r3,r3,lsl#2 @ *5
+ vdup.32 $R1,r3
+ add r3,r4,r4,lsl#2
+ vdup.32 $S1,r2
+ vdup.32 $R2,r4
+ add r4,r5,r5,lsl#2
+ vdup.32 $S2,r3
+ vdup.32 $R3,r5
+ add r5,r6,r6,lsl#2
+ vdup.32 $S3,r4
+ vdup.32 $R4,r6
+ vdup.32 $S4,r5
+
+ mov $zeros,#2 @ counter
+
+.Lsquare_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+
+ vmull.u32 $D0,$R0,${R0}[1]
+ vmull.u32 $D1,$R1,${R0}[1]
+ vmull.u32 $D2,$R2,${R0}[1]
+ vmull.u32 $D3,$R3,${R0}[1]
+ vmull.u32 $D4,$R4,${R0}[1]
+
+ vmlal.u32 $D0,$R4,${S1}[1]
+ vmlal.u32 $D1,$R0,${R1}[1]
+ vmlal.u32 $D2,$R1,${R1}[1]
+ vmlal.u32 $D3,$R2,${R1}[1]
+ vmlal.u32 $D4,$R3,${R1}[1]
+
+ vmlal.u32 $D0,$R3,${S2}[1]
+ vmlal.u32 $D1,$R4,${S2}[1]
+ vmlal.u32 $D3,$R1,${R2}[1]
+ vmlal.u32 $D2,$R0,${R2}[1]
+ vmlal.u32 $D4,$R2,${R2}[1]
+
+ vmlal.u32 $D0,$R2,${S3}[1]
+ vmlal.u32 $D3,$R0,${R3}[1]
+ vmlal.u32 $D1,$R3,${S3}[1]
+ vmlal.u32 $D2,$R4,${S3}[1]
+ vmlal.u32 $D4,$R1,${R3}[1]
+
+ vmlal.u32 $D3,$R4,${S4}[1]
+ vmlal.u32 $D0,$R1,${S4}[1]
+ vmlal.u32 $D1,$R2,${S4}[1]
+ vmlal.u32 $D2,$R3,${S4}[1]
+ vmlal.u32 $D4,$R0,${R4}[1]
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ @ and P. Schwabe
+ @
+ @ H0>>+H1>>+H2>>+H3>>+H4
+ @ H3>>+H4>>*5+H0>>+H1
+ @
+ @ Trivia.
+ @
+ @ Result of multiplication of n-bit number by m-bit number is
+ @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
+ @ m-bit number multiplied by 2^n is still n+m bits wide.
+ @
+ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+ @ one is n+1 bits wide.
+ @
+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+ @ can be 27. However! In cases when their width exceeds 26 bits
+ @ they are limited by 2^26+2^6. This in turn means that *sum*
+ @ of the products with these values can still be viewed as sum
+ @ of 52-bit numbers as long as the amount of addends is not a
+ @ power of 2. For example,
+ @
+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+ @
+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+ @ 8 * (2^52) or 2^55. However, the value is then multiplied by
+ @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+ @ which is less than 32 * (2^52) or 2^57. And when processing
+ @ data we are looking at triple as many addends...
+ @
+ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+ @ instruction accepts 2x32-bit input and writes 2x64-bit result.
+ @ This means that result of reduction have to be compressed upon
+ @ loop wrap-around. This can be done in the process of reduction
+ @ to minimize amount of instructions [as well as amount of
+ @ 128-bit instructions, which benefits low-end processors], but
+ @ one has to watch for H2 (which is narrower than H0) and 5*H4
+ @ not being wider than 58 bits, so that result of right shift
+ @ by 26 bits fits in 32 bits. This is also useful on x86,
+ @ because it allows to use paddd in place for paddq, which
+ @ benefits Atom, where paddq is ridiculously slow.
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vbic.i32 $D4#lo,#0xfc000000
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vbic.i32 $D2#lo,#0xfc000000
+
+ vshr.u32 $T0#lo,$D0#lo,#26
+ vbic.i32 $D0#lo,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+
+ subs $zeros,$zeros,#1
+ beq .Lsquare_break_neon
+
+ add $tbl0,$ctx,#(48+0*9*4)
+ add $tbl1,$ctx,#(48+1*9*4)
+
+ vtrn.32 $R0,$D0#lo @ r^2:r^1
+ vtrn.32 $R2,$D2#lo
+ vtrn.32 $R3,$D3#lo
+ vtrn.32 $R1,$D1#lo
+ vtrn.32 $R4,$D4#lo
+
+ vshl.u32 $S2,$R2,#2 @ *5
+ vshl.u32 $S3,$R3,#2
+ vshl.u32 $S1,$R1,#2
+ vshl.u32 $S4,$R4,#2
+ vadd.i32 $S2,$S2,$R2
+ vadd.i32 $S1,$S1,$R1
+ vadd.i32 $S3,$S3,$R3
+ vadd.i32 $S4,$S4,$R4
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0,:32]
+ vst1.32 {${S4}[1]},[$tbl1,:32]
+
+ b .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+ add $tbl0,$ctx,#(48+2*4*9)
+ add $tbl1,$ctx,#(48+3*4*9)
+
+ vmov $R0,$D0#lo @ r^4:r^3
+ vshl.u32 $S1,$D1#lo,#2 @ *5
+ vmov $R1,$D1#lo
+ vshl.u32 $S2,$D2#lo,#2
+ vmov $R2,$D2#lo
+ vshl.u32 $S3,$D3#lo,#2
+ vmov $R3,$D3#lo
+ vshl.u32 $S4,$D4#lo,#2
+ vmov $R4,$D4#lo
+ vadd.i32 $S1,$S1,$D1#lo
+ vadd.i32 $S2,$S2,$D2#lo
+ vadd.i32 $S3,$S3,$D3#lo
+ vadd.i32 $S4,$S4,$D4#lo
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0]
+ vst1.32 {${S4}[1]},[$tbl1]
+
+ ret @ bx lr
+.size poly1305_init_neon,.-poly1305_init_neon
+
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+ ldr ip,[$ctx,#36] @ is_base2_26
+ ands $len,$len,#-16
+ beq .Lno_data_neon
+
+ cmp $len,#64
+ bhs .Lenter_neon
+ tst ip,ip @ is_base2_26?
+ beq poly1305_blocks
+
+.Lenter_neon:
+ stmdb sp!,{r4-r7}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ tst ip,ip @ is_base2_26?
+ bne .Lbase2_26_neon
+
+ stmdb sp!,{r1-r3,lr}
+ bl poly1305_init_neon
+
+ ldr r4,[$ctx,#0] @ load hash value base 2^32
+ ldr r5,[$ctx,#4]
+ ldr r6,[$ctx,#8]
+ ldr r7,[$ctx,#12]
+ ldr ip,[$ctx,#16]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ veor $D0#lo,$D0#lo,$D0#lo
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ veor $D1#lo,$D1#lo,$D1#lo
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ veor $D2#lo,$D2#lo,$D2#lo
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ veor $D3#lo,$D3#lo,$D3#lo
+ and r3,r3,#0x03ffffff
+ orr r6,r6,ip,lsl#24
+ veor $D4#lo,$D4#lo,$D4#lo
+ and r4,r4,#0x03ffffff
+ mov r1,#1
+ and r5,r5,#0x03ffffff
+ str r1,[$ctx,#36] @ is_base2_26
+
+ vmov.32 $D0#lo[0],r2
+ vmov.32 $D1#lo[0],r3
+ vmov.32 $D2#lo[0],r4
+ vmov.32 $D3#lo[0],r5
+ vmov.32 $D4#lo[0],r6
+ adr $zeros,.Lzeros
+
+ ldmia sp!,{r1-r3,lr}
+ b .Lbase2_32_neon
+
+.align 4
+.Lbase2_26_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ load hash value
+
+ veor $D0#lo,$D0#lo,$D0#lo
+ veor $D1#lo,$D1#lo,$D1#lo
+ veor $D2#lo,$D2#lo,$D2#lo
+ veor $D3#lo,$D3#lo,$D3#lo
+ veor $D4#lo,$D4#lo,$D4#lo
+ vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ adr $zeros,.Lzeros
+ vld1.32 {$D4#lo[0]},[$ctx]
+ sub $ctx,$ctx,#16 @ rewind
+
+.Lbase2_32_neon:
+ add $in2,$inp,#32
+ mov $padbit,$padbit,lsl#24
+ tst $len,#31
+ beq .Leven
+
+ vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
+ vmov.32 $H4#lo[0],$padbit
+ sub $len,$len,#16
+ add $in2,$inp,#32
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3#lo,$H3#lo,#18
+
+ vsri.u32 $H3#lo,$H2#lo,#14
+ vshl.u32 $H2#lo,$H2#lo,#12
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
+
+ vbic.i32 $H3#lo,#0xfc000000
+ vsri.u32 $H2#lo,$H1#lo,#20
+ vshl.u32 $H1#lo,$H1#lo,#6
+
+ vbic.i32 $H2#lo,#0xfc000000
+ vsri.u32 $H1#lo,$H0#lo,#26
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+
+ vbic.i32 $H0#lo,#0xfc000000
+ vbic.i32 $H1#lo,#0xfc000000
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo
+
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+
+ mov $tbl1,$zeros
+ add $tbl0,$ctx,#48
+
+ cmp $len,$len
+ b .Long_tail
+
+.align 4
+.Leven:
+ subs $len,$len,#64
+ it lo
+ movlo $in2,$zeros
+
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+ itt hi
+ addhi $tbl1,$ctx,#(48+1*9*4)
+ addhi $tbl0,$ctx,#(48+3*9*4)
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3,$H3,#18
+
+ vsri.u32 $H3,$H2,#14
+ vshl.u32 $H2,$H2,#12
+
+ vbic.i32 $H3,#0xfc000000
+ vsri.u32 $H2,$H1,#20
+ vshl.u32 $H1,$H1,#6
+
+ vbic.i32 $H2,#0xfc000000
+ vsri.u32 $H1,$H0,#26
+
+ vbic.i32 $H0,#0xfc000000
+ vbic.i32 $H1,#0xfc000000
+
+ bls .Lskip_loop
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ b .Loop_neon
+
+.align 5
+.Loop_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ @ \___________________/
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ @ \___________________/ \____________________/
+ @
+ @ Note that we start with inp[2:3]*r^2. This is because it
+ @ doesn't depend on reduction in previous iteration.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ inp[2:3]*r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
+ vmull.u32 $D2,$H2#hi,${R0}[1]
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,${R0}[1]
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,${R0}[1]
+ vmlal.u32 $D2,$H1#hi,${R1}[1]
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,${R0}[1]
+
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,${R0}[1]
+ subs $len,$len,#64
+ vmlal.u32 $D0,$H4#hi,${S1}[1]
+ it lo
+ movlo $in2,$zeros
+ vmlal.u32 $D3,$H2#hi,${R1}[1]
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D1,$H0#hi,${R1}[1]
+ vmlal.u32 $D4,$H3#hi,${R1}[1]
+
+ vmlal.u32 $D0,$H3#hi,${S2}[1]
+ vmlal.u32 $D3,$H1#hi,${R2}[1]
+ vmlal.u32 $D4,$H2#hi,${R2}[1]
+ vmlal.u32 $D1,$H4#hi,${S2}[1]
+ vmlal.u32 $D2,$H0#hi,${R2}[1]
+
+ vmlal.u32 $D3,$H0#hi,${R3}[1]
+ vmlal.u32 $D0,$H2#hi,${S3}[1]
+ vmlal.u32 $D4,$H1#hi,${R3}[1]
+ vmlal.u32 $D1,$H3#hi,${S3}[1]
+ vmlal.u32 $D2,$H4#hi,${S3}[1]
+
+ vmlal.u32 $D3,$H4#hi,${S4}[1]
+ vmlal.u32 $D0,$H1#hi,${S4}[1]
+ vmlal.u32 $D4,$H0#hi,${R4}[1]
+ vmlal.u32 $D1,$H2#hi,${S4}[1]
+ vmlal.u32 $D2,$H3#hi,${S4}[1]
+
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4 and accumulate
+
+ vmlal.u32 $D3,$H3#lo,${R0}[0]
+ vmlal.u32 $D0,$H0#lo,${R0}[0]
+ vmlal.u32 $D4,$H4#lo,${R0}[0]
+ vmlal.u32 $D1,$H1#lo,${R0}[0]
+ vmlal.u32 $D2,$H2#lo,${R0}[0]
+ vld1.32 ${S4}[0],[$tbl0,:32]
+
+ vmlal.u32 $D3,$H2#lo,${R1}[0]
+ vmlal.u32 $D0,$H4#lo,${S1}[0]
+ vmlal.u32 $D4,$H3#lo,${R1}[0]
+ vmlal.u32 $D1,$H0#lo,${R1}[0]
+ vmlal.u32 $D2,$H1#lo,${R1}[0]
+
+ vmlal.u32 $D3,$H1#lo,${R2}[0]
+ vmlal.u32 $D0,$H3#lo,${S2}[0]
+ vmlal.u32 $D4,$H2#lo,${R2}[0]
+ vmlal.u32 $D1,$H4#lo,${S2}[0]
+ vmlal.u32 $D2,$H0#lo,${R2}[0]
+
+ vmlal.u32 $D3,$H0#lo,${R3}[0]
+ vmlal.u32 $D0,$H2#lo,${S3}[0]
+ vmlal.u32 $D4,$H1#lo,${R3}[0]
+ vmlal.u32 $D1,$H3#lo,${S3}[0]
+ vmlal.u32 $D3,$H4#lo,${S4}[0]
+
+ vmlal.u32 $D2,$H4#lo,${S3}[0]
+ vmlal.u32 $D0,$H1#lo,${S4}[0]
+ vmlal.u32 $D4,$H0#lo,${R4}[0]
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vmlal.u32 $D1,$H2#lo,${S4}[0]
+ vmlal.u32 $D2,$H3#lo,${S4}[0]
+
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+ vrev32.8 $H3,$H3
+# endif
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+ @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vshl.u32 $H3,$H3,#18
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vsri.u32 $H3,$H2,#14
+ vbic.i32 $D4#lo,#0xfc000000
+ vshl.u32 $H2,$H2,#12
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vbic.i32 $H3,#0xfc000000
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
+ vsri.u32 $H2,$H1,#20
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vshl.u32 $H1,$H1,#6
+ vbic.i32 $D2#lo,#0xfc000000
+ vbic.i32 $H2,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
+ vmovn.i64 $D0#lo,$D0
+ vsri.u32 $H1,$H0,#26
+ vbic.i32 $H0,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vbic.i32 $D0#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+ vbic.i32 $H1,#0xfc000000
+
+ bhi .Loop_neon
+
+.Lskip_loop:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ add $tbl1,$ctx,#(48+0*9*4)
+ add $tbl0,$ctx,#(48+1*9*4)
+ adds $len,$len,#32
+ it ne
+ movne $len,#0
+ bne .Long_tail
+
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo
+
+.Long_tail:
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
+ vmull.u32 $D2,$H2#hi,$R0
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,$R0
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,$R0
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,$R0
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,$R0
+
+ vmlal.u32 $D0,$H4#hi,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#hi,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#hi,$R1
+ vmlal.u32 $D4,$H3#hi,$R1
+ vmlal.u32 $D2,$H1#hi,$R1
+
+ vmlal.u32 $D3,$H1#hi,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#hi,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#hi,$R2
+ vmlal.u32 $D1,$H4#hi,$S2
+ vmlal.u32 $D2,$H0#hi,$R2
+
+ vmlal.u32 $D3,$H0#hi,$R3
+ it ne
+ addne $tbl1,$ctx,#(48+2*9*4)
+ vmlal.u32 $D0,$H2#hi,$S3
+ it ne
+ addne $tbl0,$ctx,#(48+3*9*4)
+ vmlal.u32 $D4,$H1#hi,$R3
+ vmlal.u32 $D1,$H3#hi,$S3
+ vmlal.u32 $D2,$H4#hi,$S3
+
+ vmlal.u32 $D3,$H4#hi,$S4
+ vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
+ vmlal.u32 $D0,$H1#hi,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#hi,$R4
+ vmlal.u32 $D1,$H2#hi,$S4
+ vmlal.u32 $D2,$H3#hi,$S4
+
+ beq .Lshort_tail
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+
+ vmlal.u32 $D2,$H2#lo,$R0
+ vmlal.u32 $D0,$H0#lo,$R0
+ vmlal.u32 $D3,$H3#lo,$R0
+ vmlal.u32 $D1,$H1#lo,$R0
+ vmlal.u32 $D4,$H4#lo,$R0
+
+ vmlal.u32 $D0,$H4#lo,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#lo,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#lo,$R1
+ vmlal.u32 $D4,$H3#lo,$R1
+ vmlal.u32 $D2,$H1#lo,$R1
+
+ vmlal.u32 $D3,$H1#lo,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#lo,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#lo,$R2
+ vmlal.u32 $D1,$H4#lo,$S2
+ vmlal.u32 $D2,$H0#lo,$R2
+
+ vmlal.u32 $D3,$H0#lo,$R3
+ vmlal.u32 $D0,$H2#lo,$S3
+ vmlal.u32 $D4,$H1#lo,$R3
+ vmlal.u32 $D1,$H3#lo,$S3
+ vmlal.u32 $D2,$H4#lo,$S3
+
+ vmlal.u32 $D3,$H4#lo,$S4
+ vorn $MASK,$MASK,$MASK @ all-ones
+ vmlal.u32 $D0,$H1#lo,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#lo,$R4
+ vmlal.u32 $D1,$H2#lo,$S4
+ vmlal.u32 $D2,$H3#lo,$S4
+
+.Lshort_tail:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 $D3#lo,$D3#lo,$D3#hi
+ vadd.i64 $D0#lo,$D0#lo,$D0#hi
+ vadd.i64 $D4#lo,$D4#lo,$D4#hi
+ vadd.i64 $D1#lo,$D1#lo,$D1#hi
+ vadd.i64 $D2#lo,$D2#lo,$D2#hi
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction, but without narrowing
+
+ vshr.u64 $T0,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vshr.u64 $T1,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+
+ vshr.u64 $T0,$D4,#26
+ vand.i64 $D4,$D4,$MASK
+ vshr.u64 $T1,$D1,#26
+ vand.i64 $D1,$D1,$MASK
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+
+ vadd.i64 $D0,$D0,$T0
+ vshl.u64 $T0,$T0,#2
+ vshr.u64 $T1,$D2,#26
+ vand.i64 $D2,$D2,$MASK
+ vadd.i64 $D0,$D0,$T0 @ h4 -> h0
+ vadd.i64 $D3,$D3,$T1 @ h2 -> h3
+
+ vshr.u64 $T0,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vshr.u64 $T1,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vadd.i64 $D1,$D1,$T0 @ h0 -> h1
+ vadd.i64 $D4,$D4,$T1 @ h3 -> h4
+
+ cmp $len,#0
+ bne .Leven
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ store hash value
+
+ vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ vst1.32 {$D4#lo[0]},[$ctx]
+
+ vldmia sp!,{d8-d15} @ epilogue
+ ldmia sp!,{r4-r7}
+.Lno_data_neon:
+ ret @ bx lr
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.type poly1305_emit_neon,%function
+.align 5
+poly1305_emit_neon:
+ ldr ip,[$ctx,#36] @ is_base2_26
+
+ stmdb sp!,{r4-r11}
+
+ tst ip,ip
+ beq .Lpoly1305_emit_enter
+
+ ldmia $ctx,{$h0-$h4}
+ eor $g0,$g0,$g0
+
+ adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
+ mov $h1,$h1,lsr#6
+ adcs $h1,$h1,$h2,lsl#20
+ mov $h2,$h2,lsr#12
+ adcs $h2,$h2,$h3,lsl#14
+ mov $h3,$h3,lsr#18
+ adcs $h3,$h3,$h4,lsl#8
+ adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ...
+
+ and $g0,$h4,#-4 @ ... so reduce
+ and $h4,$h3,#3
+ add $g0,$g0,$g0,lsr#2 @ *= 5
+ adds $h0,$h0,$g0
+ adcs $h1,$h1,#0
+ adcs $h2,$h2,#0
+ adcs $h3,$h3,#0
+ adc $h4,$h4,#0
+
+ adds $g0,$h0,#5 @ compare to modulus
+ adcs $g1,$h1,#0
+ adcs $g2,$h2,#0
+ adcs $g3,$h3,#0
+ adc $g4,$h4,#0
+ tst $g4,#4 @ did it carry/borrow?
+
+ it ne
+ movne $h0,$g0
+ ldr $g0,[$nonce,#0]
+ it ne
+ movne $h1,$g1
+ ldr $g1,[$nonce,#4]
+ it ne
+ movne $h2,$g2
+ ldr $g2,[$nonce,#8]
+ it ne
+ movne $h3,$g3
+ ldr $g3,[$nonce,#12]
+
+ adds $h0,$h0,$g0 @ accumulate nonce
+ adcs $h1,$h1,$g1
+ adcs $h2,$h2,$g2
+ adc $h3,$h3,$g3
+
+# ifdef __ARMEB__
+ rev $h0,$h0
+ rev $h1,$h1
+ rev $h2,$h2
+ rev $h3,$h3
+# endif
+ str $h0,[$mac,#0] @ store the result
+ str $h1,[$mac,#4]
+ str $h2,[$mac,#8]
+ str $h3,[$mac,#12]
+
+ ldmia sp!,{r4-r11}
+ ret @ bx lr
+.size poly1305_emit_neon,.-poly1305_emit_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.Lpoly1305_init
+#endif
+___
+} }
+$code.=<<___;
+.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+close STDOUT; # enforce flush
diff --git a/openssl-1.1.0h/crypto/poly1305/asm/poly1305-armv8.pl b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-armv8.pl
new file mode 100755
index 0000000..0fc8667
--- /dev/null
+++ b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-armv8.pl
@@ -0,0 +1,943 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for ARMv8.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc-4.9 NEON
+#
+# Apple A7 1.86/+5% 0.72
+# Cortex-A53 2.69/+58% 1.47
+# Cortex-A57 2.70/+7% 1.14
+# Denver 1.64/+50% 1.18(*)
+# X-Gene 2.13/+68% 2.27
+# Mongoose 1.77/+75% 1.12
+#
+# (*) estimate based on resources availability is less than 1.0,
+# i.e. measured result is worse than expected, presumably binary
+# translator is not almighty;
+
+$flavour=shift;
+$output=shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+// forward "declarations" are required for Apple
+.extern OPENSSL_armcap_P
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+ cmp $inp,xzr
+ stp xzr,xzr,[$ctx] // zero hash value
+ stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+#ifdef __ILP32__
+ ldrsw $t1,.LOPENSSL_armcap_P
+#else
+ ldr $t1,.LOPENSSL_armcap_P
+#endif
+ adr $t0,.LOPENSSL_armcap_P
+
+ ldp $r0,$r1,[$inp] // load key
+ mov $s1,#0xfffffffc0fffffff
+ movk $s1,#0x0fff,lsl#48
+ ldr w17,[$t0,$t1]
+#ifdef __ARMEB__
+ rev $r0,$r0 // flip bytes
+ rev $r1,$r1
+#endif
+ and $r0,$r0,$s1 // &=0ffffffc0fffffff
+ and $s1,$s1,#-4
+ and $r1,$r1,$s1 // &=0ffffffc0ffffffc
+ stp $r0,$r1,[$ctx,#32] // save key value
+
+ tst w17,#ARMV7_NEON
+
+ adr $d0,poly1305_blocks
+ adr $r0,poly1305_blocks_neon
+ adr $d1,poly1305_emit
+ adr $r1,poly1305_emit_neon
+
+ csel $d0,$d0,$r0,eq
+ csel $d1,$d1,$r1,eq
+
+#ifdef __ILP32__
+ stp w12,w13,[$len]
+#else
+ stp $d0,$d1,[$len]
+#endif
+
+ mov x0,#1
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+ ands $len,$len,#-16
+ b.eq .Lno_data
+
+ ldp $h0,$h1,[$ctx] // load hash value
+ ldp $r0,$r1,[$ctx,#32] // load key value
+ ldr $h2,[$ctx,#16]
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ b .Loop
+
+.align 5
+.Loop:
+ ldp $t0,$t1,[$inp],#16 // load input
+ sub $len,$len,#16
+#ifdef __ARMEB__
+ rev $t0,$t0
+ rev $t1,$t1
+#endif
+ adds $h0,$h0,$t0 // accumulate input
+ adcs $h1,$h1,$t1
+
+ mul $d0,$h0,$r0 // h0*r0
+ adc $h2,$h2,$padbit
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ cbnz $len,.Loop
+
+ stp $h0,$h1,[$ctx] // store hash value
+ str $h2,[$ctx,#16]
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+ ldp $h0,$h1,[$ctx] // load hash base 2^64
+ ldr $h2,[$ctx,#16]
+ ldp $t0,$t1,[$nonce] // load nonce
+
+ adds $d0,$h0,#5 // compare to modulus
+ adcs $d1,$h1,xzr
+ adc $d2,$h2,xzr
+
+ tst $d2,#-4 // see if it's carried/borrowed
+
+ csel $h0,$h0,$d0,eq
+ csel $h1,$h1,$d1,eq
+
+#ifdef __ARMEB__
+ ror $t0,$t0,#32 // flip nonce words
+ ror $t1,$t1,#32
+#endif
+ adds $h0,$h0,$t0 // accumulate nonce
+ adc $h1,$h1,$t1
+#ifdef __ARMEB__
+ rev $h0,$h0 // flip output bytes
+ rev $h1,$h1
+#endif
+ stp $h0,$h1,[$mac] // write result
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+___
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
+my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
+my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
+my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
+my ($T0,$T1,$MASK) = map("v$_",(29..31));
+
+my ($in2,$zeros)=("x16","x17");
+my $is_base2_26 = $zeros; # borrow
+
+$code.=<<___;
+.type poly1305_mult,%function
+.align 5
+poly1305_mult:
+ mul $d0,$h0,$r0 // h0*r0
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ ret
+.size poly1305_mult,.-poly1305_mult
+
+.type poly1305_splat,%function
+.align 5
+poly1305_splat:
+ and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,$h0,#26,#26
+ extr x14,$h1,$h0,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,$h1,#14,#26
+ extr x16,$h2,$h1,#40
+
+ str w12,[$ctx,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[$ctx,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[$ctx,#16*2] // s1
+ str w14,[$ctx,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[$ctx,#16*4] // s2
+ str w15,[$ctx,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[$ctx,#16*6] // s3
+ str w16,[$ctx,#16*7] // r4
+ str w15,[$ctx,#16*8] // s4
+
+ ret
+.size poly1305_splat,.-poly1305_splat
+
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+ ldr $is_base2_26,[$ctx,#24]
+ cmp $len,#128
+ b.hs .Lblocks_neon
+ cbz $is_base2_26,poly1305_blocks
+
+.Lblocks_neon:
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ ands $len,$len,#-16
+ b.eq .Lno_data_neon
+
+ cbz $is_base2_26,.Lbase2_64_neon
+
+ ldp w10,w11,[$ctx] // load hash value base 2^26
+ ldp w12,w13,[$ctx,#8]
+ ldr w14,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Leven_neon
+
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr $h1,x12,#12
+ adds $h0,$h0,x12,lsl#52
+ add $h1,$h1,x13,lsl#14
+ adc $h1,$h1,xzr
+ lsr $h2,x14,#24
+ adds $h1,$h1,x14,lsl#40
+ adc $d2,$h2,xzr // can be partially reduced...
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+
+ and $t0,$d2,#-4 // ... so reduce
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$h0,$t0
+ adcs $h1,$h1,xzr
+ adc $h2,$h2,xzr
+
+#ifdef __ARMEB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+ ldr x30,[sp,#8]
+
+ cbz $padbit,.Lstore_base2_64_neon
+
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ cbnz $len,.Leven_neon
+
+ stp w10,w11,[$ctx] // store hash value base 2^26
+ stp w12,w13,[$ctx,#8]
+ str w14,[$ctx,#16]
+ b .Lno_data_neon
+
+.align 4
+.Lstore_base2_64_neon:
+ stp $h0,$h1,[$ctx] // store hash value base 2^64
+ stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed
+ b .Lno_data_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ ldp $h0,$h1,[$ctx] // load hash value base 2^64
+ ldr $h2,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Linit_neon
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __ARMEB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+.Linit_neon:
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+ ////////////////////////////////// initialize r^n table
+ mov $h0,$r0 // r^1
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov $h1,$r1
+ mov $h2,xzr
+ add $ctx,$ctx,#48+12
+ bl poly1305_splat
+
+ bl poly1305_mult // r^2
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^3
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^4
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+ ldr x30,[sp,#8]
+
+ add $in2,$inp,#32
+ adr $zeros,.Lzeros
+ subs $len,$len,#64
+ csel $in2,$zeros,$in2,lo
+
+ mov x4,#1
+ str x4,[$ctx,#-24] // set is_base2_26
+ sub $ctx,$ctx,#48 // restore original $ctx
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ add $in2,$inp,#32
+ adr $zeros,.Lzeros
+ subs $len,$len,#64
+ csel $in2,$zeros,$in2,lo
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+.Ldo_neon:
+ ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
+ ldp x9,x13,[$in2],#48
+
+ lsl $padbit,$padbit,#24
+ add x15,$ctx,#48
+
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN23_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN23_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov $IN23_2,x8
+ fmov $IN23_3,x10
+ fmov $IN23_4,x12
+
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ ldp x9,x13,[$inp],#48
+
+ ld1 {$R0,$R1,$S1,$R2},[x15],#64
+ ld1 {$S2,$R3,$S3,$R4},[x15],#64
+ ld1 {$S4},[x15]
+
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN01_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN01_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi $MASK.2d,#-1
+ fmov $IN01_2,x8
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+ ushr $MASK.2d,$MASK.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // \___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // \___________________/ \____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on reduction in previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs $len,$len,#64
+ umull $ACC4,$IN23_0,${R4}[2]
+ csel $in2,$zeros,$in2,lo
+ umull $ACC3,$IN23_0,${R3}[2]
+ umull $ACC2,$IN23_0,${R2}[2]
+ ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
+ umull $ACC1,$IN23_0,${R1}[2]
+ ldp x9,x13,[$in2],#48
+ umull $ACC0,$IN23_0,${R0}[2]
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal $ACC4,$IN23_1,${R3}[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC3,$IN23_1,${R2}[2]
+ and x5,x9,#0x03ffffff
+ umlal $ACC2,$IN23_1,${R1}[2]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN23_1,${R0}[2]
+ ubfx x7,x9,#26,#26
+ umlal $ACC0,$IN23_1,${S4}[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal $ACC4,$IN23_2,${R2}[2]
+ extr x8,x12,x8,#52
+ umlal $ACC3,$IN23_2,${R1}[2]
+ extr x9,x13,x9,#52
+ umlal $ACC2,$IN23_2,${R0}[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC1,$IN23_2,${S4}[2]
+ fmov $IN23_0,x4
+ umlal $ACC0,$IN23_2,${S3}[2]
+ and x8,x8,#0x03ffffff
+
+ umlal $ACC4,$IN23_3,${R1}[2]
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN23_3,${R0}[2]
+ ubfx x10,x12,#14,#26
+ umlal $ACC2,$IN23_3,${S4}[2]
+ ubfx x11,x13,#14,#26
+ umlal $ACC1,$IN23_3,${S3}[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC0,$IN23_3,${S2}[2]
+ fmov $IN23_1,x6
+
+ add $IN01_2,$IN01_2,$H2
+ add x12,$padbit,x12,lsr#40
+ umlal $ACC4,$IN23_4,${R0}[2]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC3,$IN23_4,${S4}[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC2,$IN23_4,${S3}[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN23_4,${S2}[2]
+ fmov $IN23_2,x8
+ umlal $ACC0,$IN23_4,${S1}[2]
+ fmov $IN23_3,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ fmov $IN23_4,x12
+ umlal $ACC3,$IN01_2,${R1}[0]
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ umlal $ACC0,$IN01_2,${S3}[0]
+ ldp x9,x13,[$inp],#48
+ umlal $ACC4,$IN01_2,${R2}[0]
+ umlal $ACC1,$IN01_2,${S4}[0]
+ umlal $ACC2,$IN01_2,${R0}[0]
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}[0]
+ umlal $ACC4,$IN01_0,${R4}[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC2,$IN01_0,${R2}[0]
+ and x5,x9,#0x03ffffff
+ umlal $ACC0,$IN01_0,${R0}[0]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN01_0,${R1}[0]
+ ubfx x7,x9,#26,#26
+
+ add $IN01_3,$IN01_3,$H3
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal $ACC3,$IN01_1,${R2}[0]
+ extr x8,x12,x8,#52
+ umlal $ACC4,$IN01_1,${R3}[0]
+ extr x9,x13,x9,#52
+ umlal $ACC0,$IN01_1,${S4}[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC2,$IN01_1,${R1}[0]
+ fmov $IN01_0,x4
+ umlal $ACC1,$IN01_1,${R0}[0]
+ and x8,x8,#0x03ffffff
+
+ add $IN01_4,$IN01_4,$H4
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN01_3,${R0}[0]
+ ubfx x10,x12,#14,#26
+ umlal $ACC0,$IN01_3,${S2}[0]
+ ubfx x11,x13,#14,#26
+ umlal $ACC4,$IN01_3,${R1}[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC1,$IN01_3,${S3}[0]
+ fmov $IN01_1,x6
+ umlal $ACC2,$IN01_3,${S4}[0]
+ add x12,$padbit,x12,lsr#40
+
+ umlal $ACC3,$IN01_4,${S4}[0]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC0,$IN01_4,${S1}[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC4,$IN01_4,${R0}[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN01_4,${S2}[0]
+ fmov $IN01_2,x8
+ umlal $ACC2,$IN01_4,${S3}[0]
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
+
+ ushr $T0.2d,$ACC3,#26
+ xtn $H3,$ACC3
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ bic $H3,#0xfc,lsl#24 // &=0x03ffffff
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ xtn $H4,$ACC4
+ ushr $T1.2d,$ACC1,#26
+ xtn $H1,$ACC1
+ bic $H4,#0xfc,lsl#24
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ shrn $T1.2s,$ACC2,#26
+ xtn $H2,$ACC2
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ bic $H1,#0xfc,lsl#24
+ add $H3,$H3,$T1.2s // h2 -> h3
+ bic $H2,#0xfc,lsl#24
+
+ shrn $T0.2s,$ACC0,#26
+ xtn $H0,$ACC0
+ ushr $T1.2s,$H3,#26
+ bic $H3,#0xfc,lsl#24
+ bic $H0,#0xfc,lsl#24
+ add $H1,$H1,$T0.2s // h0 -> h1
+ add $H4,$H4,$T1.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup $IN23_2,${IN23_2}[0]
+ add $IN01_2,$IN01_2,$H2
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds $len,$len,#32
+ b.ne .Long_tail
+
+ dup $IN23_2,${IN01_2}[0]
+ add $IN23_0,$IN01_0,$H0
+ add $IN23_3,$IN01_3,$H3
+ add $IN23_1,$IN01_1,$H1
+ add $IN23_4,$IN01_4,$H4
+
+.Long_tail:
+ dup $IN23_0,${IN23_0}[0]
+ umull2 $ACC0,$IN23_2,${S3}
+ umull2 $ACC3,$IN23_2,${R1}
+ umull2 $ACC4,$IN23_2,${R2}
+ umull2 $ACC2,$IN23_2,${R0}
+ umull2 $ACC1,$IN23_2,${S4}
+
+ dup $IN23_1,${IN23_1}[0]
+ umlal2 $ACC0,$IN23_0,${R0}
+ umlal2 $ACC2,$IN23_0,${R2}
+ umlal2 $ACC3,$IN23_0,${R3}
+ umlal2 $ACC4,$IN23_0,${R4}
+ umlal2 $ACC1,$IN23_0,${R1}
+
+ dup $IN23_3,${IN23_3}[0]
+ umlal2 $ACC0,$IN23_1,${S4}
+ umlal2 $ACC3,$IN23_1,${R2}
+ umlal2 $ACC2,$IN23_1,${R1}
+ umlal2 $ACC4,$IN23_1,${R3}
+ umlal2 $ACC1,$IN23_1,${R0}
+
+ dup $IN23_4,${IN23_4}[0]
+ umlal2 $ACC3,$IN23_3,${R0}
+ umlal2 $ACC4,$IN23_3,${R1}
+ umlal2 $ACC0,$IN23_3,${S2}
+ umlal2 $ACC1,$IN23_3,${S3}
+ umlal2 $ACC2,$IN23_3,${S4}
+
+ umlal2 $ACC3,$IN23_4,${S4}
+ umlal2 $ACC0,$IN23_4,${S1}
+ umlal2 $ACC4,$IN23_4,${R0}
+ umlal2 $ACC1,$IN23_4,${S2}
+ umlal2 $ACC2,$IN23_4,${S3}
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ umlal $ACC3,$IN01_2,${R1}
+ umlal $ACC0,$IN01_2,${S3}
+ umlal $ACC4,$IN01_2,${R2}
+ umlal $ACC1,$IN01_2,${S4}
+ umlal $ACC2,$IN01_2,${R0}
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}
+ umlal $ACC0,$IN01_0,${R0}
+ umlal $ACC4,$IN01_0,${R4}
+ umlal $ACC1,$IN01_0,${R1}
+ umlal $ACC2,$IN01_0,${R2}
+
+ add $IN01_3,$IN01_3,$H3
+ umlal $ACC3,$IN01_1,${R2}
+ umlal $ACC0,$IN01_1,${S4}
+ umlal $ACC4,$IN01_1,${R3}
+ umlal $ACC1,$IN01_1,${R0}
+ umlal $ACC2,$IN01_1,${R1}
+
+ add $IN01_4,$IN01_4,$H4
+ umlal $ACC3,$IN01_3,${R0}
+ umlal $ACC0,$IN01_3,${S2}
+ umlal $ACC4,$IN01_3,${R1}
+ umlal $ACC1,$IN01_3,${S3}
+ umlal $ACC2,$IN01_3,${S4}
+
+ umlal $ACC3,$IN01_4,${S4}
+ umlal $ACC0,$IN01_4,${S1}
+ umlal $ACC4,$IN01_4,${R0}
+ umlal $ACC1,$IN01_4,${S2}
+ umlal $ACC2,$IN01_4,${S3}
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp $ACC3,$ACC3,$ACC3
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp $ACC0,$ACC0,$ACC0
+ ldp d10,d11,[sp,#32]
+ addp $ACC4,$ACC4,$ACC4
+ ldp d12,d13,[sp,#48]
+ addp $ACC1,$ACC1,$ACC1
+ ldp d14,d15,[sp,#64]
+ addp $ACC2,$ACC2,$ACC2
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr $T0.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ and $ACC4,$ACC4,$MASK.2d
+ ushr $T1.2d,$ACC1,#26
+ and $ACC1,$ACC1,$MASK.2d
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ ushr $T1.2d,$ACC2,#26
+ and $ACC2,$ACC2,$MASK.2d
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ add $ACC3,$ACC3,$T1.2d // h2 -> h3
+
+ ushr $T0.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ ushr $T1.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ add $ACC1,$ACC1,$T0.2d // h0 -> h1
+ add $ACC4,$ACC4,$T1.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
+ st1 {$ACC4}[0],[$ctx]
+
+.Lno_data_neon:
+ ldr x29,[sp],#80
+ ret
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.type poly1305_emit_neon,%function
+.align 5
+poly1305_emit_neon:
+ ldr $is_base2_26,[$ctx,#24]
+ cbz $is_base2_26,poly1305_emit
+
+ ldp w10,w11,[$ctx] // load hash value base 2^26
+ ldp w12,w13,[$ctx,#8]
+ ldr w14,[$ctx,#16]
+
+ add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr $h1,x12,#12
+ adds $h0,$h0,x12,lsl#52
+ add $h1,$h1,x13,lsl#14
+ adc $h1,$h1,xzr
+ lsr $h2,x14,#24
+ adds $h1,$h1,x14,lsl#40
+ adc $h2,$h2,xzr // can be partially reduced...
+
+ ldp $t0,$t1,[$nonce] // load nonce
+
+ and $d0,$h2,#-4 // ... so reduce
+ add $d0,$d0,$h2,lsr#2
+ and $h2,$h2,#3
+ adds $h0,$h0,$d0
+ adcs $h1,$h1,xzr
+ adc $h2,$h2,xzr
+
+ adds $d0,$h0,#5 // compare to modulus
+ adcs $d1,$h1,xzr
+ adc $d2,$h2,xzr
+
+ tst $d2,#-4 // see if it's carried/borrowed
+
+ csel $h0,$h0,$d0,eq
+ csel $h1,$h1,$d1,eq
+
+#ifdef __ARMEB__
+ ror $t0,$t0,#32 // flip nonce words
+ ror $t1,$t1,#32
+#endif
+ adds $h0,$h0,$t0 // accumulate nonce
+ adc $h1,$h1,$t1
+#ifdef __ARMEB__
+ rev $h0,$h0 // flip output bytes
+ rev $h1,$h1
+#endif
+ stp $h0,$h1,[$mac] // write result
+
+ ret
+.size poly1305_emit_neon,.-poly1305_emit_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
+.LOPENSSL_armcap_P:
+#ifdef __ILP32__
+.long OPENSSL_armcap_P-.
+#else
+.quad OPENSSL_armcap_P-.
+#endif
+.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
+ s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
+ (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
+ (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
+ (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
+ (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
+ (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
+
+ s/\.[124]([sd])\[/.$1\[/;
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/poly1305/asm/poly1305-c64xplus.pl b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-c64xplus.pl
new file mode 100755
index 0000000..93fef37
--- /dev/null
+++ b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-c64xplus.pl
@@ -0,0 +1,331 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Poly1305 hash for C64x+.
+#
+# October 2015
+#
+# Performance is [incredible for a 32-bit processor] 1.82 cycles per
+# processed byte. Comparison to compiler-generated code is problematic,
+# because results were observed to vary from 2.1 to 7.6 cpb depending
+# on compiler's ability to inline small functions. Compiler also
+# disables interrupts for some reason, thus making interrupt response
+# time dependent on input length. This module on the other hand is free
+# from such limitation.
+
+$output=pop;
+open STDOUT,">$output";
+
+($CTXA,$INPB,$LEN,$PADBIT)=("A4","B4","A6","B6");
+($H0,$H1,$H2,$H3,$H4,$H4a)=("A8","B8","A10","B10","B2",$LEN);
+($D0,$D1,$D2,$D3)= ("A9","B9","A11","B11");
+($R0,$R1,$R2,$R3,$S1,$S2,$S3,$S3b)=("A0","B0","A1","B1","A12","B12","A13","B13");
+($THREE,$R0b,$S2a)=("B7","B5","A5");
+
+$code.=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .asg poly1305_init,_poly1305_init
+ .asg poly1305_blocks,_poly1305_blocks
+ .asg poly1305_emit,_poly1305_emit
+ .endif
+
+ .asg B3,RA
+ .asg A15,FP
+ .asg B15,SP
+
+ .if .LITTLE_ENDIAN
+ .asg MV,SWAP2
+ .asg MV.L,SWAP4
+ .endif
+
+ .global _poly1305_init
+_poly1305_init:
+ .asmfunc
+ LDNDW *${INPB}[0],B17:B16 ; load key material
+ LDNDW *${INPB}[1],A17:A16
+
+|| ZERO B9:B8
+|| MVK -1,B0
+ STDW B9:B8,*${CTXA}[0] ; initialize h1:h0
+|| SHRU B0,4,B0 ; 0x0fffffff
+|| MVK -4,B1
+ STDW B9:B8,*${CTXA}[1] ; initialize h3:h2
+|| AND B0,B1,B1 ; 0x0ffffffc
+ STW B8,*${CTXA}[4] ; initialize h4
+
+ .if .BIG_ENDIAN
+ SWAP2 B16,B17
+|| SWAP2 B17,B16
+ SWAP2 A16,A17
+|| SWAP2 A17,A16
+ SWAP4 B16,B16
+|| SWAP4 A16,A16
+ SWAP4 B17,B17
+|| SWAP4 A17,A17
+ .endif
+
+ AND B16,B0,B20 ; r0 = key[0] & 0x0fffffff
+|| AND B17,B1,B22 ; r1 = key[1] & 0x0ffffffc
+|| EXTU B17,4,6,B16 ; r1>>2
+ AND A16,B1,B21 ; r2 = key[2] & 0x0ffffffc
+|| AND A17,B1,A23 ; r3 = key[3] & 0x0ffffffc
+|| BNOP RA
+ SHRU B21,2,B18
+|| ADD B22,B16,B16 ; s1 = r1 + r1>>2
+
+ STDW B21:B20,*${CTXA}[3] ; save r2:r0
+|| ADD B21,B18,B18 ; s2 = r2 + r2>>2
+|| SHRU A23,2,B17
+|| MV A23,B23
+ STDW B23:B22,*${CTXA}[4] ; save r3:r1
+|| ADD B23,B17,B19 ; s3 = r3 + r3>>2
+|| ADD B23,B17,B17 ; s3 = r3 + r3>>2
+ STDW B17:B16,*${CTXA}[5] ; save s3:s1
+ STDW B19:B18,*${CTXA}[6] ; save s3:s2
+|| ZERO A4 ; return 0
+ .endasmfunc
+
+ .global _poly1305_blocks
+ .align 32
+_poly1305_blocks:
+ .asmfunc stack_usage(40)
+ SHRU $LEN,4,A2 ; A2 is loop counter, number of blocks
+ [!A2] BNOP RA ; no data
+|| [A2] STW FP,*SP--(40) ; save frame pointer and alloca(40)
+|| [A2] MV SP,FP
+ [A2] STDW B13:B12,*SP[4] ; ABI says so
+|| [A2] MV $CTXA,$S3b ; borrow $S3b
+ [A2] STDW B11:B10,*SP[3]
+|| [A2] STDW A13:A12,*FP[-3]
+ [A2] STDW A11:A10,*FP[-4]
+
+|| [A2] LDDW *${S3b}[0],B25:B24 ; load h1:h0
+ [A2] LDNW *${INPB}++[4],$D0 ; load inp[0]
+ [A2] LDNW *${INPB}[-3],$D1 ; load inp[1]
+
+ LDDW *${CTXA}[1],B29:B28 ; load h3:h2, B28 is h2
+ LDNW *${INPB}[-2],$D2 ; load inp[2]
+ LDNW *${INPB}[-1],$D3 ; load inp[3]
+
+ LDDW *${CTXA}[3],$R2:$R0 ; load r2:r0
+|| LDDW *${S3b}[4],$R3:$R1 ; load r3:r1
+|| SWAP2 $D0,$D0
+
+ LDDW *${CTXA}[5],$S3:$S1 ; load s3:s1
+|| LDDW *${S3b}[6],$S3b:$S2 ; load s3:s2
+|| SWAP4 $D0,$D0
+|| SWAP2 $D1,$D1
+
+ ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
+|| ADD $D0,B24,B27 ; B-copy of h0+inp[0]
+|| SWAP4 $D1,$D1
+ ADDU $D1,B25,$D1:$H1 ; h1+=inp[1]
+|| MVK 3,$THREE
+|| SWAP2 $D2,$D2
+ LDW *${CTXA}[4],$H4 ; load h4
+|| SWAP4 $D2,$D2
+|| MV B29,B30 ; B30 is h3
+ MV $R0,$R0b
+
+loop?:
+ MPY32U $H0,$R0,A17:A16
+|| MPY32U B27,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
+|| ADDU $D0,$D1:$H1,B25:B24 ; ADDU $D0,$D1:$H1,$D1:$H1
+|| ADDU $D2,B28,$D2:$H2 ; h2+=inp[2]
+|| SWAP2 $D3,$D3
+ MPY32U $H0,$R2,A19:A18
+|| MPY32U B27,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
+|| ADD $D0,$H1,A24 ; A-copy of B24
+|| SWAP4 $D3,$D3
+|| [A2] SUB A2,1,A2 ; decrement loop counter
+
+ MPY32U A24,$S3,A21:A20 ; MPY32U $H1,$S3,A21:A20
+|| MPY32U B24,$R0b,B21:B20 ; MPY32U $H1,$R0,B21:B20
+|| ADDU B25,$D2:$H2,$D2:$H2 ; ADDU $D1,$D2:$H2,$D2:$H2
+|| ADDU $D3,B30,$D3:$H3 ; h3+=inp[3]
+|| ADD B25,$H2,B25 ; B-copy of $H2
+ MPY32U A24,$R1,A23:A22 ; MPY32U $H1,$R1,A23:A22
+|| MPY32U B24,$R2,B23:B22 ; MPY32U $H1,$R2,B23:B22
+
+ MPY32U $H2,$S2,A25:A24
+|| MPY32U B25,$S3b,B25:B24 ; MPY32U $H2,$S3,B25:B24
+|| ADDU $D2,$D3:$H3,$D3:$H3
+|| ADD $PADBIT,$H4,$H4 ; h4+=padbit
+ MPY32U $H2,$R0,A27:A26
+|| MPY32U $H2,$R1,B27:B26
+|| ADD $D3,$H4,$H4
+|| MV $S2,$S2a
+
+ MPY32U $H3,$S1,A29:A28
+|| MPY32U $H3,$S2,B29:B28
+|| ADD A21,A17,A21 ; start accumulating "d3:d0"
+|| ADD B21,B17,B21
+|| ADDU A20,A16,A17:A16
+|| ADDU B20,B16,B17:B16
+|| [A2] LDNW *${INPB}++[4],$D0 ; load inp[0]
+ MPY32U $H3,$S3,A31:A30
+|| MPY32U $H3,$R0b,B31:B30
+|| ADD A23,A19,A23
+|| ADD B23,B19,B23
+|| ADDU A22,A18,A19:A18
+|| ADDU B22,B18,B19:B18
+|| [A2] LDNW *${INPB}[-3],$D1 ; load inp[1]
+
+ MPY32 $H4,$S1,B20
+|| MPY32 $H4,$S2a,A20
+|| ADD A25,A21,A21
+|| ADD B25,B21,B21
+|| ADDU A24,A17:A16,A17:A16
+|| ADDU B24,B17:B16,B17:B16
+|| [A2] LDNW *${INPB}[-2],$D2 ; load inp[2]
+ MPY32 $H4,$S3b,B22
+|| ADD A27,A23,A23
+|| ADD B27,B23,B23
+|| ADDU A26,A19:A18,A19:A18
+|| ADDU B26,B19:B18,B19:B18
+|| [A2] LDNW *${INPB}[-1],$D3 ; load inp[3]
+
+ MPY32 $H4,$R0b,$H4
+|| ADD A29,A21,A21 ; final hi("d0")
+|| ADD B29,B21,B21 ; final hi("d1")
+|| ADDU A28,A17:A16,A17:A16 ; final lo("d0")
+|| ADDU B28,B17:B16,B17:B16
+ ADD A31,A23,A23 ; final hi("d2")
+|| ADD B31,B23,B23 ; final hi("d3")
+|| ADDU A30,A19:A18,A19:A18
+|| ADDU B30,B19:B18,B19:B18
+ ADDU B20,B17:B16,B17:B16 ; final lo("d1")
+|| ADDU A20,A19:A18,A19:A18 ; final lo("d2")
+ ADDU B22,B19:B18,B19:B18 ; final lo("d3")
+
+|| ADD A17,A21,A21 ; "flatten" "d3:d0"
+ MV A19,B29 ; move to avoid cross-path stalls
+ ADDU A21,B17:B16,B27:B26 ; B26 is h1
+ ADD B21,B27,B27
+|| DMV B29,A18,B29:B28 ; move to avoid cross-path stalls
+ ADDU B27,B29:B28,B29:B28 ; B28 is h2
+|| [A2] SWAP2 $D0,$D0
+ ADD A23,B29,B29
+|| [A2] SWAP4 $D0,$D0
+ ADDU B29,B19:B18,B31:B30 ; B30 is h3
+ ADD B23,B31,B31
+|| MV A16,B24 ; B24 is h0
+|| [A2] SWAP2 $D1,$D1
+ ADD B31,$H4,$H4
+|| [A2] SWAP4 $D1,$D1
+
+ SHRU $H4,2,B16 ; last reduction step
+|| AND $H4,$THREE,$H4
+ ADDAW B16,B16,B16 ; 5*(h4>>2)
+|| [A2] BNOP loop?
+
+ ADDU B24,B16,B25:B24 ; B24 is h0
+|| [A2] SWAP2 $D2,$D2
+ ADDU B26,B25,B27:B26 ; B26 is h1
+|| [A2] SWAP4 $D2,$D2
+ ADDU B28,B27,B29:B28 ; B28 is h2
+|| [A2] ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
+|| [A2] ADD $D0,B24,B27 ; B-copy of h0+inp[0]
+ ADDU B30,B29,B31:B30 ; B30 is h3
+ ADD B31,$H4,$H4
+|| [A2] ADDU $D1,B26,$D1:$H1 ; h1+=inp[1]
+;;===== branch to loop? is taken here
+
+ LDDW *FP[-4],A11:A10 ; ABI says so
+ LDDW *FP[-3],A13:A12
+|| LDDW *SP[3],B11:B10
+ LDDW *SP[4],B13:B12
+|| MV B26,B25
+|| BNOP RA
+ LDW *++SP(40),FP ; restore frame pointer
+|| MV B30,B29
+ STDW B25:B24,*${CTXA}[0] ; save h1:h0
+ STDW B29:B28,*${CTXA}[1] ; save h3:h2
+ STW $H4,*${CTXA}[4] ; save h4
+ NOP 1
+ .endasmfunc
+___
+{
+my ($MAC,$NONCEA,$NONCEB)=($INPB,$LEN,$PADBIT);
+
+$code.=<<___;
+ .global _poly1305_emit
+ .align 32
+_poly1305_emit:
+ .asmfunc
+ LDDW *${CTXA}[0],A17:A16 ; load h1:h0
+ LDDW *${CTXA}[1],A19:A18 ; load h3:h2
+ LDW *${CTXA}[4],A20 ; load h4
+ MV $NONCEA,$NONCEB
+
+ MVK 5,A22 ; compare to modulus
+ ADDU A16,A22,A23:A22
+|| LDW *${NONCEA}[0],A8
+|| LDW *${NONCEB}[1],B8
+ ADDU A17,A23,A25:A24
+|| LDW *${NONCEA}[2],A9
+|| LDW *${NONCEB}[3],B9
+ ADDU A19,A25,A27:A26
+ ADDU A19,A27,A29:A28
+ ADD A20,A29,A29
+
+ SHRU A29,2,A2 ; check for overflow in 130-th bit
+
+ [A2] MV A22,A16 ; select
+|| [A2] MV A24,A17
+ [A2] MV A26,A18
+|| [A2] MV A28,A19
+
+|| ADDU A8,A16,A23:A22 ; accumulate nonce
+ ADDU B8,A17,A25:A24
+|| SWAP2 A22,A22
+ ADDU A23,A25:A24,A25:A24
+ ADDU A9,A18,A27:A26
+|| SWAP2 A24,A24
+ ADDU A25,A27:A26,A27:A26
+|| ADD B9,A19,A28
+ ADD A27,A28,A28
+|| SWAP2 A26,A26
+
+ .if .BIG_ENDIAN
+ SWAP2 A28,A28
+|| SWAP4 A22,A22
+|| SWAP4 A24,B24
+ SWAP4 A26,A26
+ SWAP4 A28,A28
+|| MV B24,A24
+ .endif
+
+ BNOP RA,1
+ STNW A22,*${MAC}[0] ; write the result
+ STNW A24,*${MAC}[1]
+ STNW A26,*${MAC}[2]
+ STNW A28,*${MAC}[3]
+ .endasmfunc
+___
+}
+$code.=<<___;
+ .sect .const
+ .cstring "Poly1305 for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+print $code;
diff --git a/openssl-1.1.0h/crypto/poly1305/asm/poly1305-mips.pl b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-mips.pl
new file mode 100755
index 0000000..d2b3e90
--- /dev/null
+++ b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-mips.pl
@@ -0,0 +1,425 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# Poly1305 hash for MIPS64.
+#
+# May 2016
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc
+# R1x000 5.64/+120% (big-endian)
+# Octeon II 3.80/+280% (little-endian)
+
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+# excluded from the rule, because it's specified volatile];
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
+
+die "MIPS64 only" unless ($flavour =~ /64|n32/i);
+
+$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
+
+($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
+
+$code.=<<___;
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 7
+#else
+# define MSB 7
+# define LSB 0
+#endif
+
+.text
+.set noat
+.set noreorder
+
+.align 5
+.globl poly1305_init
+.ent poly1305_init
+poly1305_init:
+ .frame $sp,0,$ra
+ .set reorder
+
+ sd $zero,0($ctx)
+ sd $zero,8($ctx)
+ sd $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+ ldl $in0,0+MSB($inp)
+ ldl $in1,8+MSB($inp)
+ ldr $in0,0+LSB($inp)
+ ldr $in1,8+LSB($inp)
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+ dsbh $in0,$in0 # byte swap
+ dsbh $in1,$in1
+ dshd $in0,$in0
+ dshd $in1,$in1
+# else
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+
+ and $tmp1,$in0,$tmp0 # byte swap
+ and $tmp3,$in1,$tmp0
+ dsrl $tmp2,$in0,24
+ dsrl $tmp4,$in1,24
+ dsll $tmp1,24
+ dsll $tmp3,24
+ and $tmp2,$tmp0
+ and $tmp4,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ and $tmp2,$in0,$tmp0
+ and $tmp4,$in1,$tmp0
+ dsrl $in0,8
+ dsrl $in1,8
+ dsll $tmp2,8
+ dsll $tmp4,8
+ and $in0,$tmp0
+ and $in1,$tmp0
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ or $in0,$tmp1
+ or $in1,$tmp3
+ dsrl $tmp1,$in0,32
+ dsrl $tmp3,$in1,32
+ dsll $in0,32
+ dsll $in1,32
+ or $in0,$tmp1
+ or $in1,$tmp3
+# endif
+#endif
+ li $tmp0,1
+ dsll $tmp0,32
+ daddiu $tmp0,-63
+ dsll $tmp0,28
+ daddiu $tmp0,-1 # 0ffffffc0fffffff
+
+ and $in0,$tmp0
+ daddiu $tmp0,-3 # 0ffffffc0ffffffc
+ and $in1,$tmp0
+
+ sd $in0,24($ctx)
+ dsrl $tmp0,$in1,2
+ sd $in1,32($ctx)
+ daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
+ sd $tmp0,40($ctx)
+
+.Lno_key:
+ li $v0,0 # return 0
+ jr $ra
+.end poly1305_init
+___
+{
+my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) =
+ ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
+
+$code.=<<___;
+.align 5
+.globl poly1305_blocks
+.ent poly1305_blocks
+poly1305_blocks:
+ .set noreorder
+ dsrl $len,4 # number of complete blocks
+ bnez $len,poly1305_blocks_internal
+ nop
+ jr $ra
+ nop
+.end poly1305_blocks
+
+.align 5
+.ent poly1305_blocks_internal
+poly1305_blocks_internal:
+ .frame $sp,6*8,$ra
+ .mask $SAVED_REGS_MASK,-8
+ .set noreorder
+ dsub $sp,6*8
+ sd $s5,40($sp)
+ sd $s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ sd $s3,24($sp)
+ sd $s2,16($sp)
+ sd $s1,8($sp)
+ sd $s0,0($sp)
+___
+$code.=<<___;
+ .set reorder
+
+ ld $h0,0($ctx) # load hash value
+ ld $h1,8($ctx)
+ ld $h2,16($ctx)
+
+ ld $r0,24($ctx) # load key
+ ld $r1,32($ctx)
+ ld $s1,40($ctx)
+
+.Loop:
+ ldl $in0,0+MSB($inp) # load input
+ ldl $in1,8+MSB($inp)
+ ldr $in0,0+LSB($inp)
+ daddiu $len,-1
+ ldr $in1,8+LSB($inp)
+ daddiu $inp,16
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+ dsbh $in0,$in0 # byte swap
+ dsbh $in1,$in1
+ dshd $in0,$in0
+ dshd $in1,$in1
+# else
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+
+ and $tmp1,$in0,$tmp0 # byte swap
+ and $tmp3,$in1,$tmp0
+ dsrl $tmp2,$in0,24
+ dsrl $tmp4,$in1,24
+ dsll $tmp1,24
+ dsll $tmp3,24
+ and $tmp2,$tmp0
+ and $tmp4,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ and $tmp2,$in0,$tmp0
+ and $tmp4,$in1,$tmp0
+ dsrl $in0,8
+ dsrl $in1,8
+ dsll $tmp2,8
+ dsll $tmp4,8
+ and $in0,$tmp0
+ and $in1,$tmp0
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ or $in0,$tmp1
+ or $in1,$tmp3
+ dsrl $tmp1,$in0,32
+ dsrl $tmp3,$in1,32
+ dsll $in0,32
+ dsll $in1,32
+ or $in0,$tmp1
+ or $in1,$tmp3
+# endif
+#endif
+ daddu $h0,$in0 # accumulate input
+ daddu $h1,$in1
+ sltu $tmp0,$h0,$in0
+ sltu $tmp1,$h1,$in1
+ daddu $h1,$tmp0
+
+ dmultu $r0,$h0 # h0*r0
+ daddu $h2,$padbit
+ sltu $tmp0,$h1,$tmp0
+ mflo $d0
+ mfhi $d1
+
+ dmultu $s1,$h1 # h1*5*r1
+ daddu $tmp0,$tmp1
+ daddu $h2,$tmp0
+ mflo $tmp0
+ mfhi $tmp1
+
+ dmultu $r1,$h0 # h0*r1
+ daddu $d0,$tmp0
+ daddu $d1,$tmp1
+ mflo $tmp2
+ mfhi $d2
+ sltu $tmp0,$d0,$tmp0
+ daddu $d1,$tmp0
+
+ dmultu $r0,$h1 # h1*r0
+ daddu $d1,$tmp2
+ sltu $tmp2,$d1,$tmp2
+ mflo $tmp0
+ mfhi $tmp1
+ daddu $d2,$tmp2
+
+ dmultu $s1,$h2 # h2*5*r1
+ daddu $d1,$tmp0
+ daddu $d2,$tmp1
+ mflo $tmp2
+
+ dmultu $r0,$h2 # h2*r0
+ sltu $tmp0,$d1,$tmp0
+ daddu $d2,$tmp0
+ mflo $tmp3
+
+ daddu $d1,$tmp2
+ daddu $d2,$tmp3
+ sltu $tmp2,$d1,$tmp2
+ daddu $d2,$tmp2
+
+ li $tmp0,-4 # final reduction
+ and $tmp0,$d2
+ dsrl $tmp1,$d2,2
+ andi $h2,$d2,3
+ daddu $tmp0,$tmp1
+ daddu $h0,$d0,$tmp0
+ sltu $tmp0,$h0,$tmp0
+ daddu $h1,$d1,$tmp0
+ sltu $tmp0,$h1,$tmp0
+ daddu $h2,$h2,$tmp0
+
+ bnez $len,.Loop
+
+ sd $h0,0($ctx) # store hash value
+ sd $h1,8($ctx)
+ sd $h2,16($ctx)
+
+ .set noreorder
+ ld $s5,40($sp) # epilogue
+ ld $s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
+ ld $s3,24($sp)
+ ld $s2,16($sp)
+ ld $s1,8($sp)
+ ld $s0,0($sp)
+___
+$code.=<<___;
+ jr $ra
+ dadd $sp,6*8
+.end poly1305_blocks_internal
+___
+}
+{
+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
+
+$code.=<<___;
+.align 5
+.globl poly1305_emit
+.ent poly1305_emit
+poly1305_emit:
+ .frame $sp,0,$ra
+ .set reorder
+
+ ld $tmp0,0($ctx)
+ ld $tmp1,8($ctx)
+ ld $tmp2,16($ctx)
+
+ daddiu $in0,$tmp0,5 # compare to modulus
+ sltiu $tmp3,$in0,5
+ daddu $in1,$tmp1,$tmp3
+ sltu $tmp3,$in1,$tmp3
+ daddu $tmp2,$tmp2,$tmp3
+
+ dsrl $tmp2,2 # see if it carried/borrowed
+ dsubu $tmp2,$zero,$tmp2
+ nor $tmp3,$zero,$tmp2
+
+ and $in0,$tmp2
+ and $tmp0,$tmp3
+ and $in1,$tmp2
+ and $tmp1,$tmp3
+ or $in0,$tmp0
+ or $in1,$tmp1
+
+ lwu $tmp0,0($nonce) # load nonce
+ lwu $tmp1,4($nonce)
+ lwu $tmp2,8($nonce)
+ lwu $tmp3,12($nonce)
+ dsll $tmp1,32
+ dsll $tmp3,32
+ or $tmp0,$tmp1
+ or $tmp2,$tmp3
+
+ daddu $in0,$tmp0 # accumulate nonce
+ daddu $in1,$tmp2
+ sltu $tmp0,$in0,$tmp0
+ daddu $in1,$tmp0
+
+ dsrl $tmp0,$in0,8 # write mac value
+ dsrl $tmp1,$in0,16
+ dsrl $tmp2,$in0,24
+ sb $in0,0($mac)
+ dsrl $tmp3,$in0,32
+ sb $tmp0,1($mac)
+ dsrl $tmp0,$in0,40
+ sb $tmp1,2($mac)
+ dsrl $tmp1,$in0,48
+ sb $tmp2,3($mac)
+ dsrl $tmp2,$in0,56
+ sb $tmp3,4($mac)
+ dsrl $tmp3,$in1,8
+ sb $tmp0,5($mac)
+ dsrl $tmp0,$in1,16
+ sb $tmp1,6($mac)
+ dsrl $tmp1,$in1,24
+ sb $tmp2,7($mac)
+
+ sb $in1,8($mac)
+ dsrl $tmp2,$in1,32
+ sb $tmp3,9($mac)
+ dsrl $tmp3,$in1,40
+ sb $tmp0,10($mac)
+ dsrl $tmp0,$in1,48
+ sb $tmp1,11($mac)
+ dsrl $tmp1,$in1,56
+ sb $tmp2,12($mac)
+ sb $tmp3,13($mac)
+ sb $tmp0,14($mac)
+ sb $tmp1,15($mac)
+
+ jr $ra
+.end poly1305_emit
+.rdata
+.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+}
+
+$output=pop and open STDOUT,">$output";
+print $code;
+close STDOUT;
+
diff --git a/openssl-1.1.0h/crypto/poly1305/asm/poly1305-ppc.pl b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-ppc.pl
new file mode 100755
index 0000000..ab65910
--- /dev/null
+++ b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-ppc.pl
@@ -0,0 +1,644 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for PowerPC.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone,
+# and improvement coefficients relative to gcc-generated code.
+#
+# -m32 -m64
+#
+# Freescale e300 14.8/+80% -
+# PPC74x0 7.60/+60% -
+# PPC970 7.00/+114% 3.51/+205%
+# POWER7 3.75/+260% 1.93/+100%
+# POWER8 - 2.03/+200%
+#
+# Do we need floating-point implementation for PPC? Results presented
+# in poly1305_ieee754.c are tricky to compare to, because they are for
+# compiler-generated code. On the other hand it's known that floating-
+# point performance can be dominated by FPU latency, which means that
+# there is limit even for ideally optimized (and even vectorized) code.
+# And this limit is estimated to be higher than above -m64 results. Or
+# in other words floating-point implementation can be meaningful to
+# consider only in 32-bit application context. We probably have to
+# recognize that 32-bit builds are getting less popular on high-end
+# systems and therefore tend to target embedded ones, which might not
+# even have FPU...
+#
+# On side note, Power ISA 2.07 enables vector base 2^26 implementation,
+# and POWER8 might have capacity to break 1.0 cycle per byte barrier...
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
+ $UCMP ="cmpld";
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T =4;
+ $LRSAVE =$SIZE_T;
+ $UCMP ="cmplw";
+ $STU ="stwu";
+ $POP ="lwz";
+ $PUSH ="stw";
+} else { die "nonsense $flavour"; }
+
+# Define endianness based on flavour
+# i.e.: linux64le
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=24*$SIZE_T;
+
+$sp="r1";
+my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
+my ($mac,$nonce)=($inp,$len);
+my $mask = "r0";
+
+$code=<<___;
+.machine "any"
+.text
+___
+ if ($flavour =~ /64/) {
+###############################################################################
+# base 2^64 implementation
+
+my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));
+
+$code.=<<___;
+.globl .poly1305_init_int
+.align 4
+.poly1305_init_int:
+ xor r0,r0,r0
+ std r0,0($ctx) # zero hash value
+ std r0,8($ctx)
+ std r0,16($ctx)
+
+ $UCMP $inp,r0
+ beq- Lno_key
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ ld $d0,0($inp) # load key material
+ ld $d1,8($inp)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ li $h0,4
+ lwbrx $d0,0,$inp # load key material
+ li $d1,8
+ lwbrx $h0,$h0,$inp
+ li $h1,12
+ lwbrx $d1,$d1,$inp
+ lwbrx $h1,$h1,$inp
+ insrdi $d0,$h0,32,0
+ insrdi $d1,$h1,32,0
+___
+$code.=<<___;
+ lis $h1,0xfff # 0x0fff0000
+ ori $h1,$h1,0xfffc # 0x0ffffffc
+ insrdi $h1,$h1,32,0 # 0x0ffffffc0ffffffc
+ ori $h0,$h1,3 # 0x0ffffffc0fffffff
+
+ and $d0,$d0,$h0
+ and $d1,$d1,$h1
+
+ std $d0,32($ctx) # store key
+ std $d1,40($ctx)
+
+Lno_key:
+ xor r3,r3,r3
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+.size .poly1305_init_int,.-.poly1305_init_int
+
+.globl .poly1305_blocks
+.align 4
+.poly1305_blocks:
+ srdi. $len,$len,4
+ beq- Labort
+
+ $STU $sp,-$FRAME($sp)
+ mflr r0
+ $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
+ $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
+ $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
+ $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
+ $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ ld $r0,32($ctx) # load key
+ ld $r1,40($ctx)
+
+ ld $h0,0($ctx) # load hash value
+ ld $h1,8($ctx)
+ ld $h2,16($ctx)
+
+ srdi $s1,$r1,2
+ mtctr $len
+ add $s1,$s1,$r1 # s1 = r1 + r1>>2
+ li $mask,3
+ b Loop
+
+.align 4
+Loop:
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ ld $t0,0($inp) # load input
+ ld $t1,8($inp)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ li $d0,4
+ lwbrx $t0,0,$inp # load input
+ li $t1,8
+ lwbrx $d0,$d0,$inp
+ li $d1,12
+ lwbrx $t1,$t1,$inp
+ lwbrx $d1,$d1,$inp
+ insrdi $t0,$d0,32,0
+ insrdi $t1,$d1,32,0
+___
+$code.=<<___;
+ addi $inp,$inp,16
+
+ addc $h0,$h0,$t0 # accumulate input
+ adde $h1,$h1,$t1
+
+ mulld $d0,$h0,$r0 # h0*r0
+ mulhdu $d1,$h0,$r0
+ adde $h2,$h2,$padbit
+
+ mulld $t0,$h1,$s1 # h1*5*r1
+ mulhdu $t1,$h1,$s1
+ addc $d0,$d0,$t0
+ adde $d1,$d1,$t1
+
+ mulld $t0,$h0,$r1 # h0*r1
+ mulhdu $d2,$h0,$r1
+ addc $d1,$d1,$t0
+ addze $d2,$d2
+
+ mulld $t0,$h1,$r0 # h1*r0
+ mulhdu $t1,$h1,$r0
+ addc $d1,$d1,$t0
+ adde $d2,$d2,$t1
+
+ mulld $t0,$h2,$s1 # h2*5*r1
+ mulld $t1,$h2,$r0 # h2*r0
+ addc $d1,$d1,$t0
+ adde $d2,$d2,$t1
+
+ andc $t0,$d2,$mask # final reduction step
+ and $h2,$d2,$mask
+ srdi $t1,$t0,2
+ add $t0,$t0,$t1
+ addc $h0,$d0,$t0
+ addze $h1,$d1
+ addze $h2,$h2
+
+ bdnz Loop
+
+ std $h0,0($ctx) # store hash value
+ std $h1,8($ctx)
+ std $h2,16($ctx)
+
+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ addi $sp,$sp,$FRAME
+Labort:
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,5,4,0
+.size .poly1305_blocks,.-.poly1305_blocks
+
+.globl .poly1305_emit
+.align 4
+.poly1305_emit:
+ ld $h0,0($ctx) # load hash
+ ld $h1,8($ctx)
+ ld $h2,16($ctx)
+ ld $padbit,0($nonce) # load nonce
+ ld $nonce,8($nonce)
+
+ addic $d0,$h0,5 # compare to modulus
+ addze $d1,$h1
+ addze $d2,$h2
+
+ srdi $mask,$d2,2 # did it carry/borrow?
+ neg $mask,$mask
+
+ andc $h0,$h0,$mask
+ and $d0,$d0,$mask
+ andc $h1,$h1,$mask
+ and $d1,$d1,$mask
+ or $h0,$h0,$d0
+ or $h1,$h1,$d1
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ rotldi $padbit,$padbit,32 # flip nonce words
+ rotldi $nonce,$nonce,32
+___
+$code.=<<___;
+ addc $h0,$h0,$padbit # accumulate nonce
+ adde $h1,$h1,$nonce
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ std $h0,0($mac) # write result
+ std $h1,8($mac)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ extrdi r0,$h0,32,0
+ li $d0,4
+ stwbrx $h0,0,$mac # write result
+ extrdi $h0,$h1,32,0
+ li $d1,8
+ stwbrx r0,$d0,$mac
+ li $d2,12
+ stwbrx $h1,$d1,$mac
+ stwbrx $h0,$d2,$mac
+___
+$code.=<<___;
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+.size .poly1305_emit,.-.poly1305_emit
+___
+ } else {
+###############################################################################
+# base 2^32 implementation
+
+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
+ $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
+ ) = map("r$_",(7..12,14..31));
+
+$code.=<<___;
+.globl .poly1305_init_int
+.align 4
+.poly1305_init_int:
+ xor r0,r0,r0
+ stw r0,0($ctx) # zero hash value
+ stw r0,4($ctx)
+ stw r0,8($ctx)
+ stw r0,12($ctx)
+ stw r0,16($ctx)
+
+ $UCMP $inp,r0
+ beq- Lno_key
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ lw $h0,0($inp) # load key material
+ lw $h1,4($inp)
+ lw $h2,8($inp)
+ lw $h3,12($inp)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ li $h1,4
+ lwbrx $h0,0,$inp # load key material
+ li $h2,8
+ lwbrx $h1,$h1,$inp
+ li $h3,12
+ lwbrx $h2,$h2,$inp
+ lwbrx $h3,$h3,$inp
+___
+$code.=<<___;
+ lis $mask,0xf000 # 0xf0000000
+ li $r0,-4
+ andc $r0,$r0,$mask # 0x0ffffffc
+
+ andc $h0,$h0,$mask
+ and $h1,$h1,$r0
+ and $h2,$h2,$r0
+ and $h3,$h3,$r0
+
+ stw $h0,32($ctx) # store key
+ stw $h1,36($ctx)
+ stw $h2,40($ctx)
+ stw $h3,44($ctx)
+
+Lno_key:
+ xor r3,r3,r3
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+.size .poly1305_init_int,.-.poly1305_init_int
+
+.globl .poly1305_blocks
+.align 4
+.poly1305_blocks:
+ srwi. $len,$len,4
+ beq- Labort
+
+ $STU $sp,-$FRAME($sp)
+ mflr r0
+ $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
+ $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
+ $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
+ $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
+ $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
+ $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
+ $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
+ $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
+ $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
+ $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
+ $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
+ $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
+ $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
+ $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
+ $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
+ $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
+ $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
+ $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ lwz $r0,32($ctx) # load key
+ lwz $r1,36($ctx)
+ lwz $r2,40($ctx)
+ lwz $r3,44($ctx)
+
+ lwz $h0,0($ctx) # load hash value
+ lwz $h1,4($ctx)
+ lwz $h2,8($ctx)
+ lwz $h3,12($ctx)
+ lwz $h4,16($ctx)
+
+ srwi $s1,$r1,2
+ srwi $s2,$r2,2
+ srwi $s3,$r3,2
+ add $s1,$s1,$r1 # si = ri + ri>>2
+ add $s2,$s2,$r2
+ add $s3,$s3,$r3
+ mtctr $len
+ li $mask,3
+ b Loop
+
+.align 4
+Loop:
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ lwz $d0,0($inp) # load input
+ lwz $d1,4($inp)
+ lwz $d2,8($inp)
+ lwz $d3,12($inp)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ li $d1,4
+ lwbrx $d0,0,$inp # load input
+ li $d2,8
+ lwbrx $d1,$d1,$inp
+ li $d3,12
+ lwbrx $d2,$d2,$inp
+ lwbrx $d3,$d3,$inp
+___
+$code.=<<___;
+ addi $inp,$inp,16
+
+ addc $h0,$h0,$d0 # accumulate input
+ adde $h1,$h1,$d1
+ adde $h2,$h2,$d2
+
+ mullw $d0,$h0,$r0 # h0*r0
+ mulhwu $D0,$h0,$r0
+
+ mullw $d1,$h0,$r1 # h0*r1
+ mulhwu $D1,$h0,$r1
+
+ mullw $d2,$h0,$r2 # h0*r2
+ mulhwu $D2,$h0,$r2
+
+ adde $h3,$h3,$d3
+ adde $h4,$h4,$padbit
+
+ mullw $d3,$h0,$r3 # h0*r3
+ mulhwu $D3,$h0,$r3
+
+ mullw $t0,$h1,$s3 # h1*s3
+ mulhwu $t1,$h1,$s3
+
+ mullw $t2,$h1,$r0 # h1*r0
+ mulhwu $t3,$h1,$r0
+ addc $d0,$d0,$t0
+ adde $D0,$D0,$t1
+
+ mullw $t0,$h1,$r1 # h1*r1
+ mulhwu $t1,$h1,$r1
+ addc $d1,$d1,$t2
+ adde $D1,$D1,$t3
+
+ mullw $t2,$h1,$r2 # h1*r2
+ mulhwu $t3,$h1,$r2
+ addc $d2,$d2,$t0
+ adde $D2,$D2,$t1
+
+ mullw $t0,$h2,$s2 # h2*s2
+ mulhwu $t1,$h2,$s2
+ addc $d3,$d3,$t2
+ adde $D3,$D3,$t3
+
+ mullw $t2,$h2,$s3 # h2*s3
+ mulhwu $t3,$h2,$s3
+ addc $d0,$d0,$t0
+ adde $D0,$D0,$t1
+
+ mullw $t0,$h2,$r0 # h2*r0
+ mulhwu $t1,$h2,$r0
+ addc $d1,$d1,$t2
+ adde $D1,$D1,$t3
+
+ mullw $t2,$h2,$r1 # h2*r1
+ mulhwu $t3,$h2,$r1
+ addc $d2,$d2,$t0
+ adde $D2,$D2,$t1
+
+ mullw $t0,$h3,$s1 # h3*s1
+ mulhwu $t1,$h3,$s1
+ addc $d3,$d3,$t2
+ adde $D3,$D3,$t3
+
+ mullw $t2,$h3,$s2 # h3*s2
+ mulhwu $t3,$h3,$s2
+ addc $d0,$d0,$t0
+ adde $D0,$D0,$t1
+
+ mullw $t0,$h3,$s3 # h3*s3
+ mulhwu $t1,$h3,$s3
+ addc $d1,$d1,$t2
+ adde $D1,$D1,$t3
+
+ mullw $t2,$h3,$r0 # h3*r0
+ mulhwu $t3,$h3,$r0
+ addc $d2,$d2,$t0
+ adde $D2,$D2,$t1
+
+ mullw $t0,$h4,$s1 # h4*s1
+ addc $d3,$d3,$t2
+ adde $D3,$D3,$t3
+ addc $d1,$d1,$t0
+
+ mullw $t1,$h4,$s2 # h4*s2
+ addze $D1,$D1
+ addc $d2,$d2,$t1
+ addze $D2,$D2
+
+ mullw $t2,$h4,$s3 # h4*s3
+ addc $d3,$d3,$t2
+ addze $D3,$D3
+
+ mullw $h4,$h4,$r0 # h4*r0
+
+ addc $h1,$d1,$D0
+ adde $h2,$d2,$D1
+ adde $h3,$d3,$D2
+ adde $h4,$h4,$D3
+
+ andc $D0,$h4,$mask # final reduction step
+ and $h4,$h4,$mask
+ srwi $D1,$D0,2
+ add $D0,$D0,$D1
+ addc $h0,$d0,$D0
+ addze $h1,$h1
+ addze $h2,$h2
+ addze $h3,$h3
+ addze $h4,$h4
+
+ bdnz Loop
+
+ stw $h0,0($ctx) # store hash value
+ stw $h1,4($ctx)
+ stw $h2,8($ctx)
+ stw $h3,12($ctx)
+ stw $h4,16($ctx)
+
+ $POP r14,`$FRAME-$SIZE_T*18`($sp)
+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ addi $sp,$sp,$FRAME
+Labort:
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,4,0
+.size .poly1305_blocks,.-.poly1305_blocks
+
+.globl .poly1305_emit
+.align 4
+.poly1305_emit:
+ $STU $sp,-$FRAME($sp)
+ mflr r0
+ $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
+ $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
+ $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
+ $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ lwz $h0,0($ctx) # load hash
+ lwz $h1,4($ctx)
+ lwz $h2,8($ctx)
+ lwz $h3,12($ctx)
+ lwz $h4,16($ctx)
+
+ addic $d0,$h0,5 # compare to modulus
+ addze $d1,$h1
+ addze $d2,$h2
+ addze $d3,$h3
+ addze $mask,$h4
+
+ srwi $mask,$mask,2 # did it carry/borrow?
+ neg $mask,$mask
+
+ andc $h0,$h0,$mask
+ and $d0,$d0,$mask
+ andc $h1,$h1,$mask
+ and $d1,$d1,$mask
+ or $h0,$h0,$d0
+ lwz $d0,0($nonce) # load nonce
+ andc $h2,$h2,$mask
+ and $d2,$d2,$mask
+ or $h1,$h1,$d1
+ lwz $d1,4($nonce)
+ andc $h3,$h3,$mask
+ and $d3,$d3,$mask
+ or $h2,$h2,$d2
+ lwz $d2,8($nonce)
+ or $h3,$h3,$d3
+ lwz $d3,12($nonce)
+
+ addc $h0,$h0,$d0 # accumulate nonce
+ adde $h1,$h1,$d1
+ adde $h2,$h2,$d2
+ adde $h3,$h3,$d3
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ stw $h0,0($mac) # write result
+ stw $h1,4($mac)
+ stw $h2,8($mac)
+ stw $h3,12($mac)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ li $d1,4
+ stwbrx $h0,0,$mac # write result
+ li $d2,8
+ stwbrx $h1,$d1,$mac
+ li $d3,12
+ stwbrx $h2,$d2,$mac
+ stwbrx $h3,$d3,$mac
+___
+$code.=<<___;
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,4,3,0
+.size .poly1305_emit,.-.poly1305_emit
+___
+ }
+$code.=<<___;
+.asciz "Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/poly1305/asm/poly1305-ppcfp.pl b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-ppcfp.pl
new file mode 100755
index 0000000..49f70a8
--- /dev/null
+++ b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-ppcfp.pl
@@ -0,0 +1,739 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for PowerPC FPU.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone,
+# and improvement coefficients relative to gcc-generated code.
+#
+# Freescale e300 9.78/+30%
+# PPC74x0 6.92/+50%
+# PPC970 6.03/+80%
+# POWER7 3.50/+30%
+# POWER8 3.75/+10%
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
+ $UCMP ="cmpld";
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T =4;
+ $LRSAVE =$SIZE_T;
+ $UCMP ="cmplw";
+ $STU ="stwu";
+ $POP ="lwz";
+ $PUSH ="stw";
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
+
+$LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$LOCALS=6*$SIZE_T;
+$FRAME=$LOCALS+6*8+18*8;
+
+my $sp="r1";
+
+my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
+my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));
+
+my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
+ $two0,$two32,$two64,$two96,$two130,$five_two130,
+ $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
+ $s2lo,$s2hi,$s3lo,$s3hi,
+ $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));
+# borrowings
+my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
+my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
+my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);
+
+$code.=<<___;
+.machine "any"
+.text
+
+.globl .poly1305_init_fpu
+.align 6
+.poly1305_init_fpu:
+ $STU $sp,-$LOCALS($sp) # minimal frame
+ mflr $padbit
+ $PUSH $padbit,`$LOCALS+$LRSAVE`($sp)
+
+ bl LPICmeup
+
+ xor r0,r0,r0
+ mtlr $padbit # restore lr
+
+ lfd $two0,8*0($len) # load constants
+ lfd $two32,8*1($len)
+ lfd $two64,8*2($len)
+ lfd $two96,8*3($len)
+ lfd $two130,8*4($len)
+ lfd $five_two130,8*5($len)
+
+ stfd $two0,8*0($ctx) # initial hash value, biased 0
+ stfd $two32,8*1($ctx)
+ stfd $two64,8*2($ctx)
+ stfd $two96,8*3($ctx)
+
+ $UCMP $inp,r0
+ beq- Lno_key
+
+ lfd $h3lo,8*13($len) # new fpscr
+ mffs $h3hi # old fpscr
+
+ stfd $two0,8*4($ctx) # key "template"
+ stfd $two32,8*5($ctx)
+ stfd $two64,8*6($ctx)
+ stfd $two96,8*7($ctx)
+
+ li $in1,4
+ li $in2,8
+ li $in3,12
+ $LWXLE $in0,0,$inp # load key
+ $LWXLE $in1,$in1,$inp
+ $LWXLE $in2,$in2,$inp
+ $LWXLE $in3,$in3,$inp
+
+ lis $i1,0xf000 # 0xf0000000
+ ori $i2,$i1,3 # 0xf0000003
+ andc $in0,$in0,$i1 # &=0x0fffffff
+ andc $in1,$in1,$i2 # &=0x0ffffffc
+ andc $in2,$in2,$i2
+ andc $in3,$in3,$i2
+
+ stw $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx) # fill "template"
+ stw $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
+ stw $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
+ stw $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
+
+ mtfsf 255,$h3lo # fpscr
+ stfd $two0,8*18($ctx) # copy constants to context
+ stfd $two32,8*19($ctx)
+ stfd $two64,8*20($ctx)
+ stfd $two96,8*21($ctx)
+ stfd $two130,8*22($ctx)
+ stfd $five_two130,8*23($ctx)
+
+ lfd $h0lo,8*4($ctx) # load [biased] key
+ lfd $h1lo,8*5($ctx)
+ lfd $h2lo,8*6($ctx)
+ lfd $h3lo,8*7($ctx)
+
+ fsub $h0lo,$h0lo,$two0 # r0
+ fsub $h1lo,$h1lo,$two32 # r1
+ fsub $h2lo,$h2lo,$two64 # r2
+ fsub $h3lo,$h3lo,$two96 # r3
+
+ lfd $two0,8*6($len) # more constants
+ lfd $two32,8*7($len)
+ lfd $two64,8*8($len)
+ lfd $two96,8*9($len)
+
+ fmul $h1hi,$h1lo,$five_two130 # s1
+ fmul $h2hi,$h2lo,$five_two130 # s2
+ stfd $h3hi,8*15($ctx) # borrow slot for original fpscr
+ fmul $h3hi,$h3lo,$five_two130 # s3
+
+ fadd $h0hi,$h0lo,$two0
+ stfd $h1hi,8*12($ctx) # put aside for now
+ fadd $h1hi,$h1lo,$two32
+ stfd $h2hi,8*13($ctx)
+ fadd $h2hi,$h2lo,$two64
+ stfd $h3hi,8*14($ctx)
+ fadd $h3hi,$h3lo,$two96
+
+ fsub $h0hi,$h0hi,$two0
+ fsub $h1hi,$h1hi,$two32
+ fsub $h2hi,$h2hi,$two64
+ fsub $h3hi,$h3hi,$two96
+
+ lfd $two0,8*10($len) # more constants
+ lfd $two32,8*11($len)
+ lfd $two64,8*12($len)
+
+ fsub $h0lo,$h0lo,$h0hi
+ fsub $h1lo,$h1lo,$h1hi
+ fsub $h2lo,$h2lo,$h2hi
+ fsub $h3lo,$h3lo,$h3hi
+
+ stfd $h0hi,8*5($ctx) # r0hi
+ stfd $h1hi,8*7($ctx) # r1hi
+ stfd $h2hi,8*9($ctx) # r2hi
+ stfd $h3hi,8*11($ctx) # r3hi
+
+ stfd $h0lo,8*4($ctx) # r0lo
+ stfd $h1lo,8*6($ctx) # r1lo
+ stfd $h2lo,8*8($ctx) # r2lo
+ stfd $h3lo,8*10($ctx) # r3lo
+
+ lfd $h1lo,8*12($ctx) # s1
+ lfd $h2lo,8*13($ctx) # s2
+ lfd $h3lo,8*14($ctx) # s3
+ lfd $h0lo,8*15($ctx) # pull original fpscr
+
+ fadd $h1hi,$h1lo,$two0
+ fadd $h2hi,$h2lo,$two32
+ fadd $h3hi,$h3lo,$two64
+
+ fsub $h1hi,$h1hi,$two0
+ fsub $h2hi,$h2hi,$two32
+ fsub $h3hi,$h3hi,$two64
+
+ fsub $h1lo,$h1lo,$h1hi
+ fsub $h2lo,$h2lo,$h2hi
+ fsub $h3lo,$h3lo,$h3hi
+
+ stfd $h1hi,8*13($ctx) # s1hi
+ stfd $h2hi,8*15($ctx) # s2hi
+ stfd $h3hi,8*17($ctx) # s3hi
+
+ stfd $h1lo,8*12($ctx) # s1lo
+ stfd $h2lo,8*14($ctx) # s2lo
+ stfd $h3lo,8*16($ctx) # s3lo
+
+ mtfsf 255,$h0lo # restore fpscr
+Lno_key:
+ xor r3,r3,r3
+ addi $sp,$sp,$LOCALS
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,0,2,0
+.size .poly1305_init_fpu,.-.poly1305_init_fpu
+
+.globl .poly1305_blocks_fpu
+.align 4
+.poly1305_blocks_fpu:
+ srwi. $len,$len,4
+ beq- Labort
+
+ $STU $sp,-$FRAME($sp)
+ mflr r0
+ stfd f14,`$FRAME-8*18`($sp)
+ stfd f15,`$FRAME-8*17`($sp)
+ stfd f16,`$FRAME-8*16`($sp)
+ stfd f17,`$FRAME-8*15`($sp)
+ stfd f18,`$FRAME-8*14`($sp)
+ stfd f19,`$FRAME-8*13`($sp)
+ stfd f20,`$FRAME-8*12`($sp)
+ stfd f21,`$FRAME-8*11`($sp)
+ stfd f22,`$FRAME-8*10`($sp)
+ stfd f23,`$FRAME-8*9`($sp)
+ stfd f24,`$FRAME-8*8`($sp)
+ stfd f25,`$FRAME-8*7`($sp)
+ stfd f26,`$FRAME-8*6`($sp)
+ stfd f27,`$FRAME-8*5`($sp)
+ stfd f28,`$FRAME-8*4`($sp)
+ stfd f29,`$FRAME-8*3`($sp)
+ stfd f30,`$FRAME-8*2`($sp)
+ stfd f31,`$FRAME-8*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ xor r0,r0,r0
+ li $in3,1
+ mtctr $len
+ neg $len,$len
+ stw r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
+ stw $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
+
+ lfd $two0,8*18($ctx) # load constants
+ lfd $two32,8*19($ctx)
+ lfd $two64,8*20($ctx)
+ lfd $two96,8*21($ctx)
+ lfd $two130,8*22($ctx)
+ lfd $five_two130,8*23($ctx)
+
+ lfd $h0lo,8*0($ctx) # load [biased] hash value
+ lfd $h1lo,8*1($ctx)
+ lfd $h2lo,8*2($ctx)
+ lfd $h3lo,8*3($ctx)
+
+ stfd $two0,`$LOCALS+8*0`($sp) # input "template"
+ oris $in3,$padbit,`(1023+52+96)<<4`
+ stfd $two32,`$LOCALS+8*1`($sp)
+ stfd $two64,`$LOCALS+8*2`($sp)
+ stw $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
+
+ li $i1,4
+ li $i2,8
+ li $i3,12
+ $LWXLE $in0,0,$inp # load input
+ $LWXLE $in1,$i1,$inp
+ $LWXLE $in2,$i2,$inp
+ $LWXLE $in3,$i3,$inp
+ addi $inp,$inp,16
+
+ stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
+ stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
+ stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
+ stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
+
+ mffs $x0 # original fpscr
+ lfd $x1,`$LOCALS+8*4`($sp) # new fpscr
+ lfd $r0lo,8*4($ctx) # load key
+ lfd $r0hi,8*5($ctx)
+ lfd $r1lo,8*6($ctx)
+ lfd $r1hi,8*7($ctx)
+ lfd $r2lo,8*8($ctx)
+ lfd $r2hi,8*9($ctx)
+ lfd $r3lo,8*10($ctx)
+ lfd $r3hi,8*11($ctx)
+ lfd $s1lo,8*12($ctx)
+ lfd $s1hi,8*13($ctx)
+ lfd $s2lo,8*14($ctx)
+ lfd $s2hi,8*15($ctx)
+ lfd $s3lo,8*16($ctx)
+ lfd $s3hi,8*17($ctx)
+
+ stfd $x0,`$LOCALS+8*4`($sp) # save original fpscr
+ mtfsf 255,$x1
+
+ addic $len,$len,1
+ addze r0,r0
+ slwi. r0,r0,4
+ sub $inp,$inp,r0 # conditional rewind
+
+ lfd $x0,`$LOCALS+8*0`($sp)
+ lfd $x1,`$LOCALS+8*1`($sp)
+ lfd $x2,`$LOCALS+8*2`($sp)
+ lfd $x3,`$LOCALS+8*3`($sp)
+
+ fsub $h0lo,$h0lo,$two0 # de-bias hash value
+ $LWXLE $in0,0,$inp # modulo-scheduled input load
+ fsub $h1lo,$h1lo,$two32
+ $LWXLE $in1,$i1,$inp
+ fsub $h2lo,$h2lo,$two64
+ $LWXLE $in2,$i2,$inp
+ fsub $h3lo,$h3lo,$two96
+ $LWXLE $in3,$i3,$inp
+
+ fsub $x0,$x0,$two0 # de-bias input
+ addi $inp,$inp,16
+ fsub $x1,$x1,$two32
+ fsub $x2,$x2,$two64
+ fsub $x3,$x3,$two96
+
+ fadd $x0,$x0,$h0lo # accumulate input
+ stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
+ fadd $x1,$x1,$h1lo
+ stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
+ fadd $x2,$x2,$h2lo
+ stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
+ fadd $x3,$x3,$h3lo
+ stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
+
+ b Lentry
+
+.align 4
+Loop:
+ fsub $y0,$y0,$two0 # de-bias input
+ addic $len,$len,1
+ fsub $y1,$y1,$two32
+ addze r0,r0
+ fsub $y2,$y2,$two64
+ slwi. r0,r0,4
+ fsub $y3,$y3,$two96
+ sub $inp,$inp,r0 # conditional rewind
+
+ fadd $h0lo,$h0lo,$y0 # accumulate input
+ fadd $h0hi,$h0hi,$y1
+ fadd $h2lo,$h2lo,$y2
+ fadd $h2hi,$h2hi,$y3
+
+ ######################################### base 2^48 -> base 2^32
+ fadd $c1lo,$h1lo,$two64
+ $LWXLE $in0,0,$inp # modulo-scheduled input load
+ fadd $c1hi,$h1hi,$two64
+ $LWXLE $in1,$i1,$inp
+ fadd $c3lo,$h3lo,$two130
+ $LWXLE $in2,$i2,$inp
+ fadd $c3hi,$h3hi,$two130
+ $LWXLE $in3,$i3,$inp
+ fadd $c0lo,$h0lo,$two32
+ addi $inp,$inp,16
+ fadd $c0hi,$h0hi,$two32
+ fadd $c2lo,$h2lo,$two96
+ fadd $c2hi,$h2hi,$two96
+
+ fsub $c1lo,$c1lo,$two64
+ stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
+ fsub $c1hi,$c1hi,$two64
+ stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
+ fsub $c3lo,$c3lo,$two130
+ stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
+ fsub $c3hi,$c3hi,$two130
+ stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
+ fsub $c0lo,$c0lo,$two32
+ fsub $c0hi,$c0hi,$two32
+ fsub $c2lo,$c2lo,$two96
+ fsub $c2hi,$c2hi,$two96
+
+ fsub $h1lo,$h1lo,$c1lo
+ fsub $h1hi,$h1hi,$c1hi
+ fsub $h3lo,$h3lo,$c3lo
+ fsub $h3hi,$h3hi,$c3hi
+ fsub $h2lo,$h2lo,$c2lo
+ fsub $h2hi,$h2hi,$c2hi
+ fsub $h0lo,$h0lo,$c0lo
+ fsub $h0hi,$h0hi,$c0hi
+
+ fadd $h1lo,$h1lo,$c0lo
+ fadd $h1hi,$h1hi,$c0hi
+ fadd $h3lo,$h3lo,$c2lo
+ fadd $h3hi,$h3hi,$c2hi
+ fadd $h2lo,$h2lo,$c1lo
+ fadd $h2hi,$h2hi,$c1hi
+ fmadd $h0lo,$c3lo,$five_two130,$h0lo
+ fmadd $h0hi,$c3hi,$five_two130,$h0hi
+
+ fadd $x1,$h1lo,$h1hi
+ lfd $s1lo,8*12($ctx) # reload constants
+ fadd $x3,$h3lo,$h3hi
+ lfd $s1hi,8*13($ctx)
+ fadd $x2,$h2lo,$h2hi
+ lfd $r3lo,8*10($ctx)
+ fadd $x0,$h0lo,$h0hi
+ lfd $r3hi,8*11($ctx)
+Lentry:
+ fmul $h0lo,$s3lo,$x1
+ fmul $h0hi,$s3hi,$x1
+ fmul $h2lo,$r1lo,$x1
+ fmul $h2hi,$r1hi,$x1
+ fmul $h1lo,$r0lo,$x1
+ fmul $h1hi,$r0hi,$x1
+ fmul $h3lo,$r2lo,$x1
+ fmul $h3hi,$r2hi,$x1
+
+ fmadd $h0lo,$s1lo,$x3,$h0lo
+ fmadd $h0hi,$s1hi,$x3,$h0hi
+ fmadd $h2lo,$s3lo,$x3,$h2lo
+ fmadd $h2hi,$s3hi,$x3,$h2hi
+ fmadd $h1lo,$s2lo,$x3,$h1lo
+ fmadd $h1hi,$s2hi,$x3,$h1hi
+ fmadd $h3lo,$r0lo,$x3,$h3lo
+ fmadd $h3hi,$r0hi,$x3,$h3hi
+
+ fmadd $h0lo,$s2lo,$x2,$h0lo
+ fmadd $h0hi,$s2hi,$x2,$h0hi
+ fmadd $h2lo,$r0lo,$x2,$h2lo
+ fmadd $h2hi,$r0hi,$x2,$h2hi
+ fmadd $h1lo,$s3lo,$x2,$h1lo
+ fmadd $h1hi,$s3hi,$x2,$h1hi
+ fmadd $h3lo,$r1lo,$x2,$h3lo
+ fmadd $h3hi,$r1hi,$x2,$h3hi
+
+ fmadd $h0lo,$r0lo,$x0,$h0lo
+ lfd $y0,`$LOCALS+8*0`($sp) # load [biased] input
+ fmadd $h0hi,$r0hi,$x0,$h0hi
+ lfd $y1,`$LOCALS+8*1`($sp)
+ fmadd $h2lo,$r2lo,$x0,$h2lo
+ lfd $y2,`$LOCALS+8*2`($sp)
+ fmadd $h2hi,$r2hi,$x0,$h2hi
+ lfd $y3,`$LOCALS+8*3`($sp)
+ fmadd $h1lo,$r1lo,$x0,$h1lo
+ fmadd $h1hi,$r1hi,$x0,$h1hi
+ fmadd $h3lo,$r3lo,$x0,$h3lo
+ fmadd $h3hi,$r3hi,$x0,$h3hi
+
+ bdnz Loop
+
+ ######################################### base 2^48 -> base 2^32
+ fadd $c0lo,$h0lo,$two32
+ fadd $c0hi,$h0hi,$two32
+ fadd $c2lo,$h2lo,$two96
+ fadd $c2hi,$h2hi,$two96
+ fadd $c1lo,$h1lo,$two64
+ fadd $c1hi,$h1hi,$two64
+ fadd $c3lo,$h3lo,$two130
+ fadd $c3hi,$h3hi,$two130
+
+ fsub $c0lo,$c0lo,$two32
+ fsub $c0hi,$c0hi,$two32
+ fsub $c2lo,$c2lo,$two96
+ fsub $c2hi,$c2hi,$two96
+ fsub $c1lo,$c1lo,$two64
+ fsub $c1hi,$c1hi,$two64
+ fsub $c3lo,$c3lo,$two130
+ fsub $c3hi,$c3hi,$two130
+
+ fsub $h1lo,$h1lo,$c1lo
+ fsub $h1hi,$h1hi,$c1hi
+ fsub $h3lo,$h3lo,$c3lo
+ fsub $h3hi,$h3hi,$c3hi
+ fsub $h2lo,$h2lo,$c2lo
+ fsub $h2hi,$h2hi,$c2hi
+ fsub $h0lo,$h0lo,$c0lo
+ fsub $h0hi,$h0hi,$c0hi
+
+ fadd $h1lo,$h1lo,$c0lo
+ fadd $h1hi,$h1hi,$c0hi
+ fadd $h3lo,$h3lo,$c2lo
+ fadd $h3hi,$h3hi,$c2hi
+ fadd $h2lo,$h2lo,$c1lo
+ fadd $h2hi,$h2hi,$c1hi
+ fmadd $h0lo,$c3lo,$five_two130,$h0lo
+ fmadd $h0hi,$c3hi,$five_two130,$h0hi
+
+ fadd $x1,$h1lo,$h1hi
+ fadd $x3,$h3lo,$h3hi
+ fadd $x2,$h2lo,$h2hi
+ fadd $x0,$h0lo,$h0hi
+
+ lfd $h0lo,`$LOCALS+8*4`($sp) # pull saved fpscr
+ fadd $x1,$x1,$two32 # bias
+ fadd $x3,$x3,$two96
+ fadd $x2,$x2,$two64
+ fadd $x0,$x0,$two0
+
+ stfd $x1,8*1($ctx) # store [biased] hash value
+ stfd $x3,8*3($ctx)
+ stfd $x2,8*2($ctx)
+ stfd $x0,8*0($ctx)
+
+ mtfsf 255,$h0lo # restore original fpscr
+ lfd f14,`$FRAME-8*18`($sp)
+ lfd f15,`$FRAME-8*17`($sp)
+ lfd f16,`$FRAME-8*16`($sp)
+ lfd f17,`$FRAME-8*15`($sp)
+ lfd f18,`$FRAME-8*14`($sp)
+ lfd f19,`$FRAME-8*13`($sp)
+ lfd f20,`$FRAME-8*12`($sp)
+ lfd f21,`$FRAME-8*11`($sp)
+ lfd f22,`$FRAME-8*10`($sp)
+ lfd f23,`$FRAME-8*9`($sp)
+ lfd f24,`$FRAME-8*8`($sp)
+ lfd f25,`$FRAME-8*7`($sp)
+ lfd f26,`$FRAME-8*6`($sp)
+ lfd f27,`$FRAME-8*5`($sp)
+ lfd f28,`$FRAME-8*4`($sp)
+ lfd f29,`$FRAME-8*3`($sp)
+ lfd f30,`$FRAME-8*2`($sp)
+ lfd f31,`$FRAME-8*1`($sp)
+ addi $sp,$sp,$FRAME
+Labort:
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,0,4,0
+.size .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
+___
+{
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
+ ) = map("r$_",(7..11,28..31));
+my $mask = "r0";
+my $FRAME = (6+4)*$SIZE_T;
+
+$code.=<<___;
+.globl .poly1305_emit_fpu
+.align 4
+.poly1305_emit_fpu:
+ $STU $sp,-$FRAME($sp)
+ mflr r0
+ $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
+ $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
+ $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
+ $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ lwz $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx) # load hash
+ lwz $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
+ lwz $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
+ lwz $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
+ lwz $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
+ lwz $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
+ lwz $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
+ lwz $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
+
+ lis $mask,0xfff0
+ andc $d0,$d0,$mask # mask exponent
+ andc $d1,$d1,$mask
+ andc $d2,$d2,$mask
+ andc $d3,$d3,$mask # can be partially reduced...
+ li $mask,3
+
+ srwi $padbit,$d3,2 # ... so reduce
+ and $h4,$d3,$mask
+ andc $d3,$d3,$mask
+ add $d3,$d3,$padbit
+___
+ if ($SIZE_T==4) {
+$code.=<<___;
+ addc $h0,$h0,$d3
+ adde $h1,$h1,$d0
+ adde $h2,$h2,$d1
+ adde $h3,$h3,$d2
+ addze $h4,$h4
+
+ addic $d0,$h0,5 # compare to modulus
+ addze $d1,$h1
+ addze $d2,$h2
+ addze $d3,$h3
+ addze $mask,$h4
+
+ srwi $mask,$mask,2 # did it carry/borrow?
+ neg $mask,$mask
+ srawi $mask,$mask,31 # mask
+
+ andc $h0,$h0,$mask
+ and $d0,$d0,$mask
+ andc $h1,$h1,$mask
+ and $d1,$d1,$mask
+ or $h0,$h0,$d0
+ lwz $d0,0($nonce) # load nonce
+ andc $h2,$h2,$mask
+ and $d2,$d2,$mask
+ or $h1,$h1,$d1
+ lwz $d1,4($nonce)
+ andc $h3,$h3,$mask
+ and $d3,$d3,$mask
+ or $h2,$h2,$d2
+ lwz $d2,8($nonce)
+ or $h3,$h3,$d3
+ lwz $d3,12($nonce)
+
+ addc $h0,$h0,$d0 # accumulate nonce
+ adde $h1,$h1,$d1
+ adde $h2,$h2,$d2
+ adde $h3,$h3,$d3
+___
+ } else {
+$code.=<<___;
+ add $h0,$h0,$d3
+ add $h1,$h1,$d0
+ add $h2,$h2,$d1
+ add $h3,$h3,$d2
+
+ srdi $d0,$h0,32
+ add $h1,$h1,$d0
+ srdi $d1,$h1,32
+ add $h2,$h2,$d1
+ srdi $d2,$h2,32
+ add $h3,$h3,$d2
+ srdi $d3,$h3,32
+ add $h4,$h4,$d3
+
+ insrdi $h0,$h1,32,0
+ insrdi $h2,$h3,32,0
+
+ addic $d0,$h0,5 # compare to modulus
+ addze $d1,$h2
+ addze $d2,$h4
+
+ srdi $mask,$d2,2 # did it carry/borrow?
+ neg $mask,$mask
+ sradi $mask,$mask,63 # mask
+ ld $d2,0($nonce) # load nonce
+ ld $d3,8($nonce)
+
+ andc $h0,$h0,$mask
+ and $d0,$d0,$mask
+ andc $h2,$h2,$mask
+ and $d1,$d1,$mask
+ or $h0,$h0,$d0
+ or $h2,$h2,$d1
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ rotldi $d2,$d2,32 # flip nonce words
+ rotldi $d3,$d3,32
+___
+$code.=<<___;
+ addc $h0,$h0,$d2 # accumulate nonce
+ adde $h2,$h2,$d3
+
+ srdi $h1,$h0,32
+ srdi $h3,$h2,32
+___
+ }
+$code.=<<___ if ($LITTLE_ENDIAN);
+ stw $h0,0($mac) # write result
+ stw $h1,4($mac)
+ stw $h2,8($mac)
+ stw $h3,12($mac)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ li $d1,4
+ stwbrx $h0,0,$mac # write result
+ li $d2,8
+ stwbrx $h1,$d1,$mac
+ li $d3,12
+ stwbrx $h2,$d2,$mac
+ stwbrx $h3,$d3,$mac
+___
+$code.=<<___;
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,4,3,0
+.size .poly1305_emit_fpu,.-.poly1305_emit_fpu
+___
+}
+# Ugly hack here, because PPC assembler syntax seem to vary too
+# much from platforms to platform...
+$code.=<<___;
+.align 6
+LPICmeup:
+ mflr r0
+ bcl 20,31,\$+4
+ mflr $len # vvvvvv "distance" between . and 1st data entry
+ addi $len,$len,`64-8` # borrow $len
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
+
+.quad 0x4330000000000000 # 2^(52+0)
+.quad 0x4530000000000000 # 2^(52+32)
+.quad 0x4730000000000000 # 2^(52+64)
+.quad 0x4930000000000000 # 2^(52+96)
+.quad 0x4b50000000000000 # 2^(52+130)
+
+.quad 0x37f4000000000000 # 5/2^130
+
+.quad 0x4430000000000000 # 2^(52+16+0)
+.quad 0x4630000000000000 # 2^(52+16+32)
+.quad 0x4830000000000000 # 2^(52+16+64)
+.quad 0x4a30000000000000 # 2^(52+16+96)
+.quad 0x3e30000000000000 # 2^(52+16+0-96)
+.quad 0x4030000000000000 # 2^(52+16+32-96)
+.quad 0x4230000000000000 # 2^(52+16+64-96)
+
+.quad 0x0000000000000001 # fpscr: truncate, no exceptions
+.asciz "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/poly1305/asm/poly1305-s390x.pl b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-s390x.pl
new file mode 100755
index 0000000..82d757d
--- /dev/null
+++ b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-s390x.pl
@@ -0,0 +1,227 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for s390x.
+#
+# June 2015
+#
+# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
+# code. For older compiler improvement coefficient is >3x, because
+# then base 2^64 and base 2^32 implementations are compared.
+#
+# On side note, z13 enables vector base 2^26 implementation...
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$sp="%r15";
+
+my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
+
+$code.=<<___;
+.text
+
+.globl poly1305_init
+.type poly1305_init,\@function
+.align 16
+poly1305_init:
+ lghi %r0,0
+ lghi %r1,-1
+ stg %r0,0($ctx) # zero hash value
+ stg %r0,8($ctx)
+ stg %r0,16($ctx)
+
+ cl${g}r $inp,%r0
+ je .Lno_key
+
+ lrvg %r4,0($inp) # load little-endian key
+ lrvg %r5,8($inp)
+
+ nihl %r1,0xffc0 # 0xffffffc0ffffffff
+ srlg %r0,%r1,4 # 0x0ffffffc0fffffff
+ srlg %r1,%r1,4
+ nill %r1,0xfffc # 0x0ffffffc0ffffffc
+
+ ngr %r4,%r0
+ ngr %r5,%r1
+
+ stg %r4,32($ctx)
+ stg %r5,40($ctx)
+
+.Lno_key:
+ lghi %r2,0
+ br %r14
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
+my ($r0,$r1,$s1) = map("%r$_",(0..2));
+
+$code.=<<___;
+.globl poly1305_blocks
+.type poly1305_blocks,\@function
+.align 16
+poly1305_blocks:
+ srl${g} $len,4 # fixed-up in 64-bit build
+ lghi %r0,0
+ cl${g}r $len,%r0
+ je .Lno_data
+
+ stm${g} %r6,%r14,`6*$SIZE_T`($sp)
+
+ llgfr $padbit,$padbit # clear upper half, much needed with
+ # non-64-bit ABI
+ lg $r0,32($ctx) # load key
+ lg $r1,40($ctx)
+
+ lg $h0,0($ctx) # load hash value
+ lg $h1,8($ctx)
+ lg $h2,16($ctx)
+
+ st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx
+ srlg $s1,$r1,2
+ algr $s1,$r1 # s1 = r1 + r1>>2
+ j .Loop
+
+.align 16
+.Loop:
+ lrvg $d0lo,0($inp) # load little-endian input
+ lrvg $d1lo,8($inp)
+ la $inp,16($inp)
+
+ algr $d0lo,$h0 # accumulate input
+ alcgr $d1lo,$h1
+
+ lgr $h0,$d0lo
+ mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo
+ lgr $h1,$d1lo
+ mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo
+
+ mlgr $t0,$r1 # h0*r1 -> $t0:$h0
+ mlgr $t1,$r0 # h1*r0 -> $t1:$h1
+ alcgr $h2,$padbit
+
+ algr $d0lo,$d1lo
+ lgr $d1lo,$h2
+ alcgr $d0hi,$d1hi
+ lghi $d1hi,0
+
+ algr $h1,$h0
+ alcgr $t1,$t0
+
+ msgr $d1lo,$s1 # h2*s1
+ msgr $h2,$r0 # h2*r0
+
+ algr $h1,$d1lo
+ alcgr $t1,$d1hi # $d1hi is zero
+
+ algr $h1,$d0hi
+ alcgr $h2,$t1
+
+ lghi $h0,-4 # final reduction step
+ ngr $h0,$h2
+ srlg $t0,$h2,2
+ algr $h0,$t0
+ lghi $t1,3
+ ngr $h2,$t1
+
+ algr $h0,$d0lo
+ alcgr $h1,$d1hi # $d1hi is still zero
+ alcgr $h2,$d1hi # $d1hi is still zero
+
+ brct$g $len,.Loop
+
+ l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx
+
+ stg $h0,0($ctx) # store hash value
+ stg $h1,8($ctx)
+ stg $h2,16($ctx)
+
+ lm${g} %r6,%r14,`6*$SIZE_T`($sp)
+.Lno_data:
+ br %r14
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($mac,$nonce)=($inp,$len);
+my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
+
+$code.=<<___;
+.globl poly1305_emit
+.type poly1305_emit,\@function
+.align 16
+poly1305_emit:
+ stm${g} %r6,%r9,`6*$SIZE_T`($sp)
+
+ lg $h0,0($ctx)
+ lg $h1,8($ctx)
+ lg $h2,16($ctx)
+
+ lghi %r0,5
+ lghi %r1,0
+ lgr $d0,$h0
+ lgr $d1,$h1
+
+ algr $h0,%r0 # compare to modulus
+ alcgr $h1,%r1
+ alcgr $h2,%r1
+
+ srlg $h2,$h2,2 # did it borrow/carry?
+ slgr %r1,$h2 # 0-$h2>>2
+ lg $h2,0($nonce) # load nonce
+ lghi %r0,-1
+ lg $ctx,8($nonce)
+ xgr %r0,%r1 # ~%r1
+
+ ngr $h0,%r1
+ ngr $d0,%r0
+ ngr $h1,%r1
+ ngr $d1,%r0
+ ogr $h0,$d0
+ rllg $d0,$h2,32 # flip nonce words
+ ogr $h1,$d1
+ rllg $d1,$ctx,32
+
+ algr $h0,$d0 # accumulate nonce
+ alcgr $h1,$d1
+
+ strvg $h0,0($mac) # write little-endian result
+ strvg $h1,8($mac)
+
+ lm${g} %r6,%r9,`6*$SIZE_T`($sp)
+ br %r14
+.size poly1305_emit,.-poly1305_emit
+
+.string "Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm;
+
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/poly1305/asm/poly1305-sparcv9.pl b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-sparcv9.pl
new file mode 100755
index 0000000..0bdd048
--- /dev/null
+++ b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-sparcv9.pl
@@ -0,0 +1,1120 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for SPARCv9, vanilla, as well
+# as VIS3 and FMA extensions.
+#
+# May, August 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU(*) FMA
+#
+# UltraSPARC III 12.3(**)
+# SPARC T3 7.92
+# SPARC T4 1.70(***) 6.55
+# SPARC64 X 5.60 3.64
+#
+# (*) Comparison to compiler-generated code is really problematic,
+# because latter's performance varies too much depending on too
+# many variables. For example, one can measure from 5x to 15x
+# improvement on T4 for gcc-4.6. Well, in T4 case it's a bit
+# unfair comparison, because compiler doesn't use VIS3, but
+# given same initial conditions coefficient varies from 3x to 9x.
+# (**) Pre-III performance should be even worse; floating-point
+# performance for UltraSPARC I-IV on the other hand is reported
+# to be 4.25 for hand-coded assembly, but they are just too old
+# to care about.
+# (***) Multi-process benchmark saturates at ~12.5x single-process
+# result on 8-core processor, or ~21GBps per 2.85GHz socket.
+
+my $output = pop;
+open STDOUT,">$output";
+
+my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5));
+my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7));
+my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7));
+my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4));
+
+my $output = pop;
+open STDOUT,">$stdout";
+
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef __arch64__
+.register %g2,#scratch
+.register %g3,#scratch
+# define STPTR stx
+# define SIZE_T 8
+#else
+# define STPTR st
+# define SIZE_T 4
+#endif
+#define LOCALS (STACK_BIAS+STACK_FRAME)
+
+.section ".text",#alloc,#execinstr
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
+.globl poly1305_init
+.align 32
+poly1305_init:
+ save %sp,-STACK_FRAME-16,%sp
+ nop
+
+ SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
+ ld [%g1],%g1
+
+ and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
+ cmp %g1,SPARCV9_FMADD
+ be .Lpoly1305_init_fma
+ nop
+
+ stx %g0,[$ctx+0]
+ stx %g0,[$ctx+8] ! zero hash value
+ brz,pn $inp,.Lno_key
+ stx %g0,[$ctx+16]
+
+ and $inp,7,$shr ! alignment factor
+ andn $inp,7,$inp
+ sll $shr,3,$shr ! *8
+ neg $shr,$shl
+
+ sethi %hi(0x0ffffffc),$t0
+ set 8,$h1
+ or $t0,%lo(0x0ffffffc),$t0
+ set 16,$h2
+ sllx $t0,32,$t1
+ or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc
+ or $t1,3,$t0 ! 0x0ffffffc0fffffff
+
+ ldxa [$inp+%g0]0x88,$h0 ! load little-endian key
+ brz,pt $shr,.Lkey_aligned
+ ldxa [$inp+$h1]0x88,$h1
+
+ ldxa [$inp+$h2]0x88,$h2
+ srlx $h0,$shr,$h0
+ sllx $h1,$shl,$t2
+ srlx $h1,$shr,$h1
+ or $t2,$h0,$h0
+ sllx $h2,$shl,$h2
+ or $h2,$h1,$h1
+
+.Lkey_aligned:
+ and $t0,$h0,$h0
+ and $t1,$h1,$h1
+ stx $h0,[$ctx+32+0] ! store key
+ stx $h1,[$ctx+32+8]
+
+ andcc %g1,SPARCV9_VIS3,%g0
+ be .Lno_key
+ nop
+
+1: call .+8
+ add %o7,poly1305_blocks_vis3-1b,%o7
+
+ add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
+ STPTR %o7,[%i2]
+ STPTR %o5,[%i2+SIZE_T]
+
+ ret
+ restore %g0,1,%o0 ! return 1
+
+.Lno_key:
+ ret
+ restore %g0,%g0,%o0 ! return 0
+.type poly1305_init,#function
+.size poly1305_init,.-poly1305_init
+
+.globl poly1305_blocks
+.align 32
+poly1305_blocks:
+ save %sp,-STACK_FRAME,%sp
+ srln $len,4,$len
+
+ brz,pn $len,.Lno_data
+ nop
+
+ ld [$ctx+32+0],$r1 ! load key
+ ld [$ctx+32+4],$r0
+ ld [$ctx+32+8],$r3
+ ld [$ctx+32+12],$r2
+
+ ld [$ctx+0],$h1 ! load hash value
+ ld [$ctx+4],$h0
+ ld [$ctx+8],$h3
+ ld [$ctx+12],$h2
+ ld [$ctx+16],$h4
+
+ and $inp,7,$shr ! alignment factor
+ andn $inp,7,$inp
+ set 8,$d1
+ sll $shr,3,$shr ! *8
+ set 16,$d2
+ neg $shr,$shl
+
+ srl $r1,2,$s1
+ srl $r2,2,$s2
+ add $r1,$s1,$s1
+ srl $r3,2,$s3
+ add $r2,$s2,$s2
+ add $r3,$s3,$s3
+
+.Loop:
+ ldxa [$inp+%g0]0x88,$d0 ! load little-endian input
+ brz,pt $shr,.Linp_aligned
+ ldxa [$inp+$d1]0x88,$d1
+
+ ldxa [$inp+$d2]0x88,$d2
+ srlx $d0,$shr,$d0
+ sllx $d1,$shl,$t1
+ srlx $d1,$shr,$d1
+ or $t1,$d0,$d0
+ sllx $d2,$shl,$d2
+ or $d2,$d1,$d1
+
+.Linp_aligned:
+ srlx $d0,32,$t0
+ addcc $d0,$h0,$h0 ! accumulate input
+ srlx $d1,32,$t1
+ addccc $t0,$h1,$h1
+ addccc $d1,$h2,$h2
+ addccc $t1,$h3,$h3
+ addc $padbit,$h4,$h4
+
+ umul $r0,$h0,$d0
+ umul $r1,$h0,$d1
+ umul $r2,$h0,$d2
+ umul $r3,$h0,$d3
+ sub $len,1,$len
+ add $inp,16,$inp
+
+ umul $s3,$h1,$t0
+ umul $r0,$h1,$t1
+ umul $r1,$h1,$t2
+ add $t0,$d0,$d0
+ add $t1,$d1,$d1
+ umul $r2,$h1,$t0
+ add $t2,$d2,$d2
+ add $t0,$d3,$d3
+
+ umul $s2,$h2,$t1
+ umul $s3,$h2,$t2
+ umul $r0,$h2,$t0
+ add $t1,$d0,$d0
+ add $t2,$d1,$d1
+ umul $r1,$h2,$t1
+ add $t0,$d2,$d2
+ add $t1,$d3,$d3
+
+ umul $s1,$h3,$t2
+ umul $s2,$h3,$t0
+ umul $s3,$h3,$t1
+ add $t2,$d0,$d0
+ add $t0,$d1,$d1
+ umul $r0,$h3,$t2
+ add $t1,$d2,$d2
+ add $t2,$d3,$d3
+
+ umul $s1,$h4,$t0
+ umul $s2,$h4,$t1
+ umul $s3,$h4,$t2
+ umul $r0,$h4,$h4
+ add $t0,$d1,$d1
+ add $t1,$d2,$d2
+ srlx $d0,32,$h1
+ add $t2,$d3,$d3
+ srlx $d1,32,$h2
+
+ addcc $d1,$h1,$h1
+ srlx $d2,32,$h3
+ set 8,$d1
+ addccc $d2,$h2,$h2
+ srlx $d3,32,$t0
+ set 16,$d2
+ addccc $d3,$h3,$h3
+ addc $t0,$h4,$h4
+
+ srl $h4,2,$t0 ! final reduction step
+ andn $h4,3,$t1
+ and $h4,3,$h4
+ add $t1,$t0,$t0
+
+ addcc $t0,$d0,$h0
+ addccc %g0,$h1,$h1
+ addccc %g0,$h2,$h2
+ addccc %g0,$h3,$h3
+ brnz,pt $len,.Loop
+ addc %g0,$h4,$h4
+
+ st $h1,[$ctx+0] ! store hash value
+ st $h0,[$ctx+4]
+ st $h3,[$ctx+8]
+ st $h2,[$ctx+12]
+ st $h4,[$ctx+16]
+
+.Lno_data:
+ ret
+ restore
+.type poly1305_blocks,#function
+.size poly1305_blocks,.-poly1305_blocks
+___
+########################################################################
+# VIS3 has umulxhi and addxc...
+{
+my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
+my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
+
+$code.=<<___;
+.align 32
+poly1305_blocks_vis3:
+ save %sp,-STACK_FRAME,%sp
+ srln $len,4,$len
+
+ brz,pn $len,.Lno_data
+ nop
+
+ ldx [$ctx+32+0],$R0 ! load key
+ ldx [$ctx+32+8],$R1
+
+ ldx [$ctx+0],$H0 ! load hash value
+ ldx [$ctx+8],$H1
+ ld [$ctx+16],$H2
+
+ and $inp,7,$shr ! alignment factor
+ andn $inp,7,$inp
+ set 8,$r1
+ sll $shr,3,$shr ! *8
+ set 16,$r2
+ neg $shr,$shl
+
+ srlx $R1,2,$S1
+ b .Loop_vis3
+ add $R1,$S1,$S1
+
+.Loop_vis3:
+ ldxa [$inp+%g0]0x88,$D0 ! load little-endian input
+ brz,pt $shr,.Linp_aligned_vis3
+ ldxa [$inp+$r1]0x88,$D1
+
+ ldxa [$inp+$r2]0x88,$D2
+ srlx $D0,$shr,$D0
+ sllx $D1,$shl,$T1
+ srlx $D1,$shr,$D1
+ or $T1,$D0,$D0
+ sllx $D2,$shl,$D2
+ or $D2,$D1,$D1
+
+.Linp_aligned_vis3:
+ addcc $D0,$H0,$H0 ! accumulate input
+ sub $len,1,$len
+ addxccc $D1,$H1,$H1
+ add $inp,16,$inp
+
+ mulx $R0,$H0,$D0 ! r0*h0
+ addxc $padbit,$H2,$H2
+ umulxhi $R0,$H0,$D1
+ mulx $S1,$H1,$T0 ! s1*h1
+ umulxhi $S1,$H1,$T1
+ addcc $T0,$D0,$D0
+ mulx $R1,$H0,$T0 ! r1*h0
+ addxc $T1,$D1,$D1
+ umulxhi $R1,$H0,$D2
+ addcc $T0,$D1,$D1
+ mulx $R0,$H1,$T0 ! r0*h1
+ addxc %g0,$D2,$D2
+ umulxhi $R0,$H1,$T1
+ addcc $T0,$D1,$D1
+ mulx $S1,$H2,$T0 ! s1*h2
+ addxc $T1,$D2,$D2
+ mulx $R0,$H2,$T1 ! r0*h2
+ addcc $T0,$D1,$D1
+ addxc $T1,$D2,$D2
+
+ srlx $D2,2,$T0 ! final reduction step
+ andn $D2,3,$T1
+ and $D2,3,$H2
+ add $T1,$T0,$T0
+
+ addcc $T0,$D0,$H0
+ addxccc %g0,$D1,$H1
+ brnz,pt $len,.Loop_vis3
+ addxc %g0,$H2,$H2
+
+ stx $H0,[$ctx+0] ! store hash value
+ stx $H1,[$ctx+8]
+ st $H2,[$ctx+16]
+
+ ret
+ restore
+.type poly1305_blocks_vis3,#function
+.size poly1305_blocks_vis3,.-poly1305_blocks_vis3
+___
+}
+my ($mac,$nonce) = ($inp,$len);
+
+$code.=<<___;
+.globl poly1305_emit
+.align 32
+poly1305_emit:
+ save %sp,-STACK_FRAME,%sp
+
+ ld [$ctx+0],$h1 ! load hash value
+ ld [$ctx+4],$h0
+ ld [$ctx+8],$h3
+ ld [$ctx+12],$h2
+ ld [$ctx+16],$h4
+
+ addcc $h0,5,$r0 ! compare to modulus
+ addccc $h1,0,$r1
+ addccc $h2,0,$r2
+ addccc $h3,0,$r3
+ addc $h4,0,$h4
+ andcc $h4,4,%g0 ! did it carry/borrow?
+
+ movnz %icc,$r0,$h0
+ ld [$nonce+0],$r0 ! load nonce
+ movnz %icc,$r1,$h1
+ ld [$nonce+4],$r1
+ movnz %icc,$r2,$h2
+ ld [$nonce+8],$r2
+ movnz %icc,$r3,$h3
+ ld [$nonce+12],$r3
+
+ addcc $r0,$h0,$h0 ! accumulate nonce
+ addccc $r1,$h1,$h1
+ addccc $r2,$h2,$h2
+ addc $r3,$h3,$h3
+
+ srl $h0,8,$r0
+ stb $h0,[$mac+0] ! store little-endian result
+ srl $h0,16,$r1
+ stb $r0,[$mac+1]
+ srl $h0,24,$r2
+ stb $r1,[$mac+2]
+ stb $r2,[$mac+3]
+
+ srl $h1,8,$r0
+ stb $h1,[$mac+4]
+ srl $h1,16,$r1
+ stb $r0,[$mac+5]
+ srl $h1,24,$r2
+ stb $r1,[$mac+6]
+ stb $r2,[$mac+7]
+
+ srl $h2,8,$r0
+ stb $h2,[$mac+8]
+ srl $h2,16,$r1
+ stb $r0,[$mac+9]
+ srl $h2,24,$r2
+ stb $r1,[$mac+10]
+ stb $r2,[$mac+11]
+
+ srl $h3,8,$r0
+ stb $h3,[$mac+12]
+ srl $h3,16,$r1
+ stb $r0,[$mac+13]
+ srl $h3,24,$r2
+ stb $r1,[$mac+14]
+ stb $r2,[$mac+15]
+
+ ret
+ restore
+.type poly1305_emit,#function
+.size poly1305_emit,.-poly1305_emit
+___
+
+{
+my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
+my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
+my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
+my $i2=$step;
+
+my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
+ $two0,$two32,$two64,$two96,$two130,$five_two130,
+ $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
+ $s2lo,$s2hi,$s3lo,$s3hi,
+ $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
+# borrowings
+my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
+my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
+my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
+
+$code.=<<___;
+.align 32
+poly1305_init_fma:
+ save %sp,-STACK_FRAME-16,%sp
+ nop
+
+.Lpoly1305_init_fma:
+1: call .+8
+ add %o7,.Lconsts_fma-1b,%o7
+
+ ldd [%o7+8*0],$two0 ! load constants
+ ldd [%o7+8*1],$two32
+ ldd [%o7+8*2],$two64
+ ldd [%o7+8*3],$two96
+ ldd [%o7+8*5],$five_two130
+
+ std $two0,[$ctx+8*0] ! initial hash value, biased 0
+ std $two32,[$ctx+8*1]
+ std $two64,[$ctx+8*2]
+ std $two96,[$ctx+8*3]
+
+ brz,pn $inp,.Lno_key_fma
+ nop
+
+ stx %fsr,[%sp+LOCALS] ! save original %fsr
+ ldx [%o7+8*6],%fsr ! load new %fsr
+
+ std $two0,[$ctx+8*4] ! key "template"
+ std $two32,[$ctx+8*5]
+ std $two64,[$ctx+8*6]
+ std $two96,[$ctx+8*7]
+
+ and $inp,7,$shr
+ andn $inp,7,$inp ! align pointer
+ mov 8,$i1
+ sll $shr,3,$shr
+ mov 16,$i2
+ neg $shr,$shl
+
+ ldxa [$inp+%g0]0x88,$in0 ! load little-endian key
+ ldxa [$inp+$i1]0x88,$in2
+
+ brz $shr,.Lkey_aligned_fma
+ sethi %hi(0xf0000000),$i1 ! 0xf0000000
+
+ ldxa [$inp+$i2]0x88,$in4
+
+ srlx $in0,$shr,$in0 ! align data
+ sllx $in2,$shl,$in1
+ srlx $in2,$shr,$in2
+ or $in1,$in0,$in0
+ sllx $in4,$shl,$in3
+ or $in3,$in2,$in2
+
+.Lkey_aligned_fma:
+ or $i1,3,$i2 ! 0xf0000003
+ srlx $in0,32,$in1
+ andn $in0,$i1,$in0 ! &=0x0fffffff
+ andn $in1,$i2,$in1 ! &=0x0ffffffc
+ srlx $in2,32,$in3
+ andn $in2,$i2,$in2
+ andn $in3,$i2,$in3
+
+ st $in0,[$ctx+`8*4+4`] ! fill "template"
+ st $in1,[$ctx+`8*5+4`]
+ st $in2,[$ctx+`8*6+4`]
+ st $in3,[$ctx+`8*7+4`]
+
+ ldd [$ctx+8*4],$h0lo ! load [biased] key
+ ldd [$ctx+8*5],$h1lo
+ ldd [$ctx+8*6],$h2lo
+ ldd [$ctx+8*7],$h3lo
+
+ fsubd $h0lo,$two0, $h0lo ! r0
+ ldd [%o7+8*7],$two0 ! more constants
+ fsubd $h1lo,$two32,$h1lo ! r1
+ ldd [%o7+8*8],$two32
+ fsubd $h2lo,$two64,$h2lo ! r2
+ ldd [%o7+8*9],$two64
+ fsubd $h3lo,$two96,$h3lo ! r3
+ ldd [%o7+8*10],$two96
+
+ fmuld $five_two130,$h1lo,$s1lo ! s1
+ fmuld $five_two130,$h2lo,$s2lo ! s2
+ fmuld $five_two130,$h3lo,$s3lo ! s3
+
+ faddd $h0lo,$two0, $h0hi
+ faddd $h1lo,$two32,$h1hi
+ faddd $h2lo,$two64,$h2hi
+ faddd $h3lo,$two96,$h3hi
+
+ fsubd $h0hi,$two0, $h0hi
+ ldd [%o7+8*11],$two0 ! more constants
+ fsubd $h1hi,$two32,$h1hi
+ ldd [%o7+8*12],$two32
+ fsubd $h2hi,$two64,$h2hi
+ ldd [%o7+8*13],$two64
+ fsubd $h3hi,$two96,$h3hi
+
+ fsubd $h0lo,$h0hi,$h0lo
+ std $h0hi,[$ctx+8*5] ! r0hi
+ fsubd $h1lo,$h1hi,$h1lo
+ std $h1hi,[$ctx+8*7] ! r1hi
+ fsubd $h2lo,$h2hi,$h2lo
+ std $h2hi,[$ctx+8*9] ! r2hi
+ fsubd $h3lo,$h3hi,$h3lo
+ std $h3hi,[$ctx+8*11] ! r3hi
+
+ faddd $s1lo,$two0, $s1hi
+ faddd $s2lo,$two32,$s2hi
+ faddd $s3lo,$two64,$s3hi
+
+ fsubd $s1hi,$two0, $s1hi
+ fsubd $s2hi,$two32,$s2hi
+ fsubd $s3hi,$two64,$s3hi
+
+ fsubd $s1lo,$s1hi,$s1lo
+ fsubd $s2lo,$s2hi,$s2lo
+ fsubd $s3lo,$s3hi,$s3lo
+
+ ldx [%sp+LOCALS],%fsr ! restore %fsr
+
+ std $h0lo,[$ctx+8*4] ! r0lo
+ std $h1lo,[$ctx+8*6] ! r1lo
+ std $h2lo,[$ctx+8*8] ! r2lo
+ std $h3lo,[$ctx+8*10] ! r3lo
+
+ std $s1hi,[$ctx+8*13]
+ std $s2hi,[$ctx+8*15]
+ std $s3hi,[$ctx+8*17]
+
+ std $s1lo,[$ctx+8*12]
+ std $s2lo,[$ctx+8*14]
+ std $s3lo,[$ctx+8*16]
+
+ add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
+ add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
+ STPTR %o0,[%i2]
+ STPTR %o1,[%i2+SIZE_T]
+
+ ret
+ restore %g0,1,%o0 ! return 1
+
+.Lno_key_fma:
+ ret
+ restore %g0,%g0,%o0 ! return 0
+.type poly1305_init_fma,#function
+.size poly1305_init_fma,.-poly1305_init_fma
+
+.align 32
+poly1305_blocks_fma:
+ save %sp,-STACK_FRAME-48,%sp
+ srln $len,4,$len
+
+ brz,pn $len,.Labort
+ sub $len,1,$len
+
+1: call .+8
+ add %o7,.Lconsts_fma-1b,%o7
+
+ ldd [%o7+8*0],$two0 ! load constants
+ ldd [%o7+8*1],$two32
+ ldd [%o7+8*2],$two64
+ ldd [%o7+8*3],$two96
+ ldd [%o7+8*4],$two130
+ ldd [%o7+8*5],$five_two130
+
+ ldd [$ctx+8*0],$h0lo ! load [biased] hash value
+ ldd [$ctx+8*1],$h1lo
+ ldd [$ctx+8*2],$h2lo
+ ldd [$ctx+8*3],$h3lo
+
+ std $two0,[%sp+LOCALS+8*0] ! input "template"
+ sethi %hi((1023+52+96)<<20),$in3
+ std $two32,[%sp+LOCALS+8*1]
+ or $padbit,$in3,$in3
+ std $two64,[%sp+LOCALS+8*2]
+ st $in3,[%sp+LOCALS+8*3]
+
+ and $inp,7,$shr
+ andn $inp,7,$inp ! align pointer
+ mov 8,$i1
+ sll $shr,3,$shr
+ mov 16,$step
+ neg $shr,$shl
+
+ ldxa [$inp+%g0]0x88,$in0 ! load little-endian input
+ brz $shr,.Linp_aligned_fma
+ ldxa [$inp+$i1]0x88,$in2
+
+ ldxa [$inp+$step]0x88,$in4
+ add $inp,8,$inp
+
+ srlx $in0,$shr,$in0 ! align data
+ sllx $in2,$shl,$in1
+ srlx $in2,$shr,$in2
+ or $in1,$in0,$in0
+ sllx $in4,$shl,$in3
+ srlx $in4,$shr,$in4 ! pre-shift
+ or $in3,$in2,$in2
+
+.Linp_aligned_fma:
+ srlx $in0,32,$in1
+ movrz $len,0,$step
+ srlx $in2,32,$in3
+ add $step,$inp,$inp ! conditional advance
+
+ st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
+ st $in1,[%sp+LOCALS+8*1+4]
+ st $in2,[%sp+LOCALS+8*2+4]
+ st $in3,[%sp+LOCALS+8*3+4]
+
+ ldd [$ctx+8*4],$r0lo ! load key
+ ldd [$ctx+8*5],$r0hi
+ ldd [$ctx+8*6],$r1lo
+ ldd [$ctx+8*7],$r1hi
+ ldd [$ctx+8*8],$r2lo
+ ldd [$ctx+8*9],$r2hi
+ ldd [$ctx+8*10],$r3lo
+ ldd [$ctx+8*11],$r3hi
+ ldd [$ctx+8*12],$s1lo
+ ldd [$ctx+8*13],$s1hi
+ ldd [$ctx+8*14],$s2lo
+ ldd [$ctx+8*15],$s2hi
+ ldd [$ctx+8*16],$s3lo
+ ldd [$ctx+8*17],$s3hi
+
+ stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
+ ldx [%o7+8*6],%fsr ! load new %fsr
+
+ subcc $len,1,$len
+ movrz $len,0,$step
+
+ ldd [%sp+LOCALS+8*0],$x0 ! load biased input
+ ldd [%sp+LOCALS+8*1],$x1
+ ldd [%sp+LOCALS+8*2],$x2
+ ldd [%sp+LOCALS+8*3],$x3
+
+ fsubd $h0lo,$two0, $h0lo ! de-bias hash value
+ fsubd $h1lo,$two32,$h1lo
+ ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
+ fsubd $h2lo,$two64,$h2lo
+ fsubd $h3lo,$two96,$h3lo
+ ldxa [$inp+$i1]0x88,$in2
+
+ fsubd $x0,$two0, $x0 ! de-bias input
+ fsubd $x1,$two32,$x1
+ fsubd $x2,$two64,$x2
+ fsubd $x3,$two96,$x3
+
+ brz $shr,.Linp_aligned_fma2
+ add $step,$inp,$inp ! conditional advance
+
+ sllx $in0,$shl,$in1 ! align data
+ srlx $in0,$shr,$in3
+ or $in1,$in4,$in0
+ sllx $in2,$shl,$in1
+ srlx $in2,$shr,$in4 ! pre-shift
+ or $in3,$in1,$in2
+.Linp_aligned_fma2:
+ srlx $in0,32,$in1
+ srlx $in2,32,$in3
+
+ faddd $h0lo,$x0,$x0 ! accumulate input
+ stw $in0,[%sp+LOCALS+8*0+4]
+ faddd $h1lo,$x1,$x1
+ stw $in1,[%sp+LOCALS+8*1+4]
+ faddd $h2lo,$x2,$x2
+ stw $in2,[%sp+LOCALS+8*2+4]
+ faddd $h3lo,$x3,$x3
+ stw $in3,[%sp+LOCALS+8*3+4]
+
+ b .Lentry_fma
+ nop
+
+.align 16
+.Loop_fma:
+ ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
+ ldxa [$inp+$i1]0x88,$in2
+ movrz $len,0,$step
+
+ faddd $y0,$h0lo,$h0lo ! accumulate input
+ faddd $y1,$h0hi,$h0hi
+ faddd $y2,$h2lo,$h2lo
+ faddd $y3,$h2hi,$h2hi
+
+ brz,pn $shr,.Linp_aligned_fma3
+ add $step,$inp,$inp ! conditional advance
+
+ sllx $in0,$shl,$in1 ! align data
+ srlx $in0,$shr,$in3
+ or $in1,$in4,$in0
+ sllx $in2,$shl,$in1
+ srlx $in2,$shr,$in4 ! pre-shift
+ or $in3,$in1,$in2
+
+.Linp_aligned_fma3:
+ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
+ faddd $two64,$h1lo,$c1lo
+ srlx $in0,32,$in1
+ faddd $two64,$h1hi,$c1hi
+ srlx $in2,32,$in3
+ faddd $two130,$h3lo,$c3lo
+ st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
+ faddd $two130,$h3hi,$c3hi
+ st $in1,[%sp+LOCALS+8*1+4]
+ faddd $two32,$h0lo,$c0lo
+ st $in2,[%sp+LOCALS+8*2+4]
+ faddd $two32,$h0hi,$c0hi
+ st $in3,[%sp+LOCALS+8*3+4]
+ faddd $two96,$h2lo,$c2lo
+ faddd $two96,$h2hi,$c2hi
+
+ fsubd $c1lo,$two64,$c1lo
+ fsubd $c1hi,$two64,$c1hi
+ fsubd $c3lo,$two130,$c3lo
+ fsubd $c3hi,$two130,$c3hi
+ fsubd $c0lo,$two32,$c0lo
+ fsubd $c0hi,$two32,$c0hi
+ fsubd $c2lo,$two96,$c2lo
+ fsubd $c2hi,$two96,$c2hi
+
+ fsubd $h1lo,$c1lo,$h1lo
+ fsubd $h1hi,$c1hi,$h1hi
+ fsubd $h3lo,$c3lo,$h3lo
+ fsubd $h3hi,$c3hi,$h3hi
+ fsubd $h2lo,$c2lo,$h2lo
+ fsubd $h2hi,$c2hi,$h2hi
+ fsubd $h0lo,$c0lo,$h0lo
+ fsubd $h0hi,$c0hi,$h0hi
+
+ faddd $h1lo,$c0lo,$h1lo
+ faddd $h1hi,$c0hi,$h1hi
+ faddd $h3lo,$c2lo,$h3lo
+ faddd $h3hi,$c2hi,$h3hi
+ faddd $h2lo,$c1lo,$h2lo
+ faddd $h2hi,$c1hi,$h2hi
+ fmaddd $five_two130,$c3lo,$h0lo,$h0lo
+ fmaddd $five_two130,$c3hi,$h0hi,$h0hi
+
+ faddd $h1lo,$h1hi,$x1
+ ldd [$ctx+8*12],$s1lo ! reload constants
+ faddd $h3lo,$h3hi,$x3
+ ldd [$ctx+8*13],$s1hi
+ faddd $h2lo,$h2hi,$x2
+ ldd [$ctx+8*10],$r3lo
+ faddd $h0lo,$h0hi,$x0
+ ldd [$ctx+8*11],$r3hi
+
+.Lentry_fma:
+ fmuld $x1,$s3lo,$h0lo
+ fmuld $x1,$s3hi,$h0hi
+ fmuld $x1,$r1lo,$h2lo
+ fmuld $x1,$r1hi,$h2hi
+ fmuld $x1,$r0lo,$h1lo
+ fmuld $x1,$r0hi,$h1hi
+ fmuld $x1,$r2lo,$h3lo
+ fmuld $x1,$r2hi,$h3hi
+
+ fmaddd $x3,$s1lo,$h0lo,$h0lo
+ fmaddd $x3,$s1hi,$h0hi,$h0hi
+ fmaddd $x3,$s3lo,$h2lo,$h2lo
+ fmaddd $x3,$s3hi,$h2hi,$h2hi
+ fmaddd $x3,$s2lo,$h1lo,$h1lo
+ fmaddd $x3,$s2hi,$h1hi,$h1hi
+ fmaddd $x3,$r0lo,$h3lo,$h3lo
+ fmaddd $x3,$r0hi,$h3hi,$h3hi
+
+ fmaddd $x2,$s2lo,$h0lo,$h0lo
+ fmaddd $x2,$s2hi,$h0hi,$h0hi
+ fmaddd $x2,$r0lo,$h2lo,$h2lo
+ fmaddd $x2,$r0hi,$h2hi,$h2hi
+ fmaddd $x2,$s3lo,$h1lo,$h1lo
+ ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input
+ fmaddd $x2,$s3hi,$h1hi,$h1hi
+ ldd [%sp+LOCALS+8*1],$y1
+ fmaddd $x2,$r1lo,$h3lo,$h3lo
+ ldd [%sp+LOCALS+8*2],$y2
+ fmaddd $x2,$r1hi,$h3hi,$h3hi
+ ldd [%sp+LOCALS+8*3],$y3
+
+ fmaddd $x0,$r0lo,$h0lo,$h0lo
+ fsubd $y0,$two0, $y0 ! de-bias input
+ fmaddd $x0,$r0hi,$h0hi,$h0hi
+ fsubd $y1,$two32,$y1
+ fmaddd $x0,$r2lo,$h2lo,$h2lo
+ fsubd $y2,$two64,$y2
+ fmaddd $x0,$r2hi,$h2hi,$h2hi
+ fsubd $y3,$two96,$y3
+ fmaddd $x0,$r1lo,$h1lo,$h1lo
+ fmaddd $x0,$r1hi,$h1hi,$h1hi
+ fmaddd $x0,$r3lo,$h3lo,$h3lo
+ fmaddd $x0,$r3hi,$h3hi,$h3hi
+
+ bcc SIZE_T_CC,.Loop_fma
+ subcc $len,1,$len
+
+ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
+ faddd $h0lo,$two32,$c0lo
+ faddd $h0hi,$two32,$c0hi
+ faddd $h2lo,$two96,$c2lo
+ faddd $h2hi,$two96,$c2hi
+ faddd $h1lo,$two64,$c1lo
+ faddd $h1hi,$two64,$c1hi
+ faddd $h3lo,$two130,$c3lo
+ faddd $h3hi,$two130,$c3hi
+
+ fsubd $c0lo,$two32,$c0lo
+ fsubd $c0hi,$two32,$c0hi
+ fsubd $c2lo,$two96,$c2lo
+ fsubd $c2hi,$two96,$c2hi
+ fsubd $c1lo,$two64,$c1lo
+ fsubd $c1hi,$two64,$c1hi
+ fsubd $c3lo,$two130,$c3lo
+ fsubd $c3hi,$two130,$c3hi
+
+ fsubd $h1lo,$c1lo,$h1lo
+ fsubd $h1hi,$c1hi,$h1hi
+ fsubd $h3lo,$c3lo,$h3lo
+ fsubd $h3hi,$c3hi,$h3hi
+ fsubd $h2lo,$c2lo,$h2lo
+ fsubd $h2hi,$c2hi,$h2hi
+ fsubd $h0lo,$c0lo,$h0lo
+ fsubd $h0hi,$c0hi,$h0hi
+
+ faddd $h1lo,$c0lo,$h1lo
+ faddd $h1hi,$c0hi,$h1hi
+ faddd $h3lo,$c2lo,$h3lo
+ faddd $h3hi,$c2hi,$h3hi
+ faddd $h2lo,$c1lo,$h2lo
+ faddd $h2hi,$c1hi,$h2hi
+ fmaddd $five_two130,$c3lo,$h0lo,$h0lo
+ fmaddd $five_two130,$c3hi,$h0hi,$h0hi
+
+ faddd $h1lo,$h1hi,$x1
+ faddd $h3lo,$h3hi,$x3
+ faddd $h2lo,$h2hi,$x2
+ faddd $h0lo,$h0hi,$x0
+
+ faddd $x1,$two32,$x1 ! bias
+ faddd $x3,$two96,$x3
+ faddd $x2,$two64,$x2
+ faddd $x0,$two0, $x0
+
+ ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr
+
+ std $x1,[$ctx+8*1] ! store [biased] hash value
+ std $x3,[$ctx+8*3]
+ std $x2,[$ctx+8*2]
+ std $x0,[$ctx+8*0]
+
+.Labort:
+ ret
+ restore
+.type poly1305_blocks_fma,#function
+.size poly1305_blocks_fma,.-poly1305_blocks_fma
+___
+{
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
+ ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
+
+$code.=<<___;
+.align 32
+poly1305_emit_fma:
+ save %sp,-STACK_FRAME,%sp
+
+ ld [$ctx+8*0+0],$d0 ! load hash
+ ld [$ctx+8*0+4],$h0
+ ld [$ctx+8*1+0],$d1
+ ld [$ctx+8*1+4],$h1
+ ld [$ctx+8*2+0],$d2
+ ld [$ctx+8*2+4],$h2
+ ld [$ctx+8*3+0],$d3
+ ld [$ctx+8*3+4],$h3
+
+ sethi %hi(0xfff00000),$mask
+ andn $d0,$mask,$d0 ! mask exponent
+ andn $d1,$mask,$d1
+ andn $d2,$mask,$d2
+ andn $d3,$mask,$d3 ! can be partially reduced...
+ mov 3,$mask
+
+ srl $d3,2,$padbit ! ... so reduce
+ and $d3,$mask,$h4
+ andn $d3,$mask,$d3
+ add $padbit,$d3,$d3
+
+ addcc $d3,$h0,$h0
+ addccc $d0,$h1,$h1
+ addccc $d1,$h2,$h2
+ addccc $d2,$h3,$h3
+ addc %g0,$h4,$h4
+
+ addcc $h0,5,$d0 ! compare to modulus
+ addccc $h1,0,$d1
+ addccc $h2,0,$d2
+ addccc $h3,0,$d3
+ addc $h4,0,$mask
+
+ srl $mask,2,$mask ! did it carry/borrow?
+ neg $mask,$mask
+ sra $mask,31,$mask ! mask
+
+ andn $h0,$mask,$h0
+ and $d0,$mask,$d0
+ andn $h1,$mask,$h1
+ and $d1,$mask,$d1
+ or $d0,$h0,$h0
+ ld [$nonce+0],$d0 ! load nonce
+ andn $h2,$mask,$h2
+ and $d2,$mask,$d2
+ or $d1,$h1,$h1
+ ld [$nonce+4],$d1
+ andn $h3,$mask,$h3
+ and $d3,$mask,$d3
+ or $d2,$h2,$h2
+ ld [$nonce+8],$d2
+ or $d3,$h3,$h3
+ ld [$nonce+12],$d3
+
+ addcc $d0,$h0,$h0 ! accumulate nonce
+ addccc $d1,$h1,$h1
+ addccc $d2,$h2,$h2
+ addc $d3,$h3,$h3
+
+ stb $h0,[$mac+0] ! write little-endian result
+ srl $h0,8,$h0
+ stb $h1,[$mac+4]
+ srl $h1,8,$h1
+ stb $h2,[$mac+8]
+ srl $h2,8,$h2
+ stb $h3,[$mac+12]
+ srl $h3,8,$h3
+
+ stb $h0,[$mac+1]
+ srl $h0,8,$h0
+ stb $h1,[$mac+5]
+ srl $h1,8,$h1
+ stb $h2,[$mac+9]
+ srl $h2,8,$h2
+ stb $h3,[$mac+13]
+ srl $h3,8,$h3
+
+ stb $h0,[$mac+2]
+ srl $h0,8,$h0
+ stb $h1,[$mac+6]
+ srl $h1,8,$h1
+ stb $h2,[$mac+10]
+ srl $h2,8,$h2
+ stb $h3,[$mac+14]
+ srl $h3,8,$h3
+
+ stb $h0,[$mac+3]
+ stb $h1,[$mac+7]
+ stb $h2,[$mac+11]
+ stb $h3,[$mac+15]
+
+ ret
+ restore
+.type poly1305_emit_fma,#function
+.size poly1305_emit_fma,.-poly1305_emit_fma
+___
+}
+
+$code.=<<___;
+.align 64
+.Lconsts_fma:
+.word 0x43300000,0x00000000 ! 2^(52+0)
+.word 0x45300000,0x00000000 ! 2^(52+32)
+.word 0x47300000,0x00000000 ! 2^(52+64)
+.word 0x49300000,0x00000000 ! 2^(52+96)
+.word 0x4b500000,0x00000000 ! 2^(52+130)
+
+.word 0x37f40000,0x00000000 ! 5/2^130
+.word 0,1<<30 ! fsr: truncate, no exceptions
+
+.word 0x44300000,0x00000000 ! 2^(52+16+0)
+.word 0x46300000,0x00000000 ! 2^(52+16+32)
+.word 0x48300000,0x00000000 ! 2^(52+16+64)
+.word 0x4a300000,0x00000000 ! 2^(52+16+96)
+.word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
+.word 0x40300000,0x00000000 ! 2^(52+16+32-96)
+.word 0x42300000,0x00000000 ! 2^(52+16+64-96)
+.asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+}
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve for option to produce "universal" binary and let
+# programmer detect if current CPU is VIS capable at run-time.
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = ( "addxc" => 0x011,
+ "addxccc" => 0x013,
+ "umulxhi" => 0x016 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%([goli])([0-9])/);
+ $_=$bias{$1}+$2;
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub unfma {
+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
+my ($ref,$opf);
+my %fmaopf = ( "fmadds" => 0x1,
+ "fmaddd" => 0x2,
+ "fmsubs" => 0x5,
+ "fmsubd" => 0x6 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
+
+ if ($opf=$fmaopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rs3,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unvis3($1,$2,$3,$4)
+ /ge or
+ s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
+ &unfma($1,$2,$3,$4,$5)
+ /ge;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/poly1305/asm/poly1305-x86.pl b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-x86.pl
new file mode 100755
index 0000000..ab24dfc
--- /dev/null
+++ b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-x86.pl
@@ -0,0 +1,1814 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for x86.
+#
+# April 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone,
+# measured with rdtsc at fixed clock frequency.
+#
+# IALU/gcc-3.4(*) SSE2(**) AVX2
+# Pentium 15.7/+80% -
+# PIII 6.21/+90% -
+# P4 19.8/+40% 3.24
+# Core 2 4.85/+90% 1.80
+# Westmere 4.58/+100% 1.43
+# Sandy Bridge 3.90/+100% 1.36
+# Haswell 3.88/+70% 1.18 0.72
+# Silvermont 11.0/+40% 4.80
+# Goldmont 4.10/+200% 2.10
+# VIA Nano 6.71/+90% 2.47
+# Sledgehammer 3.51/+180% 4.27
+# Bulldozer 4.53/+140% 1.31
+#
+# (*) gcc 4.8 for some reason generated worse code;
+# (**) besides SSE2 there are floating-point and AVX options; FP
+# is deemed unnecessary, because pre-SSE2 processor are too
+# old to care about, while it's not the fastest option on
+# SSE2-capable ones; AVX is omitted, because it doesn't give
+# a lot of improvement, 5-10% depending on processor;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],"poly1305-x86.pl",$ARGV[$#ARGV] eq "386");
+
+$sse2=$avx=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+if ($sse2) {
+ &static_label("const_sse2");
+ &static_label("enter_blocks");
+ &static_label("enter_emit");
+ &external_label("OPENSSL_ia32cap_P");
+
+ if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+ }
+
+ if (!$avx && $ARGV[0] eq "win32n" &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+ }
+
+ if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+ }
+}
+
+########################################################################
+# Layout of opaque area is following.
+#
+# unsigned __int32 h[5]; # current hash value base 2^32
+# unsigned __int32 pad; # is_base2_26 in vector context
+# unsigned __int32 r[4]; # key value base 2^32
+
+&align(64);
+&function_begin("poly1305_init");
+ &mov ("edi",&wparam(0)); # context
+ &mov ("esi",&wparam(1)); # key
+ &mov ("ebp",&wparam(2)); # function table
+
+ &xor ("eax","eax");
+ &mov (&DWP(4*0,"edi"),"eax"); # zero hash value
+ &mov (&DWP(4*1,"edi"),"eax");
+ &mov (&DWP(4*2,"edi"),"eax");
+ &mov (&DWP(4*3,"edi"),"eax");
+ &mov (&DWP(4*4,"edi"),"eax");
+ &mov (&DWP(4*5,"edi"),"eax"); # is_base2_26
+
+ &cmp ("esi",0);
+ &je (&label("nokey"));
+
+ if ($sse2) {
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("ebx");
+
+ &lea ("eax",&DWP("poly1305_blocks-".&label("pic_point"),"ebx"));
+ &lea ("edx",&DWP("poly1305_emit-".&label("pic_point"),"ebx"));
+
+ &picmeup("edi","OPENSSL_ia32cap_P","ebx",&label("pic_point"));
+ &mov ("ecx",&DWP(0,"edi"));
+ &and ("ecx",1<<26|1<<24);
+ &cmp ("ecx",1<<26|1<<24); # SSE2 and XMM?
+ &jne (&label("no_sse2"));
+
+ &lea ("eax",&DWP("_poly1305_blocks_sse2-".&label("pic_point"),"ebx"));
+ &lea ("edx",&DWP("_poly1305_emit_sse2-".&label("pic_point"),"ebx"));
+
+ if ($avx>1) {
+ &mov ("ecx",&DWP(8,"edi"));
+ &test ("ecx",1<<5); # AVX2?
+ &jz (&label("no_sse2"));
+
+ &lea ("eax",&DWP("_poly1305_blocks_avx2-".&label("pic_point"),"ebx"));
+ }
+ &set_label("no_sse2");
+ &mov ("edi",&wparam(0)); # reload context
+ &mov (&DWP(0,"ebp"),"eax"); # fill function table
+ &mov (&DWP(4,"ebp"),"edx");
+ }
+
+ &mov ("eax",&DWP(4*0,"esi")); # load input key
+ &mov ("ebx",&DWP(4*1,"esi"));
+ &mov ("ecx",&DWP(4*2,"esi"));
+ &mov ("edx",&DWP(4*3,"esi"));
+ &and ("eax",0x0fffffff);
+ &and ("ebx",0x0ffffffc);
+ &and ("ecx",0x0ffffffc);
+ &and ("edx",0x0ffffffc);
+ &mov (&DWP(4*6,"edi"),"eax");
+ &mov (&DWP(4*7,"edi"),"ebx");
+ &mov (&DWP(4*8,"edi"),"ecx");
+ &mov (&DWP(4*9,"edi"),"edx");
+
+ &mov ("eax",$sse2);
+&set_label("nokey");
+&function_end("poly1305_init");
+
+($h0,$h1,$h2,$h3,$h4,
+ $d0,$d1,$d2,$d3,
+ $r0,$r1,$r2,$r3,
+ $s1,$s2,$s3)=map(4*$_,(0..15));
+
+&function_begin("poly1305_blocks");
+ &mov ("edi",&wparam(0)); # ctx
+ &mov ("esi",&wparam(1)); # inp
+ &mov ("ecx",&wparam(2)); # len
+&set_label("enter_blocks");
+ &and ("ecx",-15);
+ &jz (&label("nodata"));
+
+ &stack_push(16);
+ &mov ("eax",&DWP(4*6,"edi")); # r0
+ &mov ("ebx",&DWP(4*7,"edi")); # r1
+ &lea ("ebp",&DWP(0,"esi","ecx")); # end of input
+ &mov ("ecx",&DWP(4*8,"edi")); # r2
+ &mov ("edx",&DWP(4*9,"edi")); # r3
+
+ &mov (&wparam(2),"ebp");
+ &mov ("ebp","esi");
+
+ &mov (&DWP($r0,"esp"),"eax"); # r0
+ &mov ("eax","ebx");
+ &shr ("eax",2);
+ &mov (&DWP($r1,"esp"),"ebx"); # r1
+ &add ("eax","ebx"); # s1
+ &mov ("ebx","ecx");
+ &shr ("ebx",2);
+ &mov (&DWP($r2,"esp"),"ecx"); # r2
+ &add ("ebx","ecx"); # s2
+ &mov ("ecx","edx");
+ &shr ("ecx",2);
+ &mov (&DWP($r3,"esp"),"edx"); # r3
+ &add ("ecx","edx"); # s3
+ &mov (&DWP($s1,"esp"),"eax"); # s1
+ &mov (&DWP($s2,"esp"),"ebx"); # s2
+ &mov (&DWP($s3,"esp"),"ecx"); # s3
+
+ &mov ("eax",&DWP(4*0,"edi")); # load hash value
+ &mov ("ebx",&DWP(4*1,"edi"));
+ &mov ("ecx",&DWP(4*2,"edi"));
+ &mov ("esi",&DWP(4*3,"edi"));
+ &mov ("edi",&DWP(4*4,"edi"));
+ &jmp (&label("loop"));
+
+&set_label("loop",32);
+ &add ("eax",&DWP(4*0,"ebp")); # accumulate input
+ &adc ("ebx",&DWP(4*1,"ebp"));
+ &adc ("ecx",&DWP(4*2,"ebp"));
+ &adc ("esi",&DWP(4*3,"ebp"));
+ &lea ("ebp",&DWP(4*4,"ebp"));
+ &adc ("edi",&wparam(3)); # padbit
+
+ &mov (&DWP($h0,"esp"),"eax"); # put aside hash[+inp]
+ &mov (&DWP($h3,"esp"),"esi");
+
+ &mul (&DWP($r0,"esp")); # h0*r0
+ &mov (&DWP($h4,"esp"),"edi");
+ &mov ("edi","eax");
+ &mov ("eax","ebx"); # h1
+ &mov ("esi","edx");
+ &mul (&DWP($s3,"esp")); # h1*s3
+ &add ("edi","eax");
+ &mov ("eax","ecx"); # h2
+ &adc ("esi","edx");
+ &mul (&DWP($s2,"esp")); # h2*s2
+ &add ("edi","eax");
+ &mov ("eax",&DWP($h3,"esp"));
+ &adc ("esi","edx");
+ &mul (&DWP($s1,"esp")); # h3*s1
+ &add ("edi","eax");
+ &mov ("eax",&DWP($h0,"esp"));
+ &adc ("esi","edx");
+
+ &mul (&DWP($r1,"esp")); # h0*r1
+ &mov (&DWP($d0,"esp"),"edi");
+ &xor ("edi","edi");
+ &add ("esi","eax");
+ &mov ("eax","ebx"); # h1
+ &adc ("edi","edx");
+ &mul (&DWP($r0,"esp")); # h1*r0
+ &add ("esi","eax");
+ &mov ("eax","ecx"); # h2
+ &adc ("edi","edx");
+ &mul (&DWP($s3,"esp")); # h2*s3
+ &add ("esi","eax");
+ &mov ("eax",&DWP($h3,"esp"));
+ &adc ("edi","edx");
+ &mul (&DWP($s2,"esp")); # h3*s2
+ &add ("esi","eax");
+ &mov ("eax",&DWP($h4,"esp"));
+ &adc ("edi","edx");
+ &imul ("eax",&DWP($s1,"esp")); # h4*s1
+ &add ("esi","eax");
+ &mov ("eax",&DWP($h0,"esp"));
+ &adc ("edi",0);
+
+ &mul (&DWP($r2,"esp")); # h0*r2
+ &mov (&DWP($d1,"esp"),"esi");
+ &xor ("esi","esi");
+ &add ("edi","eax");
+ &mov ("eax","ebx"); # h1
+ &adc ("esi","edx");
+ &mul (&DWP($r1,"esp")); # h1*r1
+ &add ("edi","eax");
+ &mov ("eax","ecx"); # h2
+ &adc ("esi","edx");
+ &mul (&DWP($r0,"esp")); # h2*r0
+ &add ("edi","eax");
+ &mov ("eax",&DWP($h3,"esp"));
+ &adc ("esi","edx");
+ &mul (&DWP($s3,"esp")); # h3*s3
+ &add ("edi","eax");
+ &mov ("eax",&DWP($h4,"esp"));
+ &adc ("esi","edx");
+ &imul ("eax",&DWP($s2,"esp")); # h4*s2
+ &add ("edi","eax");
+ &mov ("eax",&DWP($h0,"esp"));
+ &adc ("esi",0);
+
+ &mul (&DWP($r3,"esp")); # h0*r3
+ &mov (&DWP($d2,"esp"),"edi");
+ &xor ("edi","edi");
+ &add ("esi","eax");
+ &mov ("eax","ebx"); # h1
+ &adc ("edi","edx");
+ &mul (&DWP($r2,"esp")); # h1*r2
+ &add ("esi","eax");
+ &mov ("eax","ecx"); # h2
+ &adc ("edi","edx");
+ &mul (&DWP($r1,"esp")); # h2*r1
+ &add ("esi","eax");
+ &mov ("eax",&DWP($h3,"esp"));
+ &adc ("edi","edx");
+ &mul (&DWP($r0,"esp")); # h3*r0
+ &add ("esi","eax");
+ &mov ("ecx",&DWP($h4,"esp"));
+ &adc ("edi","edx");
+
+ &mov ("edx","ecx");
+ &imul ("ecx",&DWP($s3,"esp")); # h4*s3
+ &add ("esi","ecx");
+ &mov ("eax",&DWP($d0,"esp"));
+ &adc ("edi",0);
+
+ &imul ("edx",&DWP($r0,"esp")); # h4*r0
+ &add ("edx","edi");
+
+ &mov ("ebx",&DWP($d1,"esp"));
+ &mov ("ecx",&DWP($d2,"esp"));
+
+ &mov ("edi","edx"); # last reduction step
+ &shr ("edx",2);
+ &and ("edi",3);
+ &lea ("edx",&DWP(0,"edx","edx",4)); # *5
+ &add ("eax","edx");
+ &adc ("ebx",0);
+ &adc ("ecx",0);
+ &adc ("esi",0);
+ &adc ("edi",0);
+
+ &cmp ("ebp",&wparam(2)); # done yet?
+ &jne (&label("loop"));
+
+ &mov ("edx",&wparam(0)); # ctx
+ &stack_pop(16);
+ &mov (&DWP(4*0,"edx"),"eax"); # store hash value
+ &mov (&DWP(4*1,"edx"),"ebx");
+ &mov (&DWP(4*2,"edx"),"ecx");
+ &mov (&DWP(4*3,"edx"),"esi");
+ &mov (&DWP(4*4,"edx"),"edi");
+&set_label("nodata");
+&function_end("poly1305_blocks");
+
+&function_begin("poly1305_emit");
+ &mov ("ebp",&wparam(0)); # context
+&set_label("enter_emit");
+ &mov ("edi",&wparam(1)); # output
+ &mov ("eax",&DWP(4*0,"ebp")); # load hash value
+ &mov ("ebx",&DWP(4*1,"ebp"));
+ &mov ("ecx",&DWP(4*2,"ebp"));
+ &mov ("edx",&DWP(4*3,"ebp"));
+ &mov ("esi",&DWP(4*4,"ebp"));
+
+ &add ("eax",5); # compare to modulus
+ &adc ("ebx",0);
+ &adc ("ecx",0);
+ &adc ("edx",0);
+ &adc ("esi",0);
+ &shr ("esi",2); # did it carry/borrow?
+ &neg ("esi"); # do we choose hash-modulus?
+
+ &and ("eax","esi");
+ &and ("ebx","esi");
+ &and ("ecx","esi");
+ &and ("edx","esi");
+ &mov (&DWP(4*0,"edi"),"eax");
+ &mov (&DWP(4*1,"edi"),"ebx");
+ &mov (&DWP(4*2,"edi"),"ecx");
+ &mov (&DWP(4*3,"edi"),"edx");
+
+ &not ("esi"); # or original hash value?
+ &mov ("eax",&DWP(4*0,"ebp"));
+ &mov ("ebx",&DWP(4*1,"ebp"));
+ &mov ("ecx",&DWP(4*2,"ebp"));
+ &mov ("edx",&DWP(4*3,"ebp"));
+ &mov ("ebp",&wparam(2));
+ &and ("eax","esi");
+ &and ("ebx","esi");
+ &and ("ecx","esi");
+ &and ("edx","esi");
+ &or ("eax",&DWP(4*0,"edi"));
+ &or ("ebx",&DWP(4*1,"edi"));
+ &or ("ecx",&DWP(4*2,"edi"));
+ &or ("edx",&DWP(4*3,"edi"));
+
+ &add ("eax",&DWP(4*0,"ebp")); # accumulate key
+ &adc ("ebx",&DWP(4*1,"ebp"));
+ &adc ("ecx",&DWP(4*2,"ebp"));
+ &adc ("edx",&DWP(4*3,"ebp"));
+
+ &mov (&DWP(4*0,"edi"),"eax");
+ &mov (&DWP(4*1,"edi"),"ebx");
+ &mov (&DWP(4*2,"edi"),"ecx");
+ &mov (&DWP(4*3,"edi"),"edx");
+&function_end("poly1305_emit");
+
+if ($sse2) {
+########################################################################
+# Layout of opaque area is following.
+#
+# unsigned __int32 h[5]; # current hash value base 2^26
+# unsigned __int32 is_base2_26;
+# unsigned __int32 r[4]; # key value base 2^32
+# unsigned __int32 pad[2];
+# struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9];
+#
+# where r^n are base 2^26 digits of degrees of multiplier key. There are
+# 5 digits, but last four are interleaved with multiples of 5, totalling
+# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
+
+my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
+my $MASK=$T2; # borrow and keep in mind
+
+&align (32);
+&function_begin_B("_poly1305_init_sse2");
+ &movdqu ($D4,&QWP(4*6,"edi")); # key base 2^32
+ &lea ("edi",&DWP(16*3,"edi")); # size optimization
+ &mov ("ebp","esp");
+ &sub ("esp",16*(9+5));
+ &and ("esp",-16);
+
+ #&pand ($D4,&QWP(96,"ebx")); # magic mask
+ &movq ($MASK,&QWP(64,"ebx"));
+
+ &movdqa ($D0,$D4);
+ &movdqa ($D1,$D4);
+ &movdqa ($D2,$D4);
+
+ &pand ($D0,$MASK); # -> base 2^26
+ &psrlq ($D1,26);
+ &psrldq ($D2,6);
+ &pand ($D1,$MASK);
+ &movdqa ($D3,$D2);
+ &psrlq ($D2,4)
+ &psrlq ($D3,30);
+ &pand ($D2,$MASK);
+ &pand ($D3,$MASK);
+ &psrldq ($D4,13);
+
+ &lea ("edx",&DWP(16*9,"esp")); # size optimization
+ &mov ("ecx",2);
+&set_label("square");
+ &movdqa (&QWP(16*0,"esp"),$D0);
+ &movdqa (&QWP(16*1,"esp"),$D1);
+ &movdqa (&QWP(16*2,"esp"),$D2);
+ &movdqa (&QWP(16*3,"esp"),$D3);
+ &movdqa (&QWP(16*4,"esp"),$D4);
+
+ &movdqa ($T1,$D1);
+ &movdqa ($T0,$D2);
+ &pslld ($T1,2);
+ &pslld ($T0,2);
+ &paddd ($T1,$D1); # *5
+ &paddd ($T0,$D2); # *5
+ &movdqa (&QWP(16*5,"esp"),$T1);
+ &movdqa (&QWP(16*6,"esp"),$T0);
+ &movdqa ($T1,$D3);
+ &movdqa ($T0,$D4);
+ &pslld ($T1,2);
+ &pslld ($T0,2);
+ &paddd ($T1,$D3); # *5
+ &paddd ($T0,$D4); # *5
+ &movdqa (&QWP(16*7,"esp"),$T1);
+ &movdqa (&QWP(16*8,"esp"),$T0);
+
+ &pshufd ($T1,$D0,0b01000100);
+ &movdqa ($T0,$D1);
+ &pshufd ($D1,$D1,0b01000100);
+ &pshufd ($D2,$D2,0b01000100);
+ &pshufd ($D3,$D3,0b01000100);
+ &pshufd ($D4,$D4,0b01000100);
+ &movdqa (&QWP(16*0,"edx"),$T1);
+ &movdqa (&QWP(16*1,"edx"),$D1);
+ &movdqa (&QWP(16*2,"edx"),$D2);
+ &movdqa (&QWP(16*3,"edx"),$D3);
+ &movdqa (&QWP(16*4,"edx"),$D4);
+
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ &pmuludq ($D4,$D0); # h4*r0
+ &pmuludq ($D3,$D0); # h3*r0
+ &pmuludq ($D2,$D0); # h2*r0
+ &pmuludq ($D1,$D0); # h1*r0
+ &pmuludq ($D0,$T1); # h0*r0
+
+sub pmuladd {
+my $load = shift;
+my $base = shift; $base = "esp" if (!defined($base));
+
+ ################################################################
+ # As for choice to "rotate" $T0-$T2 in order to move paddq
+ # past next multiplication. While it makes code harder to read
+ # and doesn't have significant effect on most processors, it
+ # makes a lot of difference on Atom, up to 30% improvement.
+
+ &movdqa ($T1,$T0);
+ &pmuludq ($T0,&QWP(16*3,$base)); # r1*h3
+ &movdqa ($T2,$T1);
+ &pmuludq ($T1,&QWP(16*2,$base)); # r1*h2
+ &paddq ($D4,$T0);
+ &movdqa ($T0,$T2);
+ &pmuludq ($T2,&QWP(16*1,$base)); # r1*h1
+ &paddq ($D3,$T1);
+ &$load ($T1,5); # s1
+ &pmuludq ($T0,&QWP(16*0,$base)); # r1*h0
+ &paddq ($D2,$T2);
+ &pmuludq ($T1,&QWP(16*4,$base)); # s1*h4
+ &$load ($T2,2); # r2^n
+ &paddq ($D1,$T0);
+
+ &movdqa ($T0,$T2);
+ &pmuludq ($T2,&QWP(16*2,$base)); # r2*h2
+ &paddq ($D0,$T1);
+ &movdqa ($T1,$T0);
+ &pmuludq ($T0,&QWP(16*1,$base)); # r2*h1
+ &paddq ($D4,$T2);
+ &$load ($T2,6); # s2^n
+ &pmuludq ($T1,&QWP(16*0,$base)); # r2*h0
+ &paddq ($D3,$T0);
+ &movdqa ($T0,$T2);
+ &pmuludq ($T2,&QWP(16*4,$base)); # s2*h4
+ &paddq ($D2,$T1);
+ &pmuludq ($T0,&QWP(16*3,$base)); # s2*h3
+ &$load ($T1,3); # r3^n
+ &paddq ($D1,$T2);
+
+ &movdqa ($T2,$T1);
+ &pmuludq ($T1,&QWP(16*1,$base)); # r3*h1
+ &paddq ($D0,$T0);
+ &$load ($T0,7); # s3^n
+ &pmuludq ($T2,&QWP(16*0,$base)); # r3*h0
+ &paddq ($D4,$T1);
+ &movdqa ($T1,$T0);
+ &pmuludq ($T0,&QWP(16*4,$base)); # s3*h4
+ &paddq ($D3,$T2);
+ &movdqa ($T2,$T1);
+ &pmuludq ($T1,&QWP(16*3,$base)); # s3*h3
+ &paddq ($D2,$T0);
+ &pmuludq ($T2,&QWP(16*2,$base)); # s3*h2
+ &$load ($T0,4); # r4^n
+ &paddq ($D1,$T1);
+
+ &$load ($T1,8); # s4^n
+ &pmuludq ($T0,&QWP(16*0,$base)); # r4*h0
+ &paddq ($D0,$T2);
+ &movdqa ($T2,$T1);
+ &pmuludq ($T1,&QWP(16*4,$base)); # s4*h4
+ &paddq ($D4,$T0);
+ &movdqa ($T0,$T2);
+ &pmuludq ($T2,&QWP(16*1,$base)); # s4*h1
+ &paddq ($D3,$T1);
+ &movdqa ($T1,$T0);
+ &pmuludq ($T0,&QWP(16*2,$base)); # s4*h2
+ &paddq ($D0,$T2);
+ &pmuludq ($T1,&QWP(16*3,$base)); # s4*h3
+ &movdqa ($MASK,&QWP(64,"ebx"));
+ &paddq ($D1,$T0);
+ &paddq ($D2,$T1);
+}
+ &pmuladd (sub { my ($reg,$i)=@_;
+ &movdqa ($reg,&QWP(16*$i,"esp"));
+ },"edx");
+
+sub lazy_reduction {
+my $extra = shift;
+
+ ################################################################
+ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ # and P. Schwabe
+ #
+ # [(*) see discussion in poly1305-armv4 module]
+
+ &movdqa ($T0,$D3);
+ &pand ($D3,$MASK);
+ &psrlq ($T0,26);
+ &$extra () if (defined($extra));
+ &paddq ($T0,$D4); # h3 -> h4
+ &movdqa ($T1,$D0);
+ &pand ($D0,$MASK);
+ &psrlq ($T1,26);
+ &movdqa ($D4,$T0);
+ &paddq ($T1,$D1); # h0 -> h1
+ &psrlq ($T0,26);
+ &pand ($D4,$MASK);
+ &movdqa ($D1,$T1);
+ &psrlq ($T1,26);
+ &paddd ($D0,$T0); # favour paddd when
+ # possible, because
+ # paddq is "broken"
+ # on Atom
+ &psllq ($T0,2);
+ &paddq ($T1,$D2); # h1 -> h2
+ &paddq ($T0,$D0); # h4 -> h0 (*)
+ &pand ($D1,$MASK);
+ &movdqa ($D2,$T1);
+ &psrlq ($T1,26);
+ &pand ($D2,$MASK);
+ &paddd ($T1,$D3); # h2 -> h3
+ &movdqa ($D0,$T0);
+ &psrlq ($T0,26);
+ &movdqa ($D3,$T1);
+ &psrlq ($T1,26);
+ &pand ($D0,$MASK);
+ &paddd ($D1,$T0); # h0 -> h1
+ &pand ($D3,$MASK);
+ &paddd ($D4,$T1); # h3 -> h4
+}
+ &lazy_reduction ();
+
+ &dec ("ecx");
+ &jz (&label("square_break"));
+
+ &punpcklqdq ($D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2
+ &punpcklqdq ($D1,&QWP(16*1,"esp"));
+ &punpcklqdq ($D2,&QWP(16*2,"esp"));
+ &punpcklqdq ($D3,&QWP(16*3,"esp"));
+ &punpcklqdq ($D4,&QWP(16*4,"esp"));
+ &jmp (&label("square"));
+
+&set_label("square_break");
+ &psllq ($D0,32); # -> r^3:0:r^4:0
+ &psllq ($D1,32);
+ &psllq ($D2,32);
+ &psllq ($D3,32);
+ &psllq ($D4,32);
+ &por ($D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2
+ &por ($D1,&QWP(16*1,"esp"));
+ &por ($D2,&QWP(16*2,"esp"));
+ &por ($D3,&QWP(16*3,"esp"));
+ &por ($D4,&QWP(16*4,"esp"));
+
+ &pshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4
+ &pshufd ($D1,$D1,0b10001101);
+ &pshufd ($D2,$D2,0b10001101);
+ &pshufd ($D3,$D3,0b10001101);
+ &pshufd ($D4,$D4,0b10001101);
+
+ &movdqu (&QWP(16*0,"edi"),$D0); # save the table
+ &movdqu (&QWP(16*1,"edi"),$D1);
+ &movdqu (&QWP(16*2,"edi"),$D2);
+ &movdqu (&QWP(16*3,"edi"),$D3);
+ &movdqu (&QWP(16*4,"edi"),$D4);
+
+ &movdqa ($T1,$D1);
+ &movdqa ($T0,$D2);
+ &pslld ($T1,2);
+ &pslld ($T0,2);
+ &paddd ($T1,$D1); # *5
+ &paddd ($T0,$D2); # *5
+ &movdqu (&QWP(16*5,"edi"),$T1);
+ &movdqu (&QWP(16*6,"edi"),$T0);
+ &movdqa ($T1,$D3);
+ &movdqa ($T0,$D4);
+ &pslld ($T1,2);
+ &pslld ($T0,2);
+ &paddd ($T1,$D3); # *5
+ &paddd ($T0,$D4); # *5
+ &movdqu (&QWP(16*7,"edi"),$T1);
+ &movdqu (&QWP(16*8,"edi"),$T0);
+
+ &mov ("esp","ebp");
+ &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization
+ &ret ();
+&function_end_B("_poly1305_init_sse2");
+
+&align (32);
+&function_begin("_poly1305_blocks_sse2");
+ &mov ("edi",&wparam(0)); # ctx
+ &mov ("esi",&wparam(1)); # inp
+ &mov ("ecx",&wparam(2)); # len
+
+ &mov ("eax",&DWP(4*5,"edi")); # is_base2_26
+ &and ("ecx",-16);
+ &jz (&label("nodata"));
+ &cmp ("ecx",64);
+ &jae (&label("enter_sse2"));
+ &test ("eax","eax"); # is_base2_26?
+ &jz (&label("enter_blocks"));
+
+&set_label("enter_sse2",16);
+ &call (&label("pic_point"));
+&set_label("pic_point");
+ &blindpop("ebx");
+ &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
+
+ &test ("eax","eax"); # is_base2_26?
+ &jnz (&label("base2_26"));
+
+ &call ("_poly1305_init_sse2");
+
+ ################################################# base 2^32 -> base 2^26
+ &mov ("eax",&DWP(0,"edi"));
+ &mov ("ecx",&DWP(3,"edi"));
+ &mov ("edx",&DWP(6,"edi"));
+ &mov ("esi",&DWP(9,"edi"));
+ &mov ("ebp",&DWP(13,"edi"));
+ &mov (&DWP(4*5,"edi"),1); # is_base2_26
+
+ &shr ("ecx",2);
+ &and ("eax",0x3ffffff);
+ &shr ("edx",4);
+ &and ("ecx",0x3ffffff);
+ &shr ("esi",6);
+ &and ("edx",0x3ffffff);
+
+ &movd ($D0,"eax");
+ &movd ($D1,"ecx");
+ &movd ($D2,"edx");
+ &movd ($D3,"esi");
+ &movd ($D4,"ebp");
+
+ &mov ("esi",&wparam(1)); # [reload] inp
+ &mov ("ecx",&wparam(2)); # [reload] len
+ &jmp (&label("base2_32"));
+
+&set_label("base2_26",16);
+ &movd ($D0,&DWP(4*0,"edi")); # load hash value
+ &movd ($D1,&DWP(4*1,"edi"));
+ &movd ($D2,&DWP(4*2,"edi"));
+ &movd ($D3,&DWP(4*3,"edi"));
+ &movd ($D4,&DWP(4*4,"edi"));
+ &movdqa ($MASK,&QWP(64,"ebx"));
+
+&set_label("base2_32");
+ &mov ("eax",&wparam(3)); # padbit
+ &mov ("ebp","esp");
+
+ &sub ("esp",16*(5+5+5+9+9));
+ &and ("esp",-16);
+
+ &lea ("edi",&DWP(16*3,"edi")); # size optimization
+ &shl ("eax",24); # padbit
+
+ &test ("ecx",31);
+ &jz (&label("even"));
+
+ ################################################################
+ # process single block, with SSE2, because it's still faster
+ # even though half of result is discarded
+
+ &movdqu ($T1,&QWP(0,"esi")); # input
+ &lea ("esi",&DWP(16,"esi"));
+
+ &movdqa ($T0,$T1); # -> base 2^26 ...
+ &pand ($T1,$MASK);
+ &paddd ($D0,$T1); # ... and accumuate
+
+ &movdqa ($T1,$T0);
+ &psrlq ($T0,26);
+ &psrldq ($T1,6);
+ &pand ($T0,$MASK);
+ &paddd ($D1,$T0);
+
+ &movdqa ($T0,$T1);
+ &psrlq ($T1,4);
+ &pand ($T1,$MASK);
+ &paddd ($D2,$T1);
+
+ &movdqa ($T1,$T0);
+ &psrlq ($T0,30);
+ &pand ($T0,$MASK);
+ &psrldq ($T1,7);
+ &paddd ($D3,$T0);
+
+ &movd ($T0,"eax"); # padbit
+ &paddd ($D4,$T1);
+ &movd ($T1,&DWP(16*0+12,"edi")); # r0
+ &paddd ($D4,$T0);
+
+ &movdqa (&QWP(16*0,"esp"),$D0);
+ &movdqa (&QWP(16*1,"esp"),$D1);
+ &movdqa (&QWP(16*2,"esp"),$D2);
+ &movdqa (&QWP(16*3,"esp"),$D3);
+ &movdqa (&QWP(16*4,"esp"),$D4);
+
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ &pmuludq ($D0,$T1); # h4*r0
+ &pmuludq ($D1,$T1); # h3*r0
+ &pmuludq ($D2,$T1); # h2*r0
+ &movd ($T0,&DWP(16*1+12,"edi")); # r1
+ &pmuludq ($D3,$T1); # h1*r0
+ &pmuludq ($D4,$T1); # h0*r0
+
+ &pmuladd (sub { my ($reg,$i)=@_;
+ &movd ($reg,&DWP(16*$i+12,"edi"));
+ });
+
+ &lazy_reduction ();
+
+ &sub ("ecx",16);
+ &jz (&label("done"));
+
+&set_label("even");
+ &lea ("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization
+ &lea ("eax",&DWP(-16*2,"esi"));
+ &sub ("ecx",64);
+
+ ################################################################
+ # expand and copy pre-calculated table to stack
+
+ &movdqu ($T0,&QWP(16*0,"edi")); # r^1:r^2:r^3:r^4
+ &pshufd ($T1,$T0,0b01000100); # duplicate r^3:r^4
+ &cmovb ("esi","eax");
+ &pshufd ($T0,$T0,0b11101110); # duplicate r^1:r^2
+ &movdqa (&QWP(16*0,"edx"),$T1);
+ &lea ("eax",&DWP(16*10,"esp"));
+ &movdqu ($T1,&QWP(16*1,"edi"));
+ &movdqa (&QWP(16*(0-9),"edx"),$T0);
+ &pshufd ($T0,$T1,0b01000100);
+ &pshufd ($T1,$T1,0b11101110);
+ &movdqa (&QWP(16*1,"edx"),$T0);
+ &movdqu ($T0,&QWP(16*2,"edi"));
+ &movdqa (&QWP(16*(1-9),"edx"),$T1);
+ &pshufd ($T1,$T0,0b01000100);
+ &pshufd ($T0,$T0,0b11101110);
+ &movdqa (&QWP(16*2,"edx"),$T1);
+ &movdqu ($T1,&QWP(16*3,"edi"));
+ &movdqa (&QWP(16*(2-9),"edx"),$T0);
+ &pshufd ($T0,$T1,0b01000100);
+ &pshufd ($T1,$T1,0b11101110);
+ &movdqa (&QWP(16*3,"edx"),$T0);
+ &movdqu ($T0,&QWP(16*4,"edi"));
+ &movdqa (&QWP(16*(3-9),"edx"),$T1);
+ &pshufd ($T1,$T0,0b01000100);
+ &pshufd ($T0,$T0,0b11101110);
+ &movdqa (&QWP(16*4,"edx"),$T1);
+ &movdqu ($T1,&QWP(16*5,"edi"));
+ &movdqa (&QWP(16*(4-9),"edx"),$T0);
+ &pshufd ($T0,$T1,0b01000100);
+ &pshufd ($T1,$T1,0b11101110);
+ &movdqa (&QWP(16*5,"edx"),$T0);
+ &movdqu ($T0,&QWP(16*6,"edi"));
+ &movdqa (&QWP(16*(5-9),"edx"),$T1);
+ &pshufd ($T1,$T0,0b01000100);
+ &pshufd ($T0,$T0,0b11101110);
+ &movdqa (&QWP(16*6,"edx"),$T1);
+ &movdqu ($T1,&QWP(16*7,"edi"));
+ &movdqa (&QWP(16*(6-9),"edx"),$T0);
+ &pshufd ($T0,$T1,0b01000100);
+ &pshufd ($T1,$T1,0b11101110);
+ &movdqa (&QWP(16*7,"edx"),$T0);
+ &movdqu ($T0,&QWP(16*8,"edi"));
+ &movdqa (&QWP(16*(7-9),"edx"),$T1);
+ &pshufd ($T1,$T0,0b01000100);
+ &pshufd ($T0,$T0,0b11101110);
+ &movdqa (&QWP(16*8,"edx"),$T1);
+ &movdqa (&QWP(16*(8-9),"edx"),$T0);
+
+sub load_input {
+my ($inpbase,$offbase)=@_;
+
+ &movdqu ($T0,&QWP($inpbase+0,"esi")); # load input
+ &movdqu ($T1,&QWP($inpbase+16,"esi"));
+ &lea ("esi",&DWP(16*2,"esi"));
+
+ &movdqa (&QWP($offbase+16*2,"esp"),$D2);
+ &movdqa (&QWP($offbase+16*3,"esp"),$D3);
+ &movdqa (&QWP($offbase+16*4,"esp"),$D4);
+
+ &movdqa ($D2,$T0); # splat input
+ &movdqa ($D3,$T1);
+ &psrldq ($D2,6);
+ &psrldq ($D3,6);
+ &movdqa ($D4,$T0);
+ &punpcklqdq ($D2,$D3); # 2:3
+ &punpckhqdq ($D4,$T1); # 4
+ &punpcklqdq ($T0,$T1); # 0:1
+
+ &movdqa ($D3,$D2);
+ &psrlq ($D2,4);
+ &psrlq ($D3,30);
+ &movdqa ($T1,$T0);
+ &psrlq ($D4,40); # 4
+ &psrlq ($T1,26);
+ &pand ($T0,$MASK); # 0
+ &pand ($T1,$MASK); # 1
+ &pand ($D2,$MASK); # 2
+ &pand ($D3,$MASK); # 3
+ &por ($D4,&QWP(0,"ebx")); # padbit, yes, always
+
+ &movdqa (&QWP($offbase+16*0,"esp"),$D0) if ($offbase);
+ &movdqa (&QWP($offbase+16*1,"esp"),$D1) if ($offbase);
+}
+ &load_input (16*2,16*5);
+
+ &jbe (&label("skip_loop"));
+ &jmp (&label("loop"));
+
+&set_label("loop",32);
+ ################################################################
+ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ # \___________________/
+ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ # \___________________/ \____________________/
+ ################################################################
+
+ &movdqa ($T2,&QWP(16*(0-9),"edx")); # r0^2
+ &movdqa (&QWP(16*1,"eax"),$T1);
+ &movdqa (&QWP(16*2,"eax"),$D2);
+ &movdqa (&QWP(16*3,"eax"),$D3);
+ &movdqa (&QWP(16*4,"eax"),$D4);
+
+ ################################################################
+ # d4 = h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1
+ # d3 = h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4
+ # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
+ # d1 = h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ # d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ &movdqa ($D1,$T0);
+ &pmuludq ($T0,$T2); # h0*r0
+ &movdqa ($D0,$T1);
+ &pmuludq ($T1,$T2); # h1*r0
+ &pmuludq ($D2,$T2); # h2*r0
+ &pmuludq ($D3,$T2); # h3*r0
+ &pmuludq ($D4,$T2); # h4*r0
+
+sub pmuladd_alt {
+my $addr = shift;
+
+ &pmuludq ($D0,&$addr(8)); # h1*s4
+ &movdqa ($T2,$D1);
+ &pmuludq ($D1,&$addr(1)); # h0*r1
+ &paddq ($D0,$T0);
+ &movdqa ($T0,$T2);
+ &pmuludq ($T2,&$addr(2)); # h0*r2
+ &paddq ($D1,$T1);
+ &movdqa ($T1,$T0);
+ &pmuludq ($T0,&$addr(3)); # h0*r3
+ &paddq ($D2,$T2);
+ &movdqa ($T2,&QWP(16*1,"eax")); # pull h1
+ &pmuludq ($T1,&$addr(4)); # h0*r4
+ &paddq ($D3,$T0);
+
+ &movdqa ($T0,$T2);
+ &pmuludq ($T2,&$addr(1)); # h1*r1
+ &paddq ($D4,$T1);
+ &movdqa ($T1,$T0);
+ &pmuludq ($T0,&$addr(2)); # h1*r2
+ &paddq ($D2,$T2);
+ &movdqa ($T2,&QWP(16*2,"eax")); # pull h2
+ &pmuludq ($T1,&$addr(3)); # h1*r3
+ &paddq ($D3,$T0);
+ &movdqa ($T0,$T2);
+ &pmuludq ($T2,&$addr(7)); # h2*s3
+ &paddq ($D4,$T1);
+ &movdqa ($T1,$T0);
+ &pmuludq ($T0,&$addr(8)); # h2*s4
+ &paddq ($D0,$T2);
+
+ &movdqa ($T2,$T1);
+ &pmuludq ($T1,&$addr(1)); # h2*r1
+ &paddq ($D1,$T0);
+ &movdqa ($T0,&QWP(16*3,"eax")); # pull h3
+ &pmuludq ($T2,&$addr(2)); # h2*r2
+ &paddq ($D3,$T1);
+ &movdqa ($T1,$T0);
+ &pmuludq ($T0,&$addr(6)); # h3*s2
+ &paddq ($D4,$T2);
+ &movdqa ($T2,$T1);
+ &pmuludq ($T1,&$addr(7)); # h3*s3
+ &paddq ($D0,$T0);
+ &movdqa ($T0,$T2);
+ &pmuludq ($T2,&$addr(8)); # h3*s4
+ &paddq ($D1,$T1);
+
+ &movdqa ($T1,&QWP(16*4,"eax")); # pull h4
+ &pmuludq ($T0,&$addr(1)); # h3*r1
+ &paddq ($D2,$T2);
+ &movdqa ($T2,$T1);
+ &pmuludq ($T1,&$addr(8)); # h4*s4
+ &paddq ($D4,$T0);
+ &movdqa ($T0,$T2);
+ &pmuludq ($T2,&$addr(5)); # h4*s1
+ &paddq ($D3,$T1);
+ &movdqa ($T1,$T0);
+ &pmuludq ($T0,&$addr(6)); # h4*s2
+ &paddq ($D0,$T2);
+ &movdqa ($MASK,&QWP(64,"ebx"));
+ &pmuludq ($T1,&$addr(7)); # h4*s3
+ &paddq ($D1,$T0);
+ &paddq ($D2,$T1);
+}
+ &pmuladd_alt (sub { my $i=shift; &QWP(16*($i-9),"edx"); });
+
+ &load_input (-16*2,0);
+ &lea ("eax",&DWP(-16*2,"esi"));
+ &sub ("ecx",64);
+
+ &paddd ($T0,&QWP(16*(5+0),"esp")); # add hash value
+ &paddd ($T1,&QWP(16*(5+1),"esp"));
+ &paddd ($D2,&QWP(16*(5+2),"esp"));
+ &paddd ($D3,&QWP(16*(5+3),"esp"));
+ &paddd ($D4,&QWP(16*(5+4),"esp"));
+
+ &cmovb ("esi","eax");
+ &lea ("eax",&DWP(16*10,"esp"));
+
+ &movdqa ($T2,&QWP(16*0,"edx")); # r0^4
+ &movdqa (&QWP(16*1,"esp"),$D1);
+ &movdqa (&QWP(16*1,"eax"),$T1);
+ &movdqa (&QWP(16*2,"eax"),$D2);
+ &movdqa (&QWP(16*3,"eax"),$D3);
+ &movdqa (&QWP(16*4,"eax"),$D4);
+
+ ################################################################
+ # d4 += h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1
+ # d3 += h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4
+ # d2 += h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
+ # d1 += h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ # d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ &movdqa ($D1,$T0);
+ &pmuludq ($T0,$T2); # h0*r0
+ &paddq ($T0,$D0);
+ &movdqa ($D0,$T1);
+ &pmuludq ($T1,$T2); # h1*r0
+ &pmuludq ($D2,$T2); # h2*r0
+ &pmuludq ($D3,$T2); # h3*r0
+ &pmuludq ($D4,$T2); # h4*r0
+
+ &paddq ($T1,&QWP(16*1,"esp"));
+ &paddq ($D2,&QWP(16*2,"esp"));
+ &paddq ($D3,&QWP(16*3,"esp"));
+ &paddq ($D4,&QWP(16*4,"esp"));
+
+ &pmuladd_alt (sub { my $i=shift; &QWP(16*$i,"edx"); });
+
+ &lazy_reduction ();
+
+ &load_input (16*2,16*5);
+
+ &ja (&label("loop"));
+
+&set_label("skip_loop");
+ ################################################################
+ # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ &pshufd ($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n
+ &add ("ecx",32);
+ &jnz (&label("long_tail"));
+
+ &paddd ($T0,$D0); # add hash value
+ &paddd ($T1,$D1);
+ &paddd ($D2,&QWP(16*7,"esp"));
+ &paddd ($D3,&QWP(16*8,"esp"));
+ &paddd ($D4,&QWP(16*9,"esp"));
+
+&set_label("long_tail");
+
+ &movdqa (&QWP(16*0,"eax"),$T0);
+ &movdqa (&QWP(16*1,"eax"),$T1);
+ &movdqa (&QWP(16*2,"eax"),$D2);
+ &movdqa (&QWP(16*3,"eax"),$D3);
+ &movdqa (&QWP(16*4,"eax"),$D4);
+
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ &pmuludq ($T0,$T2); # h0*r0
+ &pmuludq ($T1,$T2); # h1*r0
+ &pmuludq ($D2,$T2); # h2*r0
+ &movdqa ($D0,$T0);
+ &pshufd ($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n
+ &pmuludq ($D3,$T2); # h3*r0
+ &movdqa ($D1,$T1);
+ &pmuludq ($D4,$T2); # h4*r0
+
+ &pmuladd (sub { my ($reg,$i)=@_;
+ &pshufd ($reg,&QWP(16*($i-9),"edx"),0x10);
+ },"eax");
+
+ &jz (&label("short_tail"));
+
+ &load_input (-16*2,0);
+
+ &pshufd ($T2,&QWP(16*0,"edx"),0x10); # r0^n
+ &paddd ($T0,&QWP(16*5,"esp")); # add hash value
+ &paddd ($T1,&QWP(16*6,"esp"));
+ &paddd ($D2,&QWP(16*7,"esp"));
+ &paddd ($D3,&QWP(16*8,"esp"));
+ &paddd ($D4,&QWP(16*9,"esp"));
+
+ ################################################################
+ # multiply inp[0:1] by r^4:r^3 and accumulate
+
+ &movdqa (&QWP(16*0,"esp"),$T0);
+ &pmuludq ($T0,$T2); # h0*r0
+ &movdqa (&QWP(16*1,"esp"),$T1);
+ &pmuludq ($T1,$T2); # h1*r0
+ &paddq ($D0,$T0);
+ &movdqa ($T0,$D2);
+ &pmuludq ($D2,$T2); # h2*r0
+ &paddq ($D1,$T1);
+ &movdqa ($T1,$D3);
+ &pmuludq ($D3,$T2); # h3*r0
+ &paddq ($D2,&QWP(16*2,"esp"));
+ &movdqa (&QWP(16*2,"esp"),$T0);
+ &pshufd ($T0,&QWP(16*1,"edx"),0x10); # r1^n
+ &paddq ($D3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*3,"esp"),$T1);
+ &movdqa ($T1,$D4);
+ &pmuludq ($D4,$T2); # h4*r0
+ &paddq ($D4,&QWP(16*4,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$T1);
+
+ &pmuladd (sub { my ($reg,$i)=@_;
+ &pshufd ($reg,&QWP(16*$i,"edx"),0x10);
+ });
+
+&set_label("short_tail");
+
+ ################################################################
+ # horizontal addition
+
+ &pshufd ($T1,$D4,0b01001110);
+ &pshufd ($T0,$D3,0b01001110);
+ &paddq ($D4,$T1);
+ &paddq ($D3,$T0);
+ &pshufd ($T1,$D0,0b01001110);
+ &pshufd ($T0,$D1,0b01001110);
+ &paddq ($D0,$T1);
+ &paddq ($D1,$T0);
+ &pshufd ($T1,$D2,0b01001110);
+ #&paddq ($D2,$T1);
+
+ &lazy_reduction (sub { &paddq ($D2,$T1) });
+
+&set_label("done");
+ &movd (&DWP(-16*3+4*0,"edi"),$D0); # store hash value
+ &movd (&DWP(-16*3+4*1,"edi"),$D1);
+ &movd (&DWP(-16*3+4*2,"edi"),$D2);
+ &movd (&DWP(-16*3+4*3,"edi"),$D3);
+ &movd (&DWP(-16*3+4*4,"edi"),$D4);
+ &mov ("esp","ebp");
+&set_label("nodata");
+&function_end("_poly1305_blocks_sse2");
+
+&align (32);
+&function_begin("_poly1305_emit_sse2");
+ &mov ("ebp",&wparam(0)); # context
+
+ &cmp (&DWP(4*5,"ebp"),0); # is_base2_26?
+ &je (&label("enter_emit"));
+
+ &mov ("eax",&DWP(4*0,"ebp")); # load hash value
+ &mov ("edi",&DWP(4*1,"ebp"));
+ &mov ("ecx",&DWP(4*2,"ebp"));
+ &mov ("edx",&DWP(4*3,"ebp"));
+ &mov ("esi",&DWP(4*4,"ebp"));
+
+ &mov ("ebx","edi"); # base 2^26 -> base 2^32
+ &shl ("edi",26);
+ &shr ("ebx",6);
+ &add ("eax","edi");
+ &mov ("edi","ecx");
+ &adc ("ebx",0);
+
+ &shl ("edi",20);
+ &shr ("ecx",12);
+ &add ("ebx","edi");
+ &mov ("edi","edx");
+ &adc ("ecx",0);
+
+ &shl ("edi",14);
+ &shr ("edx",18);
+ &add ("ecx","edi");
+ &mov ("edi","esi");
+ &adc ("edx",0);
+
+ &shl ("edi",8);
+ &shr ("esi",24);
+ &add ("edx","edi");
+ &adc ("esi",0); # can be partially reduced
+
+ &mov ("edi","esi"); # final reduction
+ &and ("esi",3);
+ &shr ("edi",2);
+ &lea ("ebp",&DWP(0,"edi","edi",4)); # *5
+ &mov ("edi",&wparam(1)); # output
+ &add ("eax","ebp");
+ &mov ("ebp",&wparam(2)); # key
+ &adc ("ebx",0);
+ &adc ("ecx",0);
+ &adc ("edx",0);
+ &adc ("esi",0);
+
+ &movd ($D0,"eax"); # offload original hash value
+ &add ("eax",5); # compare to modulus
+ &movd ($D1,"ebx");
+ &adc ("ebx",0);
+ &movd ($D2,"ecx");
+ &adc ("ecx",0);
+ &movd ($D3,"edx");
+ &adc ("edx",0);
+ &adc ("esi",0);
+ &shr ("esi",2); # did it carry/borrow?
+
+ &neg ("esi"); # do we choose (hash-modulus) ...
+ &and ("eax","esi");
+ &and ("ebx","esi");
+ &and ("ecx","esi");
+ &and ("edx","esi");
+ &mov (&DWP(4*0,"edi"),"eax");
+ &movd ("eax",$D0);
+ &mov (&DWP(4*1,"edi"),"ebx");
+ &movd ("ebx",$D1);
+ &mov (&DWP(4*2,"edi"),"ecx");
+ &movd ("ecx",$D2);
+ &mov (&DWP(4*3,"edi"),"edx");
+ &movd ("edx",$D3);
+
+ &not ("esi"); # ... or original hash value?
+ &and ("eax","esi");
+ &and ("ebx","esi");
+ &or ("eax",&DWP(4*0,"edi"));
+ &and ("ecx","esi");
+ &or ("ebx",&DWP(4*1,"edi"));
+ &and ("edx","esi");
+ &or ("ecx",&DWP(4*2,"edi"));
+ &or ("edx",&DWP(4*3,"edi"));
+
+ &add ("eax",&DWP(4*0,"ebp")); # accumulate key
+ &adc ("ebx",&DWP(4*1,"ebp"));
+ &mov (&DWP(4*0,"edi"),"eax");
+ &adc ("ecx",&DWP(4*2,"ebp"));
+ &mov (&DWP(4*1,"edi"),"ebx");
+ &adc ("edx",&DWP(4*3,"ebp"));
+ &mov (&DWP(4*2,"edi"),"ecx");
+ &mov (&DWP(4*3,"edi"),"edx");
+&function_end("_poly1305_emit_sse2");
+
+if ($avx>1) {
+########################################################################
+# Note that poly1305_init_avx2 operates on %xmm, I could have used
+# poly1305_init_sse2...
+
+&align (32);
+&function_begin_B("_poly1305_init_avx2");
+ &vmovdqu ($D4,&QWP(4*6,"edi")); # key base 2^32
+ &lea ("edi",&DWP(16*3,"edi")); # size optimization
+ &mov ("ebp","esp");
+ &sub ("esp",16*(9+5));
+ &and ("esp",-16);
+
+ #&vpand ($D4,$D4,&QWP(96,"ebx")); # magic mask
+ &vmovdqa ($MASK,&QWP(64,"ebx"));
+
+ &vpand ($D0,$D4,$MASK); # -> base 2^26
+ &vpsrlq ($D1,$D4,26);
+ &vpsrldq ($D3,$D4,6);
+ &vpand ($D1,$D1,$MASK);
+ &vpsrlq ($D2,$D3,4)
+ &vpsrlq ($D3,$D3,30);
+ &vpand ($D2,$D2,$MASK);
+ &vpand ($D3,$D3,$MASK);
+ &vpsrldq ($D4,$D4,13);
+
+ &lea ("edx",&DWP(16*9,"esp")); # size optimization
+ &mov ("ecx",2);
+&set_label("square");
+ &vmovdqa (&QWP(16*0,"esp"),$D0);
+ &vmovdqa (&QWP(16*1,"esp"),$D1);
+ &vmovdqa (&QWP(16*2,"esp"),$D2);
+ &vmovdqa (&QWP(16*3,"esp"),$D3);
+ &vmovdqa (&QWP(16*4,"esp"),$D4);
+
+ &vpslld ($T1,$D1,2);
+ &vpslld ($T0,$D2,2);
+ &vpaddd ($T1,$T1,$D1); # *5
+ &vpaddd ($T0,$T0,$D2); # *5
+ &vmovdqa (&QWP(16*5,"esp"),$T1);
+ &vmovdqa (&QWP(16*6,"esp"),$T0);
+ &vpslld ($T1,$D3,2);
+ &vpslld ($T0,$D4,2);
+ &vpaddd ($T1,$T1,$D3); # *5
+ &vpaddd ($T0,$T0,$D4); # *5
+ &vmovdqa (&QWP(16*7,"esp"),$T1);
+ &vmovdqa (&QWP(16*8,"esp"),$T0);
+
+ &vpshufd ($T0,$D0,0b01000100);
+ &vmovdqa ($T1,$D1);
+ &vpshufd ($D1,$D1,0b01000100);
+ &vpshufd ($D2,$D2,0b01000100);
+ &vpshufd ($D3,$D3,0b01000100);
+ &vpshufd ($D4,$D4,0b01000100);
+ &vmovdqa (&QWP(16*0,"edx"),$T0);
+ &vmovdqa (&QWP(16*1,"edx"),$D1);
+ &vmovdqa (&QWP(16*2,"edx"),$D2);
+ &vmovdqa (&QWP(16*3,"edx"),$D3);
+ &vmovdqa (&QWP(16*4,"edx"),$D4);
+
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ &vpmuludq ($D4,$D4,$D0); # h4*r0
+ &vpmuludq ($D3,$D3,$D0); # h3*r0
+ &vpmuludq ($D2,$D2,$D0); # h2*r0
+ &vpmuludq ($D1,$D1,$D0); # h1*r0
+ &vpmuludq ($D0,$T0,$D0); # h0*r0
+
+ &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # r1*h3
+ &vpaddq ($D4,$D4,$T0);
+ &vpmuludq ($T2,$T1,&QWP(16*2,"edx")); # r1*h2
+ &vpaddq ($D3,$D3,$T2);
+ &vpmuludq ($T0,$T1,&QWP(16*1,"edx")); # r1*h1
+ &vpaddq ($D2,$D2,$T0);
+ &vmovdqa ($T2,&QWP(16*5,"esp")); # s1
+ &vpmuludq ($T1,$T1,&QWP(16*0,"edx")); # r1*h0
+ &vpaddq ($D1,$D1,$T1);
+ &vmovdqa ($T0,&QWP(16*2,"esp")); # r2
+ &vpmuludq ($T2,$T2,&QWP(16*4,"edx")); # s1*h4
+ &vpaddq ($D0,$D0,$T2);
+
+ &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # r2*h2
+ &vpaddq ($D4,$D4,$T1);
+ &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r2*h1
+ &vpaddq ($D3,$D3,$T2);
+ &vmovdqa ($T1,&QWP(16*6,"esp")); # s2
+ &vpmuludq ($T0,$T0,&QWP(16*0,"edx")); # r2*h0
+ &vpaddq ($D2,$D2,$T0);
+ &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s2*h4
+ &vpaddq ($D1,$D1,$T2);
+ &vmovdqa ($T0,&QWP(16*3,"esp")); # r3
+ &vpmuludq ($T1,$T1,&QWP(16*3,"edx")); # s2*h3
+ &vpaddq ($D0,$D0,$T1);
+
+ &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r3*h1
+ &vpaddq ($D4,$D4,$T2);
+ &vmovdqa ($T1,&QWP(16*7,"esp")); # s3
+ &vpmuludq ($T0,$T0,&QWP(16*0,"edx")); # r3*h0
+ &vpaddq ($D3,$D3,$T0);
+ &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s3*h4
+ &vpaddq ($D2,$D2,$T2);
+ &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # s3*h3
+ &vpaddq ($D1,$D1,$T0);
+ &vmovdqa ($T2,&QWP(16*4,"esp")); # r4
+ &vpmuludq ($T1,$T1,&QWP(16*2,"edx")); # s3*h2
+ &vpaddq ($D0,$D0,$T1);
+
+ &vmovdqa ($T0,&QWP(16*8,"esp")); # s4
+ &vpmuludq ($T2,$T2,&QWP(16*0,"edx")); # r4*h0
+ &vpaddq ($D4,$D4,$T2);
+ &vpmuludq ($T1,$T0,&QWP(16*4,"edx")); # s4*h4
+ &vpaddq ($D3,$D3,$T1);
+ &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # s4*h1
+ &vpaddq ($D0,$D0,$T2);
+ &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # s4*h2
+ &vpaddq ($D1,$D1,$T1);
+ &vmovdqa ($MASK,&QWP(64,"ebx"));
+ &vpmuludq ($T0,$T0,&QWP(16*3,"edx")); # s4*h3
+ &vpaddq ($D2,$D2,$T0);
+
+ ################################################################
+ # lazy reduction
+ &vpsrlq ($T0,$D3,26);
+ &vpand ($D3,$D3,$MASK);
+ &vpsrlq ($T1,$D0,26);
+ &vpand ($D0,$D0,$MASK);
+ &vpaddq ($D4,$D4,$T0); # h3 -> h4
+ &vpaddq ($D1,$D1,$T1); # h0 -> h1
+ &vpsrlq ($T0,$D4,26);
+ &vpand ($D4,$D4,$MASK);
+ &vpsrlq ($T1,$D1,26);
+ &vpand ($D1,$D1,$MASK);
+ &vpaddq ($D2,$D2,$T1); # h1 -> h2
+ &vpaddd ($D0,$D0,$T0);
+ &vpsllq ($T0,$T0,2);
+ &vpsrlq ($T1,$D2,26);
+ &vpand ($D2,$D2,$MASK);
+ &vpaddd ($D0,$D0,$T0); # h4 -> h0
+ &vpaddd ($D3,$D3,$T1); # h2 -> h3
+ &vpsrlq ($T1,$D3,26);
+ &vpsrlq ($T0,$D0,26);
+ &vpand ($D0,$D0,$MASK);
+ &vpand ($D3,$D3,$MASK);
+ &vpaddd ($D1,$D1,$T0); # h0 -> h1
+ &vpaddd ($D4,$D4,$T1); # h3 -> h4
+
+ &dec ("ecx");
+ &jz (&label("square_break"));
+
+ &vpunpcklqdq ($D0,$D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2
+ &vpunpcklqdq ($D1,$D1,&QWP(16*1,"esp"));
+ &vpunpcklqdq ($D2,$D2,&QWP(16*2,"esp"));
+ &vpunpcklqdq ($D3,$D3,&QWP(16*3,"esp"));
+ &vpunpcklqdq ($D4,$D4,&QWP(16*4,"esp"));
+ &jmp (&label("square"));
+
+&set_label("square_break");
+ &vpsllq ($D0,$D0,32); # -> r^3:0:r^4:0
+ &vpsllq ($D1,$D1,32);
+ &vpsllq ($D2,$D2,32);
+ &vpsllq ($D3,$D3,32);
+ &vpsllq ($D4,$D4,32);
+ &vpor ($D0,$D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2
+ &vpor ($D1,$D1,&QWP(16*1,"esp"));
+ &vpor ($D2,$D2,&QWP(16*2,"esp"));
+ &vpor ($D3,$D3,&QWP(16*3,"esp"));
+ &vpor ($D4,$D4,&QWP(16*4,"esp"));
+
+ &vpshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4
+ &vpshufd ($D1,$D1,0b10001101);
+ &vpshufd ($D2,$D2,0b10001101);
+ &vpshufd ($D3,$D3,0b10001101);
+ &vpshufd ($D4,$D4,0b10001101);
+
+ &vmovdqu (&QWP(16*0,"edi"),$D0); # save the table
+ &vmovdqu (&QWP(16*1,"edi"),$D1);
+ &vmovdqu (&QWP(16*2,"edi"),$D2);
+ &vmovdqu (&QWP(16*3,"edi"),$D3);
+ &vmovdqu (&QWP(16*4,"edi"),$D4);
+
+ &vpslld ($T1,$D1,2);
+ &vpslld ($T0,$D2,2);
+ &vpaddd ($T1,$T1,$D1); # *5
+ &vpaddd ($T0,$T0,$D2); # *5
+ &vmovdqu (&QWP(16*5,"edi"),$T1);
+ &vmovdqu (&QWP(16*6,"edi"),$T0);
+ &vpslld ($T1,$D3,2);
+ &vpslld ($T0,$D4,2);
+ &vpaddd ($T1,$T1,$D3); # *5
+ &vpaddd ($T0,$T0,$D4); # *5
+ &vmovdqu (&QWP(16*7,"edi"),$T1);
+ &vmovdqu (&QWP(16*8,"edi"),$T0);
+
+ &mov ("esp","ebp");
+ &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization
+ &ret ();
+&function_end_B("_poly1305_init_avx2");
+
+########################################################################
+# now it's time to switch to %ymm
+
+my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7));
+my $MASK=$T2;
+
+sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; }
+
+&align (32);
+&function_begin("_poly1305_blocks_avx2");
+ &mov ("edi",&wparam(0)); # ctx
+ &mov ("esi",&wparam(1)); # inp
+ &mov ("ecx",&wparam(2)); # len
+
+ &mov ("eax",&DWP(4*5,"edi")); # is_base2_26
+ &and ("ecx",-16);
+ &jz (&label("nodata"));
+ &cmp ("ecx",64);
+ &jae (&label("enter_avx2"));
+ &test ("eax","eax"); # is_base2_26?
+ &jz (&label("enter_blocks"));
+
+&set_label("enter_avx2");
+ &vzeroupper ();
+
+ &call (&label("pic_point"));
+&set_label("pic_point");
+ &blindpop("ebx");
+ &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
+
+ &test ("eax","eax"); # is_base2_26?
+ &jnz (&label("base2_26"));
+
+ &call ("_poly1305_init_avx2");
+
+ ################################################# base 2^32 -> base 2^26
+ &mov ("eax",&DWP(0,"edi"));
+ &mov ("ecx",&DWP(3,"edi"));
+ &mov ("edx",&DWP(6,"edi"));
+ &mov ("esi",&DWP(9,"edi"));
+ &mov ("ebp",&DWP(13,"edi"));
+
+ &shr ("ecx",2);
+ &and ("eax",0x3ffffff);
+ &shr ("edx",4);
+ &and ("ecx",0x3ffffff);
+ &shr ("esi",6);
+ &and ("edx",0x3ffffff);
+
+ &mov (&DWP(4*0,"edi"),"eax");
+ &mov (&DWP(4*1,"edi"),"ecx");
+ &mov (&DWP(4*2,"edi"),"edx");
+ &mov (&DWP(4*3,"edi"),"esi");
+ &mov (&DWP(4*4,"edi"),"ebp");
+ &mov (&DWP(4*5,"edi"),1); # is_base2_26
+
+ &mov ("esi",&wparam(1)); # [reload] inp
+ &mov ("ecx",&wparam(2)); # [reload] len
+
+&set_label("base2_26");
+ &mov ("eax",&wparam(3)); # padbit
+ &mov ("ebp","esp");
+
+ &sub ("esp",32*(5+9));
+ &and ("esp",-512); # ensure that frame
+ # doesn't cross page
+ # boundary, which is
+ # essential for
+ # misaligned 32-byte
+ # loads
+
+ ################################################################
+ # expand and copy pre-calculated table to stack
+
+ &vmovdqu (&X($D0),&QWP(16*(3+0),"edi"));
+ &lea ("edx",&DWP(32*5+128,"esp")); # +128 size optimization
+ &vmovdqu (&X($D1),&QWP(16*(3+1),"edi"));
+ &vmovdqu (&X($D2),&QWP(16*(3+2),"edi"));
+ &vmovdqu (&X($D3),&QWP(16*(3+3),"edi"));
+ &vmovdqu (&X($D4),&QWP(16*(3+4),"edi"));
+ &lea ("edi",&DWP(16*3,"edi")); # size optimization
+ &vpermq ($D0,$D0,0b01000000); # 00001234 -> 12343434
+ &vpermq ($D1,$D1,0b01000000);
+ &vpermq ($D2,$D2,0b01000000);
+ &vpermq ($D3,$D3,0b01000000);
+ &vpermq ($D4,$D4,0b01000000);
+ &vpshufd ($D0,$D0,0b11001000); # 12343434 -> 14243444
+ &vpshufd ($D1,$D1,0b11001000);
+ &vpshufd ($D2,$D2,0b11001000);
+ &vpshufd ($D3,$D3,0b11001000);
+ &vpshufd ($D4,$D4,0b11001000);
+ &vmovdqa (&QWP(32*0-128,"edx"),$D0);
+ &vmovdqu (&X($D0),&QWP(16*5,"edi"));
+ &vmovdqa (&QWP(32*1-128,"edx"),$D1);
+ &vmovdqu (&X($D1),&QWP(16*6,"edi"));
+ &vmovdqa (&QWP(32*2-128,"edx"),$D2);
+ &vmovdqu (&X($D2),&QWP(16*7,"edi"));
+ &vmovdqa (&QWP(32*3-128,"edx"),$D3);
+ &vmovdqu (&X($D3),&QWP(16*8,"edi"));
+ &vmovdqa (&QWP(32*4-128,"edx"),$D4);
+ &vpermq ($D0,$D0,0b01000000);
+ &vpermq ($D1,$D1,0b01000000);
+ &vpermq ($D2,$D2,0b01000000);
+ &vpermq ($D3,$D3,0b01000000);
+ &vpshufd ($D0,$D0,0b11001000);
+ &vpshufd ($D1,$D1,0b11001000);
+ &vpshufd ($D2,$D2,0b11001000);
+ &vpshufd ($D3,$D3,0b11001000);
+ &vmovdqa (&QWP(32*5-128,"edx"),$D0);
+ &vmovd (&X($D0),&DWP(-16*3+4*0,"edi"));# load hash value
+ &vmovdqa (&QWP(32*6-128,"edx"),$D1);
+ &vmovd (&X($D1),&DWP(-16*3+4*1,"edi"));
+ &vmovdqa (&QWP(32*7-128,"edx"),$D2);
+ &vmovd (&X($D2),&DWP(-16*3+4*2,"edi"));
+ &vmovdqa (&QWP(32*8-128,"edx"),$D3);
+ &vmovd (&X($D3),&DWP(-16*3+4*3,"edi"));
+ &vmovd (&X($D4),&DWP(-16*3+4*4,"edi"));
+ &vmovdqa ($MASK,&QWP(64,"ebx"));
+ &neg ("eax"); # padbit
+
+ &test ("ecx",63);
+ &jz (&label("even"));
+
+ &mov ("edx","ecx");
+ &and ("ecx",-64);
+ &and ("edx",63);
+
+ &vmovdqu (&X($T0),&QWP(16*0,"esi"));
+ &cmp ("edx",32);
+ &jb (&label("one"));
+
+ &vmovdqu (&X($T1),&QWP(16*1,"esi"));
+ &je (&label("two"));
+
+ &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1);
+ &lea ("esi",&DWP(16*3,"esi"));
+ &lea ("ebx",&DWP(8,"ebx")); # three padbits
+ &lea ("edx",&DWP(32*5+128+8,"esp")); # --:r^1:r^2:r^3 (*)
+ &jmp (&label("tail"));
+
+&set_label("two");
+ &lea ("esi",&DWP(16*2,"esi"));
+ &lea ("ebx",&DWP(16,"ebx")); # two padbits
+ &lea ("edx",&DWP(32*5+128+16,"esp"));# --:--:r^1:r^2 (*)
+ &jmp (&label("tail"));
+
+&set_label("one");
+ &lea ("esi",&DWP(16*1,"esi"));
+ &vpxor ($T1,$T1,$T1);
+ &lea ("ebx",&DWP(32,"ebx","eax",8)); # one or no padbits
+ &lea ("edx",&DWP(32*5+128+24,"esp"));# --:--:--:r^1 (*)
+ &jmp (&label("tail"));
+
+# (*) spots marked with '--' are data from next table entry, but they
+# are multiplied by 0 and therefore rendered insignificant
+
+&set_label("even",32);
+ &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input
+ &vmovdqu (&X($T1),&QWP(16*1,"esi"));
+ &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1);
+ &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1);
+ &lea ("esi",&DWP(16*4,"esi"));
+ &sub ("ecx",64);
+ &jz (&label("tail"));
+
+&set_label("loop");
+ ################################################################
+ # ((inp[0]*r^4+r[4])*r^4+r[8])*r^4
+ # ((inp[1]*r^4+r[5])*r^4+r[9])*r^3
+ # ((inp[2]*r^4+r[6])*r^4+r[10])*r^2
+ # ((inp[3]*r^4+r[7])*r^4+r[11])*r^1
+ # \________/ \_______/
+ ################################################################
+
+sub vsplat_input {
+ &vmovdqa (&QWP(32*2,"esp"),$D2);
+ &vpsrldq ($D2,$T0,6); # splat input
+ &vmovdqa (&QWP(32*0,"esp"),$D0);
+ &vpsrldq ($D0,$T1,6);
+ &vmovdqa (&QWP(32*1,"esp"),$D1);
+ &vpunpckhqdq ($D1,$T0,$T1); # 4
+ &vpunpcklqdq ($T0,$T0,$T1); # 0:1
+ &vpunpcklqdq ($D2,$D2,$D0); # 2:3
+
+ &vpsrlq ($D0,$D2,30);
+ &vpsrlq ($D2,$D2,4);
+ &vpsrlq ($T1,$T0,26);
+ &vpsrlq ($D1,$D1,40); # 4
+ &vpand ($D2,$D2,$MASK); # 2
+ &vpand ($T0,$T0,$MASK); # 0
+ &vpand ($T1,$T1,$MASK); # 1
+ &vpand ($D0,$D0,$MASK); # 3 (*)
+ &vpor ($D1,$D1,&QWP(0,"ebx")); # padbit, yes, always
+
+ # (*) note that output is counterintuitive, inp[3:4] is
+ # returned in $D1-2, while $D3-4 are preserved;
+}
+ &vsplat_input ();
+
+sub vpmuladd {
+my $addr = shift;
+
+ &vpaddq ($D2,$D2,&QWP(32*2,"esp")); # add hash value
+ &vpaddq ($T0,$T0,&QWP(32*0,"esp"));
+ &vpaddq ($T1,$T1,&QWP(32*1,"esp"));
+ &vpaddq ($D0,$D0,$D3);
+ &vpaddq ($D1,$D1,$D4);
+
+ ################################################################
+ # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
+ # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
+ # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
+ # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
+ # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
+
+ &vpmuludq ($D3,$D2,&$addr(1)); # d3 = h2*r1
+ &vmovdqa (QWP(32*1,"esp"),$T1);
+ &vpmuludq ($D4,$D2,&$addr(2)); # d4 = h2*r2
+ &vmovdqa (QWP(32*3,"esp"),$D0);
+ &vpmuludq ($D0,$D2,&$addr(7)); # d0 = h2*s3
+ &vmovdqa (QWP(32*4,"esp"),$D1);
+ &vpmuludq ($D1,$D2,&$addr(8)); # d1 = h2*s4
+ &vpmuludq ($D2,$D2,&$addr(0)); # d2 = h2*r0
+
+ &vpmuludq ($T2,$T0,&$addr(3)); # h0*r3
+ &vpaddq ($D3,$D3,$T2); # d3 += h0*r3
+ &vpmuludq ($T1,$T0,&$addr(4)); # h0*r4
+ &vpaddq ($D4,$D4,$T1); # d4 + h0*r4
+ &vpmuludq ($T2,$T0,&$addr(0)); # h0*r0
+ &vpaddq ($D0,$D0,$T2); # d0 + h0*r0
+ &vmovdqa ($T2,&QWP(32*1,"esp")); # h1
+ &vpmuludq ($T1,$T0,&$addr(1)); # h0*r1
+ &vpaddq ($D1,$D1,$T1); # d1 += h0*r1
+ &vpmuludq ($T0,$T0,&$addr(2)); # h0*r2
+ &vpaddq ($D2,$D2,$T0); # d2 += h0*r2
+
+ &vpmuludq ($T1,$T2,&$addr(2)); # h1*r2
+ &vpaddq ($D3,$D3,$T1); # d3 += h1*r2
+ &vpmuludq ($T0,$T2,&$addr(3)); # h1*r3
+ &vpaddq ($D4,$D4,$T0); # d4 += h1*r3
+ &vpmuludq ($T1,$T2,&$addr(8)); # h1*s4
+ &vpaddq ($D0,$D0,$T1); # d0 += h1*s4
+ &vmovdqa ($T1,&QWP(32*3,"esp")); # h3
+ &vpmuludq ($T0,$T2,&$addr(0)); # h1*r0
+ &vpaddq ($D1,$D1,$T0); # d1 += h1*r0
+ &vpmuludq ($T2,$T2,&$addr(1)); # h1*r1
+ &vpaddq ($D2,$D2,$T2); # d2 += h1*r1
+
+ &vpmuludq ($T0,$T1,&$addr(0)); # h3*r0
+ &vpaddq ($D3,$D3,$T0); # d3 += h3*r0
+ &vpmuludq ($T2,$T1,&$addr(1)); # h3*r1
+ &vpaddq ($D4,$D4,$T2); # d4 += h3*r1
+ &vpmuludq ($T0,$T1,&$addr(6)); # h3*s2
+ &vpaddq ($D0,$D0,$T0); # d0 += h3*s2
+ &vmovdqa ($T0,&QWP(32*4,"esp")); # h4
+ &vpmuludq ($T2,$T1,&$addr(7)); # h3*s3
+ &vpaddq ($D1,$D1,$T2); # d1+= h3*s3
+ &vpmuludq ($T1,$T1,&$addr(8)); # h3*s4
+ &vpaddq ($D2,$D2,$T1); # d2 += h3*s4
+
+ &vpmuludq ($T2,$T0,&$addr(8)); # h4*s4
+ &vpaddq ($D3,$D3,$T2); # d3 += h4*s4
+ &vpmuludq ($T1,$T0,&$addr(5)); # h4*s1
+ &vpaddq ($D0,$D0,$T1); # d0 += h4*s1
+ &vpmuludq ($T2,$T0,&$addr(0)); # h4*r0
+ &vpaddq ($D4,$D4,$T2); # d4 += h4*r0
+ &vmovdqa ($MASK,&QWP(64,"ebx"));
+ &vpmuludq ($T1,$T0,&$addr(6)); # h4*s2
+ &vpaddq ($D1,$D1,$T1); # d1 += h4*s2
+ &vpmuludq ($T0,$T0,&$addr(7)); # h4*s3
+ &vpaddq ($D2,$D2,$T0); # d2 += h4*s3
+}
+ &vpmuladd (sub { my $i=shift; &QWP(32*$i-128,"edx"); });
+
+sub vlazy_reduction {
+ ################################################################
+ # lazy reduction
+
+ &vpsrlq ($T0,$D3,26);
+ &vpand ($D3,$D3,$MASK);
+ &vpsrlq ($T1,$D0,26);
+ &vpand ($D0,$D0,$MASK);
+ &vpaddq ($D4,$D4,$T0); # h3 -> h4
+ &vpaddq ($D1,$D1,$T1); # h0 -> h1
+ &vpsrlq ($T0,$D4,26);
+ &vpand ($D4,$D4,$MASK);
+ &vpsrlq ($T1,$D1,26);
+ &vpand ($D1,$D1,$MASK);
+ &vpaddq ($D2,$D2,$T1); # h1 -> h2
+ &vpaddq ($D0,$D0,$T0);
+ &vpsllq ($T0,$T0,2);
+ &vpsrlq ($T1,$D2,26);
+ &vpand ($D2,$D2,$MASK);
+ &vpaddq ($D0,$D0,$T0); # h4 -> h0
+ &vpaddq ($D3,$D3,$T1); # h2 -> h3
+ &vpsrlq ($T1,$D3,26);
+ &vpsrlq ($T0,$D0,26);
+ &vpand ($D0,$D0,$MASK);
+ &vpand ($D3,$D3,$MASK);
+ &vpaddq ($D1,$D1,$T0); # h0 -> h1
+ &vpaddq ($D4,$D4,$T1); # h3 -> h4
+}
+ &vlazy_reduction();
+
+ &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input
+ &vmovdqu (&X($T1),&QWP(16*1,"esi"));
+ &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1);
+ &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1);
+ &lea ("esi",&DWP(16*4,"esi"));
+ &sub ("ecx",64);
+ &jnz (&label("loop"));
+
+&set_label("tail");
+ &vsplat_input ();
+ &and ("ebx",-64); # restore pointer
+
+ &vpmuladd (sub { my $i=shift; &QWP(4+32*$i-128,"edx"); });
+
+ ################################################################
+ # horizontal addition
+
+ &vpsrldq ($T0,$D4,8);
+ &vpsrldq ($T1,$D3,8);
+ &vpaddq ($D4,$D4,$T0);
+ &vpsrldq ($T0,$D0,8);
+ &vpaddq ($D3,$D3,$T1);
+ &vpsrldq ($T1,$D1,8);
+ &vpaddq ($D0,$D0,$T0);
+ &vpsrldq ($T0,$D2,8);
+ &vpaddq ($D1,$D1,$T1);
+ &vpermq ($T1,$D4,2); # keep folding
+ &vpaddq ($D2,$D2,$T0);
+ &vpermq ($T0,$D3,2);
+ &vpaddq ($D4,$D4,$T1);
+ &vpermq ($T1,$D0,2);
+ &vpaddq ($D3,$D3,$T0);
+ &vpermq ($T0,$D1,2);
+ &vpaddq ($D0,$D0,$T1);
+ &vpermq ($T1,$D2,2);
+ &vpaddq ($D1,$D1,$T0);
+ &vpaddq ($D2,$D2,$T1);
+
+ &vlazy_reduction();
+
+ &cmp ("ecx",0);
+ &je (&label("done"));
+
+ ################################################################
+ # clear all but single word
+
+ &vpshufd (&X($D0),&X($D0),0b11111100);
+ &lea ("edx",&DWP(32*5+128,"esp")); # restore pointer
+ &vpshufd (&X($D1),&X($D1),0b11111100);
+ &vpshufd (&X($D2),&X($D2),0b11111100);
+ &vpshufd (&X($D3),&X($D3),0b11111100);
+ &vpshufd (&X($D4),&X($D4),0b11111100);
+ &jmp (&label("even"));
+
+&set_label("done",16);
+ &vmovd (&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
+ &vmovd (&DWP(-16*3+4*1,"edi"),&X($D1));
+ &vmovd (&DWP(-16*3+4*2,"edi"),&X($D2));
+ &vmovd (&DWP(-16*3+4*3,"edi"),&X($D3));
+ &vmovd (&DWP(-16*3+4*4,"edi"),&X($D4));
+ &vzeroupper ();
+ &mov ("esp","ebp");
+&set_label("nodata");
+&function_end("_poly1305_blocks_avx2");
+}
+&set_label("const_sse2",64);
+ &data_word(1<<24,0, 1<<24,0, 1<<24,0, 1<<24,0);
+ &data_word(0,0, 0,0, 0,0, 0,0);
+ &data_word(0x03ffffff,0,0x03ffffff,0, 0x03ffffff,0, 0x03ffffff,0);
+ &data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);
+}
+&asciz ("Poly1305 for x86, CRYPTOGAMS by <appro\@openssl.org>");
+&align (4);
+
+&asm_finish();
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/poly1305/asm/poly1305-x86_64.pl b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-x86_64.pl
new file mode 100755
index 0000000..4c22ded
--- /dev/null
+++ b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -0,0 +1,2268 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for x86_64.
+#
+# March 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone,
+# measured with rdtsc at fixed clock frequency.
+#
+# IALU/gcc-4.8(*) AVX(**) AVX2
+# P4 4.46/+120% -
+# Core 2 2.41/+90% -
+# Westmere 1.88/+120% -
+# Sandy Bridge 1.39/+140% 1.10
+# Haswell 1.14/+175% 1.11 0.65
+# Skylake 1.13/+120% 0.96 0.51
+# Silvermont 2.83/+95% -
+# Goldmont 1.70/+180% -
+# VIA Nano 1.82/+150% -
+# Sledgehammer 1.38/+160% -
+# Bulldozer 2.30/+130% 0.97
+#
+# (*) improvement coefficients relative to clang are more modest and
+# are ~50% on most processors, in both cases we are comparing to
+# __int128 code;
+# (**) SSE2 implementation was attempted, but among non-AVX processors
+# it was faster than integer-only code only on older Intel P4 and
+# Core processors, 50-30%, less newer processor is, but slower on
+# contemporary ones, for example almost 2x slower on Atom, and as
+# former are naturally disappearing, SSE2 is deemed unnecessary;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=12);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
+my ($mac,$nonce)=($inp,$len); # *_emit arguments
+my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
+my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
+
+sub poly1305_iteration {
+# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
+# output: $h0-$h2 *= $r0-$r1
+$code.=<<___;
+ mulq $h0 # h0*r1
+ mov %rax,$d2
+ mov $r0,%rax
+ mov %rdx,$d3
+
+ mulq $h0 # h0*r0
+ mov %rax,$h0 # future $h0
+ mov $r0,%rax
+ mov %rdx,$d1
+
+ mulq $h1 # h1*r0
+ add %rax,$d2
+ mov $s1,%rax
+ adc %rdx,$d3
+
+ mulq $h1 # h1*s1
+ mov $h2,$h1 # borrow $h1
+ add %rax,$h0
+ adc %rdx,$d1
+
+ imulq $s1,$h1 # h2*s1
+ add $h1,$d2
+ mov $d1,$h1
+ adc \$0,$d3
+
+ imulq $r0,$h2 # h2*r0
+ add $d2,$h1
+ mov \$-4,%rax # mask value
+ adc $h2,$d3
+
+ and $d3,%rax # last reduction step
+ mov $d3,$h2
+ shr \$2,$d3
+ and \$3,$h2
+ add $d3,%rax
+ add %rax,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+___
+}
+
+########################################################################
+# Layout of opaque area is following.
+#
+# unsigned __int64 h[3]; # current hash value base 2^64
+# unsigned __int64 r[2]; # key value base 2^64
+
+$code.=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl poly1305_init
+.hidden poly1305_init
+.globl poly1305_blocks
+.hidden poly1305_blocks
+.globl poly1305_emit
+.hidden poly1305_emit
+
+.type poly1305_init,\@function,3
+.align 32
+poly1305_init:
+ xor %rax,%rax
+ mov %rax,0($ctx) # initialize hash value
+ mov %rax,8($ctx)
+ mov %rax,16($ctx)
+
+ cmp \$0,$inp
+ je .Lno_key
+
+ lea poly1305_blocks(%rip),%r10
+ lea poly1305_emit(%rip),%r11
+___
+$code.=<<___ if ($avx);
+ mov OPENSSL_ia32cap_P+4(%rip),%r9
+ lea poly1305_blocks_avx(%rip),%rax
+ lea poly1305_emit_avx(%rip),%rcx
+ bt \$`60-32`,%r9 # AVX?
+ cmovc %rax,%r10
+ cmovc %rcx,%r11
+___
+$code.=<<___ if ($avx>1);
+ lea poly1305_blocks_avx2(%rip),%rax
+ bt \$`5+32`,%r9 # AVX2?
+ cmovc %rax,%r10
+___
+$code.=<<___;
+ mov \$0x0ffffffc0fffffff,%rax
+ mov \$0x0ffffffc0ffffffc,%rcx
+ and 0($inp),%rax
+ and 8($inp),%rcx
+ mov %rax,24($ctx)
+ mov %rcx,32($ctx)
+___
+$code.=<<___ if ($flavour !~ /elf32/);
+ mov %r10,0(%rdx)
+ mov %r11,8(%rdx)
+___
+$code.=<<___ if ($flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+___
+$code.=<<___;
+ mov \$1,%eax
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,\@function,4
+.align 32
+poly1305_blocks:
+.Lblocks:
+ shr \$4,$len
+ jz .Lno_data # too short
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+.Lblocks_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2
+
+ mov $s1,$r1
+ shr \$2,$s1
+ mov $r1,%rax
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+ jmp .Loop
+
+.align 32
+.Loop:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+___
+ &poly1305_iteration();
+$code.=<<___;
+ mov $r1,%rax
+ dec %r15 # len-=16
+ jnz .Loop
+
+ mov $h0,0($ctx) # store hash value
+ mov $h1,8($ctx)
+ mov $h2,16($ctx)
+
+ mov 0(%rsp),%r15
+ mov 8(%rsp),%r14
+ mov 16(%rsp),%r13
+ mov 24(%rsp),%r12
+ mov 32(%rsp),%rbp
+ mov 40(%rsp),%rbx
+ lea 48(%rsp),%rsp
+.Lno_data:
+.Lblocks_epilogue:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,\@function,3
+.align 32
+poly1305_emit:
+.Lemit:
+ mov 0($ctx),%r8 # load hash value
+ mov 8($ctx),%r9
+ mov 16($ctx),%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overfow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+___
+if ($avx) {
+
+########################################################################
+# Layout of opaque area is following.
+#
+# unsigned __int32 h[5]; # current hash value base 2^26
+# unsigned __int32 is_base2_26;
+# unsigned __int64 r[2]; # key value base 2^64
+# unsigned __int64 pad;
+# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
+#
+# where r^n are base 2^26 digits of degrees of multiplier key. There are
+# 5 digits, but last four are interleaved with multiples of 5, totalling
+# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
+
+my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
+ map("%xmm$_",(0..15));
+
+$code.=<<___;
+.type __poly1305_block,\@abi-omnipotent
+.align 32
+__poly1305_block:
+___
+ &poly1305_iteration();
+$code.=<<___;
+ ret
+.size __poly1305_block,.-__poly1305_block
+
+.type __poly1305_init_avx,\@abi-omnipotent
+.align 32
+__poly1305_init_avx:
+ mov $r0,$h0
+ mov $r1,$h1
+ xor $h2,$h2
+
+ lea 48+64($ctx),$ctx # size optimization
+
+ mov $r1,%rax
+ call __poly1305_block # r^2
+
+ mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
+ mov \$0x3ffffff,%edx
+ mov $h0,$d1
+ and $h0#d,%eax
+ mov $r0,$d2
+ and $r0#d,%edx
+ mov %eax,`16*0+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*0+4-64`($ctx)
+ shr \$26,$d2
+
+ mov \$0x3ffffff,%eax
+ mov \$0x3ffffff,%edx
+ and $d1#d,%eax
+ and $d2#d,%edx
+ mov %eax,`16*1+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*1+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*2+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*2+4-64`($ctx)
+ shr \$26,$d2
+
+ mov $h1,%rax
+ mov $r1,%rdx
+ shl \$12,%rax
+ shl \$12,%rdx
+ or $d1,%rax
+ or $d2,%rdx
+ and \$0x3ffffff,%eax
+ and \$0x3ffffff,%edx
+ mov %eax,`16*3+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*3+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*4+0-64`($ctx)
+ mov $h1,$d1
+ mov %edx,`16*4+4-64`($ctx)
+ mov $r1,$d2
+
+ mov \$0x3ffffff,%eax
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ shr \$14,$d2
+ and $d1#d,%eax
+ and $d2#d,%edx
+ mov %eax,`16*5+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*5+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*6+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*6+4-64`($ctx)
+ shr \$26,$d2
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+0-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d2#d,`16*7+4-64`($ctx)
+ lea ($d2,$d2,4),$d2 # *5
+ mov $d1#d,`16*8+0-64`($ctx)
+ mov $d2#d,`16*8+4-64`($ctx)
+
+ mov $r1,%rax
+ call __poly1305_block # r^3
+
+ mov \$0x3ffffff,%eax # save r^3 base 2^26
+ mov $h0,$d1
+ and $h0#d,%eax
+ shr \$26,$d1
+ mov %eax,`16*0+12-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ and $d1#d,%edx
+ mov %edx,`16*1+12-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*2+12-64`($ctx)
+
+ mov $h1,%rax
+ shl \$12,%rax
+ or $d1,%rax
+ and \$0x3ffffff,%eax
+ mov %eax,`16*3+12-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov $h1,$d1
+ mov %eax,`16*4+12-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ and $d1#d,%edx
+ mov %edx,`16*5+12-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*6+12-64`($ctx)
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+12-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d1#d,`16*8+12-64`($ctx)
+
+ mov $r1,%rax
+ call __poly1305_block # r^4
+
+ mov \$0x3ffffff,%eax # save r^4 base 2^26
+ mov $h0,$d1
+ and $h0#d,%eax
+ shr \$26,$d1
+ mov %eax,`16*0+8-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ and $d1#d,%edx
+ mov %edx,`16*1+8-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*2+8-64`($ctx)
+
+ mov $h1,%rax
+ shl \$12,%rax
+ or $d1,%rax
+ and \$0x3ffffff,%eax
+ mov %eax,`16*3+8-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov $h1,$d1
+ mov %eax,`16*4+8-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ and $d1#d,%edx
+ mov %edx,`16*5+8-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*6+8-64`($ctx)
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+8-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d1#d,`16*8+8-64`($ctx)
+
+ lea -48-64($ctx),$ctx # size [de-]optimization
+ ret
+.size __poly1305_init_avx,.-__poly1305_init_avx
+
+.type poly1305_blocks_avx,\@function,4
+.align 32
+poly1305_blocks_avx:
+ mov 20($ctx),%r8d # is_base2_26
+ cmp \$128,$len
+ jae .Lblocks_avx
+ test %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx:
+ and \$-16,$len
+ jz .Lno_data_avx
+
+ vzeroupper
+
+ test %r8d,%r8d
+ jz .Lbase2_64_avx
+
+ test \$31,$len
+ jz .Leven_avx
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+.Lblocks_avx_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 0($ctx),$d1 # load hash value
+ mov 8($ctx),$d2
+ mov 16($ctx),$h2#d
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ ################################# base 2^26 -> base 2^64
+ mov $d1#d,$h0#d
+ and \$`-1*(1<<31)`,$d1
+ mov $d2,$r1 # borrow $r1
+ mov $d2#d,$h1#d
+ and \$`-1*(1<<31)`,$d2
+
+ shr \$6,$d1
+ shl \$52,$r1
+ add $d1,$h0
+ shr \$12,$h1
+ shr \$18,$d2
+ add $r1,$h0
+ adc $d2,$h1
+
+ mov $h2,$d1
+ shl \$40,$d1
+ shr \$24,$h2
+ add $d1,$h1
+ adc \$0,$h2 # can be partially reduced...
+
+ mov \$-4,$d2 # ... so reduce
+ mov $h2,$d1
+ and $h2,$d2
+ shr \$2,$d1
+ and \$3,$h2
+ add $d2,$d1 # =*5
+ add $d1,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+
+ call __poly1305_block
+
+ test $padbit,$padbit # if $padbit is zero,
+ jz .Lstore_base2_64_avx # store hash in base 2^64 format
+
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$r0
+ mov $h1,$r1
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$r0
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $r0,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$r1
+ and \$0x3ffffff,$h1 # h[3]
+ or $r1,$h2 # h[4]
+
+ sub \$16,%r15
+ jz .Lstore_base2_26_avx
+
+ vmovd %rax#d,$H0
+ vmovd %rdx#d,$H1
+ vmovd $h0#d,$H2
+ vmovd $h1#d,$H3
+ vmovd $h2#d,$H4
+ jmp .Lproceed_avx
+
+.align 32
+.Lstore_base2_64_avx:
+ mov $h0,0($ctx)
+ mov $h1,8($ctx)
+ mov $h2,16($ctx) # note that is_base2_26 is zeroed
+ jmp .Ldone_avx
+
+.align 16
+.Lstore_base2_26_avx:
+ mov %rax#d,0($ctx) # store hash value base 2^26
+ mov %rdx#d,4($ctx)
+ mov $h0#d,8($ctx)
+ mov $h1#d,12($ctx)
+ mov $h2#d,16($ctx)
+.align 16
+.Ldone_avx:
+ mov 0(%rsp),%r15
+ mov 8(%rsp),%r14
+ mov 16(%rsp),%r13
+ mov 24(%rsp),%r12
+ mov 32(%rsp),%rbp
+ mov 40(%rsp),%rbx
+ lea 48(%rsp),%rsp
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+ ret
+
+.align 32
+.Lbase2_64_avx:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+.Lbase2_64_avx_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2#d
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ test \$31,$len
+ jz .Linit_avx
+
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+
+.Linit_avx:
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$d1
+ mov $h1,$d2
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$d1
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $d1,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$d2
+ and \$0x3ffffff,$h1 # h[3]
+ or $d2,$h2 # h[4]
+
+ vmovd %rax#d,$H0
+ vmovd %rdx#d,$H1
+ vmovd $h0#d,$H2
+ vmovd $h1#d,$H3
+ vmovd $h2#d,$H4
+ movl \$1,20($ctx) # set is_base2_26
+
+ call __poly1305_init_avx
+
+.Lproceed_avx:
+ mov %r15,$len
+
+ mov 0(%rsp),%r15
+ mov 8(%rsp),%r14
+ mov 16(%rsp),%r13
+ mov 24(%rsp),%r12
+ mov 32(%rsp),%rbp
+ mov 40(%rsp),%rbx
+ lea 48(%rsp),%rax
+ lea 48(%rsp),%rsp
+.Lbase2_64_avx_epilogue:
+ jmp .Ldo_avx
+
+.align 32
+.Leven_avx:
+ vmovd 4*0($ctx),$H0 # load hash value
+ vmovd 4*1($ctx),$H1
+ vmovd 4*2($ctx),$H2
+ vmovd 4*3($ctx),$H3
+ vmovd 4*4($ctx),$H4
+
+.Ldo_avx:
+___
+$code.=<<___ if (!$win64);
+ lea -0x58(%rsp),%r11
+ sub \$0x178,%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xf8(%rsp),%r11
+ sub \$0x218,%rsp
+ vmovdqa %xmm6,0x50(%r11)
+ vmovdqa %xmm7,0x60(%r11)
+ vmovdqa %xmm8,0x70(%r11)
+ vmovdqa %xmm9,0x80(%r11)
+ vmovdqa %xmm10,0x90(%r11)
+ vmovdqa %xmm11,0xa0(%r11)
+ vmovdqa %xmm12,0xb0(%r11)
+ vmovdqa %xmm13,0xc0(%r11)
+ vmovdqa %xmm14,0xd0(%r11)
+ vmovdqa %xmm15,0xe0(%r11)
+.Ldo_avx_body:
+___
+$code.=<<___;
+ sub \$64,$len
+ lea -32($inp),%rax
+ cmovc %rax,$inp
+
+ vmovdqu `16*3`($ctx),$D4 # preload r0^2
+ lea `16*3+64`($ctx),$ctx # size optimization
+ lea .Lconst(%rip),%rcx
+
+ ################################################################
+ # load input
+ vmovdqu 16*2($inp),$T0
+ vmovdqu 16*3($inp),$T1
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpsrldq \$6,$T1,$T3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+
+ vpsrlq \$40,$T4,$T4 # 4
+ vpsrlq \$26,$T0,$T1
+ vpand $MASK,$T0,$T0 # 0
+ vpsrlq \$4,$T3,$T2
+ vpand $MASK,$T1,$T1 # 1
+ vpsrlq \$30,$T3,$T3
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ jbe .Lskip_loop_avx
+
+ # expand and copy pre-calculated table to stack
+ vmovdqu `16*1-64`($ctx),$D1
+ vmovdqu `16*2-64`($ctx),$D2
+ vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
+ vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
+ vmovdqa $D3,-0x90(%r11)
+ vmovdqa $D0,0x00(%rsp)
+ vpshufd \$0xEE,$D1,$D4
+ vmovdqu `16*3-64`($ctx),$D0
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D4,-0x80(%r11)
+ vmovdqa $D1,0x10(%rsp)
+ vpshufd \$0xEE,$D2,$D3
+ vmovdqu `16*4-64`($ctx),$D1
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D3,-0x70(%r11)
+ vmovdqa $D2,0x20(%rsp)
+ vpshufd \$0xEE,$D0,$D4
+ vmovdqu `16*5-64`($ctx),$D2
+ vpshufd \$0x44,$D0,$D0
+ vmovdqa $D4,-0x60(%r11)
+ vmovdqa $D0,0x30(%rsp)
+ vpshufd \$0xEE,$D1,$D3
+ vmovdqu `16*6-64`($ctx),$D0
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D3,-0x50(%r11)
+ vmovdqa $D1,0x40(%rsp)
+ vpshufd \$0xEE,$D2,$D4
+ vmovdqu `16*7-64`($ctx),$D1
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D4,-0x40(%r11)
+ vmovdqa $D2,0x50(%rsp)
+ vpshufd \$0xEE,$D0,$D3
+ vmovdqu `16*8-64`($ctx),$D2
+ vpshufd \$0x44,$D0,$D0
+ vmovdqa $D3,-0x30(%r11)
+ vmovdqa $D0,0x60(%rsp)
+ vpshufd \$0xEE,$D1,$D4
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D4,-0x20(%r11)
+ vmovdqa $D1,0x70(%rsp)
+ vpshufd \$0xEE,$D2,$D3
+ vmovdqa 0x00(%rsp),$D4 # preload r0^2
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D3,-0x10(%r11)
+ vmovdqa $D2,0x80(%rsp)
+
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+ ################################################################
+ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ # \___________________/
+ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ # \___________________/ \____________________/
+ #
+ # Note that we start with inp[2:3]*r^2. This is because it
+ # doesn't depend on reduction in previous iteration.
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ #
+ # though note that $Tx and $Hx are "reversed" in this section,
+ # and $D4 is preloaded with r0^2...
+
+ vpmuludq $T0,$D4,$D0 # d0 = h0*r0
+ vpmuludq $T1,$D4,$D1 # d1 = h1*r0
+ vmovdqa $H2,0x20(%r11) # offload hash
+ vpmuludq $T2,$D4,$D2 # d3 = h2*r0
+ vmovdqa 0x10(%rsp),$H2 # r1^2
+ vpmuludq $T3,$D4,$D3 # d3 = h3*r0
+ vpmuludq $T4,$D4,$D4 # d4 = h4*r0
+
+ vmovdqa $H0,0x00(%r11) #
+ vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
+ vmovdqa $H1,0x10(%r11) #
+ vpmuludq $T3,$H2,$H1 # h3*r1
+ vpaddq $H0,$D0,$D0 # d0 += h4*s1
+ vpaddq $H1,$D4,$D4 # d4 += h3*r1
+ vmovdqa $H3,0x30(%r11) #
+ vpmuludq $T2,$H2,$H0 # h2*r1
+ vpmuludq $T1,$H2,$H1 # h1*r1
+ vpaddq $H0,$D3,$D3 # d3 += h2*r1
+ vmovdqa 0x30(%rsp),$H3 # r2^2
+ vpaddq $H1,$D2,$D2 # d2 += h1*r1
+ vmovdqa $H4,0x40(%r11) #
+ vpmuludq $T0,$H2,$H2 # h0*r1
+ vpmuludq $T2,$H3,$H0 # h2*r2
+ vpaddq $H2,$D1,$D1 # d1 += h0*r1
+
+ vmovdqa 0x40(%rsp),$H4 # s2^2
+ vpaddq $H0,$D4,$D4 # d4 += h2*r2
+ vpmuludq $T1,$H3,$H1 # h1*r2
+ vpmuludq $T0,$H3,$H3 # h0*r2
+ vpaddq $H1,$D3,$D3 # d3 += h1*r2
+ vmovdqa 0x50(%rsp),$H2 # r3^2
+ vpaddq $H3,$D2,$D2 # d2 += h0*r2
+ vpmuludq $T4,$H4,$H0 # h4*s2
+ vpmuludq $T3,$H4,$H4 # h3*s2
+ vpaddq $H0,$D1,$D1 # d1 += h4*s2
+ vmovdqa 0x60(%rsp),$H3 # s3^2
+ vpaddq $H4,$D0,$D0 # d0 += h3*s2
+
+ vmovdqa 0x80(%rsp),$H4 # s4^2
+ vpmuludq $T1,$H2,$H1 # h1*r3
+ vpmuludq $T0,$H2,$H2 # h0*r3
+ vpaddq $H1,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $T4,$H3,$H0 # h4*s3
+ vpmuludq $T3,$H3,$H1 # h3*s3
+ vpaddq $H0,$D2,$D2 # d2 += h4*s3
+ vmovdqu 16*0($inp),$H0 # load input
+ vpaddq $H1,$D1,$D1 # d1 += h3*s3
+ vpmuludq $T2,$H3,$H3 # h2*s3
+ vpmuludq $T2,$H4,$T2 # h2*s4
+ vpaddq $H3,$D0,$D0 # d0 += h2*s3
+
+ vmovdqu 16*1($inp),$H1 #
+ vpaddq $T2,$D1,$D1 # d1 += h2*s4
+ vpmuludq $T3,$H4,$T3 # h3*s4
+ vpmuludq $T4,$H4,$T4 # h4*s4
+ vpsrldq \$6,$H0,$H2 # splat input
+ vpaddq $T3,$D2,$D2 # d2 += h3*s4
+ vpaddq $T4,$D3,$D3 # d3 += h4*s4
+ vpsrldq \$6,$H1,$H3 #
+ vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
+ vpmuludq $T1,$H4,$T0 # h1*s4
+ vpunpckhqdq $H1,$H0,$H4 # 4
+ vpaddq $T4,$D4,$D4 # d4 += h0*r4
+ vmovdqa -0x90(%r11),$T4 # r0^4
+ vpaddq $T0,$D0,$D0 # d0 += h1*s4
+
+ vpunpcklqdq $H1,$H0,$H0 # 0:1
+ vpunpcklqdq $H3,$H2,$H3 # 2:3
+
+ #vpsrlq \$40,$H4,$H4 # 4
+ vpsrldq \$`40/8`,$H4,$H4 # 4
+ vpsrlq \$26,$H0,$H1
+ vpand $MASK,$H0,$H0 # 0
+ vpsrlq \$4,$H3,$H2
+ vpand $MASK,$H1,$H1 # 1
+ vpand 0(%rcx),$H4,$H4 # .Lmask24
+ vpsrlq \$30,$H3,$H3
+ vpand $MASK,$H2,$H2 # 2
+ vpand $MASK,$H3,$H3 # 3
+ vpor 32(%rcx),$H4,$H4 # padbit, yes, always
+
+ vpaddq 0x00(%r11),$H0,$H0 # add hash value
+ vpaddq 0x10(%r11),$H1,$H1
+ vpaddq 0x20(%r11),$H2,$H2
+ vpaddq 0x30(%r11),$H3,$H3
+ vpaddq 0x40(%r11),$H4,$H4
+
+ lea 16*2($inp),%rax
+ lea 16*4($inp),$inp
+ sub \$64,$len
+ cmovc %rax,$inp
+
+ ################################################################
+ # Now we accumulate (inp[0:1]+hash)*r^4
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ vpmuludq $H0,$T4,$T0 # h0*r0
+ vpmuludq $H1,$T4,$T1 # h1*r0
+ vpaddq $T0,$D0,$D0
+ vpaddq $T1,$D1,$D1
+ vmovdqa -0x80(%r11),$T2 # r1^4
+ vpmuludq $H2,$T4,$T0 # h2*r0
+ vpmuludq $H3,$T4,$T1 # h3*r0
+ vpaddq $T0,$D2,$D2
+ vpaddq $T1,$D3,$D3
+ vpmuludq $H4,$T4,$T4 # h4*r0
+ vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
+ vpaddq $T4,$D4,$D4
+
+ vpaddq $T0,$D0,$D0 # d0 += h4*s1
+ vpmuludq $H2,$T2,$T1 # h2*r1
+ vpmuludq $H3,$T2,$T0 # h3*r1
+ vpaddq $T1,$D3,$D3 # d3 += h2*r1
+ vmovdqa -0x60(%r11),$T3 # r2^4
+ vpaddq $T0,$D4,$D4 # d4 += h3*r1
+ vpmuludq $H1,$T2,$T1 # h1*r1
+ vpmuludq $H0,$T2,$T2 # h0*r1
+ vpaddq $T1,$D2,$D2 # d2 += h1*r1
+ vpaddq $T2,$D1,$D1 # d1 += h0*r1
+
+ vmovdqa -0x50(%r11),$T4 # s2^4
+ vpmuludq $H2,$T3,$T0 # h2*r2
+ vpmuludq $H1,$T3,$T1 # h1*r2
+ vpaddq $T0,$D4,$D4 # d4 += h2*r2
+ vpaddq $T1,$D3,$D3 # d3 += h1*r2
+ vmovdqa -0x40(%r11),$T2 # r3^4
+ vpmuludq $H0,$T3,$T3 # h0*r2
+ vpmuludq $H4,$T4,$T0 # h4*s2
+ vpaddq $T3,$D2,$D2 # d2 += h0*r2
+ vpaddq $T0,$D1,$D1 # d1 += h4*s2
+ vmovdqa -0x30(%r11),$T3 # s3^4
+ vpmuludq $H3,$T4,$T4 # h3*s2
+ vpmuludq $H1,$T2,$T1 # h1*r3
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+
+ vmovdqa -0x10(%r11),$T4 # s4^4
+ vpaddq $T1,$D4,$D4 # d4 += h1*r3
+ vpmuludq $H0,$T2,$T2 # h0*r3
+ vpmuludq $H4,$T3,$T0 # h4*s3
+ vpaddq $T2,$D3,$D3 # d3 += h0*r3
+ vpaddq $T0,$D2,$D2 # d2 += h4*s3
+ vmovdqu 16*2($inp),$T0 # load input
+ vpmuludq $H3,$T3,$T2 # h3*s3
+ vpmuludq $H2,$T3,$T3 # h2*s3
+ vpaddq $T2,$D1,$D1 # d1 += h3*s3
+ vmovdqu 16*3($inp),$T1 #
+ vpaddq $T3,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $H2,$T4,$H2 # h2*s4
+ vpmuludq $H3,$T4,$H3 # h3*s4
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpaddq $H2,$D1,$D1 # d1 += h2*s4
+ vpmuludq $H4,$T4,$H4 # h4*s4
+ vpsrldq \$6,$T1,$T3 #
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
+ vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
+ vpmuludq $H1,$T4,$H0
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+
+ #vpsrlq \$40,$T4,$T4 # 4
+ vpsrldq \$`40/8`,$T4,$T4 # 4
+ vpsrlq \$26,$T0,$T1
+ vmovdqa 0x00(%rsp),$D4 # preload r0^2
+ vpand $MASK,$T0,$T0 # 0
+ vpsrlq \$4,$T3,$T2
+ vpand $MASK,$T1,$T1 # 1
+ vpand 0(%rcx),$T4,$T4 # .Lmask24
+ vpsrlq \$30,$T3,$T3
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ ################################################################
+ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ # and P. Schwabe
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D0
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D0,$H0,$H0
+ vpsllq \$2,$D0,$D0
+ vpaddq $D0,$H0,$H0 # h4 -> h0
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ ja .Loop_avx
+
+.Lskip_loop_avx:
+ ################################################################
+ # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
+ add \$32,$len
+ jnz .Long_tail_avx
+
+ vpaddq $H2,$T2,$T2
+ vpaddq $H0,$T0,$T0
+ vpaddq $H1,$T1,$T1
+ vpaddq $H3,$T3,$T3
+ vpaddq $H4,$T4,$T4
+
+.Long_tail_avx:
+ vmovdqa $H2,0x20(%r11)
+ vmovdqa $H0,0x00(%r11)
+ vmovdqa $H1,0x10(%r11)
+ vmovdqa $H3,0x30(%r11)
+ vmovdqa $H4,0x40(%r11)
+
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ vpmuludq $T2,$D4,$D2 # d2 = h2*r0
+ vpmuludq $T0,$D4,$D0 # d0 = h0*r0
+ vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
+ vpmuludq $T1,$D4,$D1 # d1 = h1*r0
+ vpmuludq $T3,$D4,$D3 # d3 = h3*r0
+ vpmuludq $T4,$D4,$D4 # d4 = h4*r0
+
+ vpmuludq $T3,$H2,$H0 # h3*r1
+ vpaddq $H0,$D4,$D4 # d4 += h3*r1
+ vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
+ vpmuludq $T2,$H2,$H1 # h2*r1
+ vpaddq $H1,$D3,$D3 # d3 += h2*r1
+ vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
+ vpmuludq $T1,$H2,$H0 # h1*r1
+ vpaddq $H0,$D2,$D2 # d2 += h1*r1
+ vpmuludq $T0,$H2,$H2 # h0*r1
+ vpaddq $H2,$D1,$D1 # d1 += h0*r1
+ vpmuludq $T4,$H3,$H3 # h4*s1
+ vpaddq $H3,$D0,$D0 # d0 += h4*s1
+
+ vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
+ vpmuludq $T2,$H4,$H1 # h2*r2
+ vpaddq $H1,$D4,$D4 # d4 += h2*r2
+ vpmuludq $T1,$H4,$H0 # h1*r2
+ vpaddq $H0,$D3,$D3 # d3 += h1*r2
+ vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
+ vpmuludq $T0,$H4,$H4 # h0*r2
+ vpaddq $H4,$D2,$D2 # d2 += h0*r2
+ vpmuludq $T4,$H2,$H1 # h4*s2
+ vpaddq $H1,$D1,$D1 # d1 += h4*s2
+ vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
+ vpmuludq $T3,$H2,$H2 # h3*s2
+ vpaddq $H2,$D0,$D0 # d0 += h3*s2
+
+ vpmuludq $T1,$H3,$H0 # h1*r3
+ vpaddq $H0,$D4,$D4 # d4 += h1*r3
+ vpmuludq $T0,$H3,$H3 # h0*r3
+ vpaddq $H3,$D3,$D3 # d3 += h0*r3
+ vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
+ vpmuludq $T4,$H4,$H1 # h4*s3
+ vpaddq $H1,$D2,$D2 # d2 += h4*s3
+ vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
+ vpmuludq $T3,$H4,$H0 # h3*s3
+ vpaddq $H0,$D1,$D1 # d1 += h3*s3
+ vpmuludq $T2,$H4,$H4 # h2*s3
+ vpaddq $H4,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $T0,$H2,$H2 # h0*r4
+ vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
+ vpmuludq $T4,$H3,$H1 # h4*s4
+ vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
+ vpmuludq $T3,$H3,$H0 # h3*s4
+ vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
+ vpmuludq $T2,$H3,$H1 # h2*s4
+ vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
+ vpmuludq $T1,$H3,$H3 # h1*s4
+ vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
+
+ jz .Lshort_tail_avx
+
+ vmovdqu 16*0($inp),$H0 # load input
+ vmovdqu 16*1($inp),$H1
+
+ vpsrldq \$6,$H0,$H2 # splat input
+ vpsrldq \$6,$H1,$H3
+ vpunpckhqdq $H1,$H0,$H4 # 4
+ vpunpcklqdq $H1,$H0,$H0 # 0:1
+ vpunpcklqdq $H3,$H2,$H3 # 2:3
+
+ vpsrlq \$40,$H4,$H4 # 4
+ vpsrlq \$26,$H0,$H1
+ vpand $MASK,$H0,$H0 # 0
+ vpsrlq \$4,$H3,$H2
+ vpand $MASK,$H1,$H1 # 1
+ vpsrlq \$30,$H3,$H3
+ vpand $MASK,$H2,$H2 # 2
+ vpand $MASK,$H3,$H3 # 3
+ vpor 32(%rcx),$H4,$H4 # padbit, yes, always
+
+ vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
+ vpaddq 0x00(%r11),$H0,$H0
+ vpaddq 0x10(%r11),$H1,$H1
+ vpaddq 0x20(%r11),$H2,$H2
+ vpaddq 0x30(%r11),$H3,$H3
+ vpaddq 0x40(%r11),$H4,$H4
+
+ ################################################################
+ # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
+
+ vpmuludq $H0,$T4,$T0 # h0*r0
+ vpaddq $T0,$D0,$D0 # d0 += h0*r0
+ vpmuludq $H1,$T4,$T1 # h1*r0
+ vpaddq $T1,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H2,$T4,$T0 # h2*r0
+ vpaddq $T0,$D2,$D2 # d2 += h2*r0
+ vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
+ vpmuludq $H3,$T4,$T1 # h3*r0
+ vpaddq $T1,$D3,$D3 # d3 += h3*r0
+ vpmuludq $H4,$T4,$T4 # h4*r0
+ vpaddq $T4,$D4,$D4 # d4 += h4*r0
+
+ vpmuludq $H3,$T2,$T0 # h3*r1
+ vpaddq $T0,$D4,$D4 # d4 += h3*r1
+ vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
+ vpmuludq $H2,$T2,$T1 # h2*r1
+ vpaddq $T1,$D3,$D3 # d3 += h2*r1
+ vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
+ vpmuludq $H1,$T2,$T0 # h1*r1
+ vpaddq $T0,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H0,$T2,$T2 # h0*r1
+ vpaddq $T2,$D1,$D1 # d1 += h0*r1
+ vpmuludq $H4,$T3,$T3 # h4*s1
+ vpaddq $T3,$D0,$D0 # d0 += h4*s1
+
+ vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
+ vpmuludq $H2,$T4,$T1 # h2*r2
+ vpaddq $T1,$D4,$D4 # d4 += h2*r2
+ vpmuludq $H1,$T4,$T0 # h1*r2
+ vpaddq $T0,$D3,$D3 # d3 += h1*r2
+ vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
+ vpmuludq $H0,$T4,$T4 # h0*r2
+ vpaddq $T4,$D2,$D2 # d2 += h0*r2
+ vpmuludq $H4,$T2,$T1 # h4*s2
+ vpaddq $T1,$D1,$D1 # d1 += h4*s2
+ vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
+ vpmuludq $H3,$T2,$T2 # h3*s2
+ vpaddq $T2,$D0,$D0 # d0 += h3*s2
+
+ vpmuludq $H1,$T3,$T0 # h1*r3
+ vpaddq $T0,$D4,$D4 # d4 += h1*r3
+ vpmuludq $H0,$T3,$T3 # h0*r3
+ vpaddq $T3,$D3,$D3 # d3 += h0*r3
+ vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
+ vpmuludq $H4,$T4,$T1 # h4*s3
+ vpaddq $T1,$D2,$D2 # d2 += h4*s3
+ vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
+ vpmuludq $H3,$T4,$T0 # h3*s3
+ vpaddq $T0,$D1,$D1 # d1 += h3*s3
+ vpmuludq $H2,$T4,$T4 # h2*s3
+ vpaddq $T4,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $H0,$T2,$T2 # h0*r4
+ vpaddq $T2,$D4,$D4 # d4 += h0*r4
+ vpmuludq $H4,$T3,$T1 # h4*s4
+ vpaddq $T1,$D3,$D3 # d3 += h4*s4
+ vpmuludq $H3,$T3,$T0 # h3*s4
+ vpaddq $T0,$D2,$D2 # d2 += h3*s4
+ vpmuludq $H2,$T3,$T1 # h2*s4
+ vpaddq $T1,$D1,$D1 # d1 += h2*s4
+ vpmuludq $H1,$T3,$T3 # h1*s4
+ vpaddq $T3,$D0,$D0 # d0 += h1*s4
+
+.Lshort_tail_avx:
+ ################################################################
+ # horizontal addition
+
+ vpsrldq \$8,$D4,$T4
+ vpsrldq \$8,$D3,$T3
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$D0,$T0
+ vpsrldq \$8,$D2,$T2
+ vpaddq $T3,$D3,$D3
+ vpaddq $T4,$D4,$D4
+ vpaddq $T0,$D0,$D0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$D2,$D2
+
+ ################################################################
+ # lazy reduction
+
+ vpsrlq \$26,$D3,$H3
+ vpand $MASK,$D3,$D3
+ vpaddq $H3,$D4,$D4 # h3 -> h4
+
+ vpsrlq \$26,$D0,$H0
+ vpand $MASK,$D0,$D0
+ vpaddq $H0,$D1,$D1 # h0 -> h1
+
+ vpsrlq \$26,$D4,$H4
+ vpand $MASK,$D4,$D4
+
+ vpsrlq \$26,$D1,$H1
+ vpand $MASK,$D1,$D1
+ vpaddq $H1,$D2,$D2 # h1 -> h2
+
+ vpaddq $H4,$D0,$D0
+ vpsllq \$2,$H4,$H4
+ vpaddq $H4,$D0,$D0 # h4 -> h0
+
+ vpsrlq \$26,$D2,$H2
+ vpand $MASK,$D2,$D2
+ vpaddq $H2,$D3,$D3 # h2 -> h3
+
+ vpsrlq \$26,$D0,$H0
+ vpand $MASK,$D0,$D0
+ vpaddq $H0,$D1,$D1 # h0 -> h1
+
+ vpsrlq \$26,$D3,$H3
+ vpand $MASK,$D3,$D3
+ vpaddq $H3,$D4,$D4 # h3 -> h4
+
+ vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
+ vmovd $D1,`4*1-48-64`($ctx)
+ vmovd $D2,`4*2-48-64`($ctx)
+ vmovd $D3,`4*3-48-64`($ctx)
+ vmovd $D4,`4*4-48-64`($ctx)
+___
+$code.=<<___ if ($win64);
+ vmovdqa 0x50(%r11),%xmm6
+ vmovdqa 0x60(%r11),%xmm7
+ vmovdqa 0x70(%r11),%xmm8
+ vmovdqa 0x80(%r11),%xmm9
+ vmovdqa 0x90(%r11),%xmm10
+ vmovdqa 0xa0(%r11),%xmm11
+ vmovdqa 0xb0(%r11),%xmm12
+ vmovdqa 0xc0(%r11),%xmm13
+ vmovdqa 0xd0(%r11),%xmm14
+ vmovdqa 0xe0(%r11),%xmm15
+ lea 0xf8(%r11),%rsp
+.Ldo_avx_epilogue:
+___
+$code.=<<___ if (!$win64);
+ lea 0x58(%r11),%rsp
+___
+$code.=<<___;
+ vzeroupper
+ ret
+.size poly1305_blocks_avx,.-poly1305_blocks_avx
+
+.type poly1305_emit_avx,\@function,3
+.align 32
+poly1305_emit_avx:
+ cmpl \$0,20($ctx) # is_base2_26?
+ je .Lemit
+
+ mov 0($ctx),%eax # load hash value base 2^26
+ mov 4($ctx),%ecx
+ mov 8($ctx),%r8d
+ mov 12($ctx),%r11d
+ mov 16($ctx),%r10d
+
+ shl \$26,%rcx # base 2^26 -> base 2^64
+ mov %r8,%r9
+ shl \$52,%r8
+ add %rcx,%rax
+ shr \$12,%r9
+ add %rax,%r8 # h0
+ adc \$0,%r9
+
+ shl \$14,%r11
+ mov %r10,%rax
+ shr \$24,%r10
+ add %r11,%r9
+ shl \$40,%rax
+ add %rax,%r9 # h1
+ adc \$0,%r10 # h2
+
+ mov %r10,%rax # could be partially reduced, so reduce
+ mov %r10,%rcx
+ and \$3,%r10
+ shr \$2,%rax
+ and \$-4,%rcx
+ add %rcx,%rax
+ add %rax,%r8
+ adc \$0,%r9
+ adc \$0,%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overfow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ ret
+.size poly1305_emit_avx,.-poly1305_emit_avx
+___
+
+if ($avx>1) {
+my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
+ map("%ymm$_",(0..15));
+my $S4=$MASK;
+
+$code.=<<___;
+.type poly1305_blocks_avx2,\@function,4
+.align 32
+poly1305_blocks_avx2:
+ mov 20($ctx),%r8d # is_base2_26
+ cmp \$128,$len
+ jae .Lblocks_avx2
+ test %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx2:
+ and \$-16,$len
+ jz .Lno_data_avx2
+
+ vzeroupper
+
+ test %r8d,%r8d
+ jz .Lbase2_64_avx2
+
+ test \$63,$len
+ jz .Leven_avx2
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+.Lblocks_avx2_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 0($ctx),$d1 # load hash value
+ mov 8($ctx),$d2
+ mov 16($ctx),$h2#d
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ ################################# base 2^26 -> base 2^64
+ mov $d1#d,$h0#d
+ and \$`-1*(1<<31)`,$d1
+ mov $d2,$r1 # borrow $r1
+ mov $d2#d,$h1#d
+ and \$`-1*(1<<31)`,$d2
+
+ shr \$6,$d1
+ shl \$52,$r1
+ add $d1,$h0
+ shr \$12,$h1
+ shr \$18,$d2
+ add $r1,$h0
+ adc $d2,$h1
+
+ mov $h2,$d1
+ shl \$40,$d1
+ shr \$24,$h2
+ add $d1,$h1
+ adc \$0,$h2 # can be partially reduced...
+
+ mov \$-4,$d2 # ... so reduce
+ mov $h2,$d1
+ and $h2,$d2
+ shr \$2,$d1
+ and \$3,$h2
+ add $d2,$d1 # =*5
+ add $d1,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+.Lbase2_26_pre_avx2:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+ mov $r1,%rax
+
+ test \$63,%r15
+ jnz .Lbase2_26_pre_avx2
+
+ test $padbit,$padbit # if $padbit is zero,
+ jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
+
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$r0
+ mov $h1,$r1
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$r0
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $r0,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$r1
+ and \$0x3ffffff,$h1 # h[3]
+ or $r1,$h2 # h[4]
+
+ test %r15,%r15
+ jz .Lstore_base2_26_avx2
+
+ vmovd %rax#d,%x#$H0
+ vmovd %rdx#d,%x#$H1
+ vmovd $h0#d,%x#$H2
+ vmovd $h1#d,%x#$H3
+ vmovd $h2#d,%x#$H4
+ jmp .Lproceed_avx2
+
+.align 32
+.Lstore_base2_64_avx2:
+ mov $h0,0($ctx)
+ mov $h1,8($ctx)
+ mov $h2,16($ctx) # note that is_base2_26 is zeroed
+ jmp .Ldone_avx2
+
+.align 16
+.Lstore_base2_26_avx2:
+ mov %rax#d,0($ctx) # store hash value base 2^26
+ mov %rdx#d,4($ctx)
+ mov $h0#d,8($ctx)
+ mov $h1#d,12($ctx)
+ mov $h2#d,16($ctx)
+.align 16
+.Ldone_avx2:
+ mov 0(%rsp),%r15
+ mov 8(%rsp),%r14
+ mov 16(%rsp),%r13
+ mov 24(%rsp),%r12
+ mov 32(%rsp),%rbp
+ mov 40(%rsp),%rbx
+ lea 48(%rsp),%rsp
+.Lno_data_avx2:
+.Lblocks_avx2_epilogue:
+ ret
+
+.align 32
+.Lbase2_64_avx2:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+.Lbase2_64_avx2_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2#d
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ test \$63,$len
+ jz .Linit_avx2
+
+.Lbase2_64_pre_avx2:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+ mov $r1,%rax
+
+ test \$63,%r15
+ jnz .Lbase2_64_pre_avx2
+
+.Linit_avx2:
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$d1
+ mov $h1,$d2
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$d1
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $d1,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$d2
+ and \$0x3ffffff,$h1 # h[3]
+ or $d2,$h2 # h[4]
+
+ vmovd %rax#d,%x#$H0
+ vmovd %rdx#d,%x#$H1
+ vmovd $h0#d,%x#$H2
+ vmovd $h1#d,%x#$H3
+ vmovd $h2#d,%x#$H4
+ movl \$1,20($ctx) # set is_base2_26
+
+ call __poly1305_init_avx
+
+.Lproceed_avx2:
+ mov %r15,$len
+
+ mov 0(%rsp),%r15
+ mov 8(%rsp),%r14
+ mov 16(%rsp),%r13
+ mov 24(%rsp),%r12
+ mov 32(%rsp),%rbp
+ mov 40(%rsp),%rbx
+ lea 48(%rsp),%rax
+ lea 48(%rsp),%rsp
+.Lbase2_64_avx2_epilogue:
+ jmp .Ldo_avx2
+
+.align 32
+.Leven_avx2:
+ vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
+ vmovd 4*1($ctx),%x#$H1
+ vmovd 4*2($ctx),%x#$H2
+ vmovd 4*3($ctx),%x#$H3
+ vmovd 4*4($ctx),%x#$H4
+
+.Ldo_avx2:
+___
+$code.=<<___ if (!$win64);
+ lea -8(%rsp),%r11
+ sub \$0x128,%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xf8(%rsp),%r11
+ sub \$0x1c8,%rsp
+ vmovdqa %xmm6,0x50(%r11)
+ vmovdqa %xmm7,0x60(%r11)
+ vmovdqa %xmm8,0x70(%r11)
+ vmovdqa %xmm9,0x80(%r11)
+ vmovdqa %xmm10,0x90(%r11)
+ vmovdqa %xmm11,0xa0(%r11)
+ vmovdqa %xmm12,0xb0(%r11)
+ vmovdqa %xmm13,0xc0(%r11)
+ vmovdqa %xmm14,0xd0(%r11)
+ vmovdqa %xmm15,0xe0(%r11)
+.Ldo_avx2_body:
+___
+$code.=<<___;
+ lea 48+64($ctx),$ctx # size optimization
+ lea .Lconst(%rip),%rcx
+
+ # expand and copy pre-calculated table to stack
+ vmovdqu `16*0-64`($ctx),%x#$T2
+ and \$-512,%rsp
+ vmovdqu `16*1-64`($ctx),%x#$T3
+ vmovdqu `16*2-64`($ctx),%x#$T4
+ vmovdqu `16*3-64`($ctx),%x#$D0
+ vmovdqu `16*4-64`($ctx),%x#$D1
+ vmovdqu `16*5-64`($ctx),%x#$D2
+ vmovdqu `16*6-64`($ctx),%x#$D3
+ vpermq \$0x15,$T2,$T2 # 00003412 -> 12343434
+ vmovdqu `16*7-64`($ctx),%x#$D4
+ vpermq \$0x15,$T3,$T3
+ vpshufd \$0xc8,$T2,$T2 # 12343434 -> 14243444
+ vmovdqu `16*8-64`($ctx),%x#$MASK
+ vpermq \$0x15,$T4,$T4
+ vpshufd \$0xc8,$T3,$T3
+ vmovdqa $T2,0x00(%rsp)
+ vpermq \$0x15,$D0,$D0
+ vpshufd \$0xc8,$T4,$T4
+ vmovdqa $T3,0x20(%rsp)
+ vpermq \$0x15,$D1,$D1
+ vpshufd \$0xc8,$D0,$D0
+ vmovdqa $T4,0x40(%rsp)
+ vpermq \$0x15,$D2,$D2
+ vpshufd \$0xc8,$D1,$D1
+ vmovdqa $D0,0x60(%rsp)
+ vpermq \$0x15,$D3,$D3
+ vpshufd \$0xc8,$D2,$D2
+ vmovdqa $D1,0x80(%rsp)
+ vpermq \$0x15,$D4,$D4
+ vpshufd \$0xc8,$D3,$D3
+ vmovdqa $D2,0xa0(%rsp)
+ vpermq \$0x15,$MASK,$MASK
+ vpshufd \$0xc8,$D4,$D4
+ vmovdqa $D3,0xc0(%rsp)
+ vpshufd \$0xc8,$MASK,$MASK
+ vmovdqa $D4,0xe0(%rsp)
+ vmovdqa $MASK,0x100(%rsp)
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+
+ ################################################################
+ # load input
+ vmovdqu 16*0($inp),%x#$T0
+ vmovdqu 16*1($inp),%x#$T1
+ vinserti128 \$1,16*2($inp),$T0,$T0
+ vinserti128 \$1,16*3($inp),$T1,$T1
+ lea 16*4($inp),$inp
+
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpsrldq \$6,$T1,$T3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpunpcklqdq $T3,$T2,$T2 # 2:3
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+
+ vpsrlq \$30,$T2,$T3
+ vpsrlq \$4,$T2,$T2
+ vpsrlq \$26,$T0,$T1
+ vpsrlq \$40,$T4,$T4 # 4
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T0,$T0 # 0
+ vpand $MASK,$T1,$T1 # 1
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ lea 0x90(%rsp),%rax # size optimization
+ vpaddq $H2,$T2,$H2 # accumulate input
+ sub \$64,$len
+ jz .Ltail_avx2
+ jmp .Loop_avx2
+
+.align 32
+.Loop_avx2:
+ ################################################################
+ # ((inp[0]*r^4+r[4])*r^4+r[8])*r^4
+ # ((inp[1]*r^4+r[5])*r^4+r[9])*r^3
+ # ((inp[2]*r^4+r[6])*r^4+r[10])*r^2
+ # ((inp[3]*r^4+r[7])*r^4+r[11])*r^1
+ # \________/\________/
+ ################################################################
+ #vpaddq $H2,$T2,$H2 # accumulate input
+ vpaddq $H0,$T0,$H0
+ vmovdqa `32*0`(%rsp),$T0 # r0^4
+ vpaddq $H1,$T1,$H1
+ vmovdqa `32*1`(%rsp),$T1 # r1^4
+ vpaddq $H3,$T3,$H3
+ vmovdqa `32*3`(%rsp),$T2 # r2^4
+ vpaddq $H4,$T4,$H4
+ vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
+ vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
+
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ #
+ # however, as h2 is "chronologically" first one available pull
+ # corresponding operations up, so it's
+ #
+ # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
+ # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
+ # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
+
+ vpmuludq $H2,$T0,$D2 # d2 = h2*r0
+ vpmuludq $H2,$T1,$D3 # d3 = h2*r1
+ vpmuludq $H2,$T2,$D4 # d4 = h2*r2
+ vpmuludq $H2,$T3,$D0 # d0 = h2*s3
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+
+ vpmuludq $H0,$T1,$T4 # h0*r1
+ vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
+ vpaddq $T4,$D1,$D1 # d1 += h0*r1
+ vpaddq $H2,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H3,$T1,$T4 # h3*r1
+ vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
+ vpaddq $T4,$D4,$D4 # d4 += h3*r1
+ vpaddq $H2,$D0,$D0 # d0 += h4*s1
+ vmovdqa `32*4-0x90`(%rax),$T1 # s2
+
+ vpmuludq $H0,$T0,$T4 # h0*r0
+ vpmuludq $H1,$T0,$H2 # h1*r0
+ vpaddq $T4,$D0,$D0 # d0 += h0*r0
+ vpaddq $H2,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H3,$T0,$T4 # h3*r0
+ vpmuludq $H4,$T0,$H2 # h4*r0
+ vmovdqu 16*0($inp),%x#$T0 # load input
+ vpaddq $T4,$D3,$D3 # d3 += h3*r0
+ vpaddq $H2,$D4,$D4 # d4 += h4*r0
+ vinserti128 \$1,16*2($inp),$T0,$T0
+
+ vpmuludq $H3,$T1,$T4 # h3*s2
+ vpmuludq $H4,$T1,$H2 # h4*s2
+ vmovdqu 16*1($inp),%x#$T1
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+ vpaddq $H2,$D1,$D1 # d1 += h4*s2
+ vmovdqa `32*5-0x90`(%rax),$H2 # r3
+ vpmuludq $H1,$T2,$T4 # h1*r2
+ vpmuludq $H0,$T2,$T2 # h0*r2
+ vpaddq $T4,$D3,$D3 # d3 += h1*r2
+ vpaddq $T2,$D2,$D2 # d2 += h0*r2
+ vinserti128 \$1,16*3($inp),$T1,$T1
+ lea 16*4($inp),$inp
+
+ vpmuludq $H1,$H2,$T4 # h1*r3
+ vpmuludq $H0,$H2,$H2 # h0*r3
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpaddq $T4,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $H3,$T3,$T4 # h3*s3
+ vpmuludq $H4,$T3,$H2 # h4*s3
+ vpsrldq \$6,$T1,$T3
+ vpaddq $T4,$D1,$D1 # d1 += h3*s3
+ vpaddq $H2,$D2,$D2 # d2 += h4*s3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+
+ vpmuludq $H3,$S4,$H3 # h3*s4
+ vpmuludq $H4,$S4,$H4 # h4*s4
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+ vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
+ vpmuludq $H1,$S4,$H0 # h1*s4
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ ################################################################
+ # lazy reduction (interleaved with tail of input splat)
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D4
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$4,$T3,$T2
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpand $MASK,$T2,$T2 # 2
+ vpsrlq \$26,$T0,$T1
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpaddq $T2,$H2,$H2 # modulo-scheduled
+ vpsrlq \$30,$T3,$T3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$40,$T4,$T4 # 4
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpand $MASK,$T0,$T0 # 0
+ vpand $MASK,$T1,$T1 # 1
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ sub \$64,$len
+ jnz .Loop_avx2
+
+ .byte 0x66,0x90
+.Ltail_avx2:
+ ################################################################
+ # while above multiplications were by r^4 in all lanes, in last
+ # iteration we multiply least significant lane by r^4 and most
+ # significant one by r, so copy of above except that references
+ # to the precomputed table are displaced by 4...
+
+ #vpaddq $H2,$T2,$H2 # accumulate input
+ vpaddq $H0,$T0,$H0
+ vmovdqu `32*0+4`(%rsp),$T0 # r0^4
+ vpaddq $H1,$T1,$H1
+ vmovdqu `32*1+4`(%rsp),$T1 # r1^4
+ vpaddq $H3,$T3,$H3
+ vmovdqu `32*3+4`(%rsp),$T2 # r2^4
+ vpaddq $H4,$T4,$H4
+ vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
+ vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
+
+ vpmuludq $H2,$T0,$D2 # d2 = h2*r0
+ vpmuludq $H2,$T1,$D3 # d3 = h2*r1
+ vpmuludq $H2,$T2,$D4 # d4 = h2*r2
+ vpmuludq $H2,$T3,$D0 # d0 = h2*s3
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+
+ vpmuludq $H0,$T1,$T4 # h0*r1
+ vpmuludq $H1,$T1,$H2 # h1*r1
+ vpaddq $T4,$D1,$D1 # d1 += h0*r1
+ vpaddq $H2,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H3,$T1,$T4 # h3*r1
+ vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
+ vpaddq $T4,$D4,$D4 # d4 += h3*r1
+ vpaddq $H2,$D0,$D0 # d0 += h4*s1
+
+ vpmuludq $H0,$T0,$T4 # h0*r0
+ vpmuludq $H1,$T0,$H2 # h1*r0
+ vpaddq $T4,$D0,$D0 # d0 += h0*r0
+ vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
+ vpaddq $H2,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H3,$T0,$T4 # h3*r0
+ vpmuludq $H4,$T0,$H2 # h4*r0
+ vpaddq $T4,$D3,$D3 # d3 += h3*r0
+ vpaddq $H2,$D4,$D4 # d4 += h4*r0
+
+ vpmuludq $H3,$T1,$T4 # h3*s2
+ vpmuludq $H4,$T1,$H2 # h4*s2
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+ vpaddq $H2,$D1,$D1 # d1 += h4*s2
+ vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
+ vpmuludq $H1,$T2,$T4 # h1*r2
+ vpmuludq $H0,$T2,$T2 # h0*r2
+ vpaddq $T4,$D3,$D3 # d3 += h1*r2
+ vpaddq $T2,$D2,$D2 # d2 += h0*r2
+
+ vpmuludq $H1,$H2,$T4 # h1*r3
+ vpmuludq $H0,$H2,$H2 # h0*r3
+ vpaddq $T4,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $H3,$T3,$T4 # h3*s3
+ vpmuludq $H4,$T3,$H2 # h4*s3
+ vpaddq $T4,$D1,$D1 # d1 += h3*s3
+ vpaddq $H2,$D2,$D2 # d2 += h4*s3
+
+ vpmuludq $H3,$S4,$H3 # h3*s4
+ vpmuludq $H4,$S4,$H4 # h4*s4
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
+ vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
+ vpmuludq $H1,$S4,$H0 # h1*s4
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ ################################################################
+ # horizontal addition
+
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$H2,$T2
+ vpsrldq \$8,$H3,$T3
+ vpsrldq \$8,$H4,$T4
+ vpsrldq \$8,$H0,$T0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+
+ vpermq \$0x2,$H3,$T3
+ vpermq \$0x2,$H4,$T4
+ vpermq \$0x2,$H0,$T0
+ vpermq \$0x2,$D1,$T1
+ vpermq \$0x2,$H2,$T2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
+
+ ################################################################
+ # lazy reduction
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D4
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
+ vmovd %x#$H1,`4*1-48-64`($ctx)
+ vmovd %x#$H2,`4*2-48-64`($ctx)
+ vmovd %x#$H3,`4*3-48-64`($ctx)
+ vmovd %x#$H4,`4*4-48-64`($ctx)
+___
+$code.=<<___ if ($win64);
+ vmovdqa 0x50(%r11),%xmm6
+ vmovdqa 0x60(%r11),%xmm7
+ vmovdqa 0x70(%r11),%xmm8
+ vmovdqa 0x80(%r11),%xmm9
+ vmovdqa 0x90(%r11),%xmm10
+ vmovdqa 0xa0(%r11),%xmm11
+ vmovdqa 0xb0(%r11),%xmm12
+ vmovdqa 0xc0(%r11),%xmm13
+ vmovdqa 0xd0(%r11),%xmm14
+ vmovdqa 0xe0(%r11),%xmm15
+ lea 0xf8(%r11),%rsp
+.Ldo_avx2_epilogue:
+___
+$code.=<<___ if (!$win64);
+ lea 8(%r11),%rsp
+___
+$code.=<<___;
+ vzeroupper
+ ret
+.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
+___
+}
+$code.=<<___;
+.align 64
+.Lconst:
+.Lmask24:
+.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129:
+.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
+.Lmask26:
+.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lfive:
+.long 5,0,5,0,5,0,5,0
+___
+}
+
+$code.=<<___;
+.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lcommon_seh_tail
+
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R14
+
+ jmp .Lcommon_seh_tail
+.size se_handler,.-se_handler
+
+.type avx_handler,\@abi-omnipotent
+.align 16
+avx_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 208($context),%rax # pull context->R11
+
+ lea 0x50(%rax),%rsi
+ lea 0xf8(%rax),%rax
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size avx_handler,.-avx_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_poly1305_init
+ .rva .LSEH_end_poly1305_init
+ .rva .LSEH_info_poly1305_init
+
+ .rva .LSEH_begin_poly1305_blocks
+ .rva .LSEH_end_poly1305_blocks
+ .rva .LSEH_info_poly1305_blocks
+
+ .rva .LSEH_begin_poly1305_emit
+ .rva .LSEH_end_poly1305_emit
+ .rva .LSEH_info_poly1305_emit
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_poly1305_blocks_avx
+ .rva .Lbase2_64_avx
+ .rva .LSEH_info_poly1305_blocks_avx_1
+
+ .rva .Lbase2_64_avx
+ .rva .Leven_avx
+ .rva .LSEH_info_poly1305_blocks_avx_2
+
+ .rva .Leven_avx
+ .rva .LSEH_end_poly1305_blocks_avx
+ .rva .LSEH_info_poly1305_blocks_avx_3
+
+ .rva .LSEH_begin_poly1305_emit_avx
+ .rva .LSEH_end_poly1305_emit_avx
+ .rva .LSEH_info_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_poly1305_blocks_avx2
+ .rva .Lbase2_64_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_1
+
+ .rva .Lbase2_64_avx2
+ .rva .Leven_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_2
+
+ .rva .Leven_avx2
+ .rva .LSEH_end_poly1305_blocks_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_3
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_poly1305_init:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
+
+.LSEH_info_poly1305_blocks:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_body,.Lblocks_epilogue
+
+.LSEH_info_poly1305_emit:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
+___
+$code.=<<___ if ($avx);
+.LSEH_info_poly1305_blocks_avx_1:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_3:
+ .byte 9,0,0,0
+ .rva avx_handler
+ .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_emit_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_poly1305_blocks_avx2_1:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_3:
+ .byte 9,0,0,0
+ .rva avx_handler
+ .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
+___
+}
+
+foreach (split('\n',$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+ s/%r([a-z]+)#d/%e$1/g;
+ s/%r([0-9]+)#d/%r$1d/g;
+ s/%x#%y/%x/g;
+
+ print $_,"\n";
+}
+close STDOUT;