Diffstat (limited to 'openssl-1.1.0h/crypto/modes')
-rw-r--r--  openssl-1.1.0h/crypto/modes/asm/aesni-gcm-x86_64.pl  1106
-rw-r--r--  openssl-1.1.0h/crypto/modes/asm/ghash-alpha.pl        467
-rw-r--r--  openssl-1.1.0h/crypto/modes/asm/ghash-armv4.pl        554
-rw-r--r--  openssl-1.1.0h/crypto/modes/asm/ghash-c64xplus.pl     247
-rwxr-xr-x  openssl-1.1.0h/crypto/modes/asm/ghash-ia64.pl         470
-rw-r--r--  openssl-1.1.0h/crypto/modes/asm/ghash-parisc.pl       738
-rw-r--r--  openssl-1.1.0h/crypto/modes/asm/ghash-s390x.pl        258
-rw-r--r--  openssl-1.1.0h/crypto/modes/asm/ghash-sparcv9.pl      581
-rw-r--r--  openssl-1.1.0h/crypto/modes/asm/ghash-x86.pl         1405
-rw-r--r--  openssl-1.1.0h/crypto/modes/asm/ghash-x86_64.pl      1762
-rwxr-xr-x  openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl        670
-rw-r--r--  openssl-1.1.0h/crypto/modes/asm/ghashv8-armx.pl       430
-rw-r--r--  openssl-1.1.0h/crypto/modes/build.info                 27
-rw-r--r--  openssl-1.1.0h/crypto/modes/cbc128.c                  161
-rw-r--r--  openssl-1.1.0h/crypto/modes/ccm128.c                  432
-rw-r--r--  openssl-1.1.0h/crypto/modes/cfb128.c                  198
-rw-r--r--  openssl-1.1.0h/crypto/modes/ctr128.c                  209
-rw-r--r--  openssl-1.1.0h/crypto/modes/cts128.c                  523
-rw-r--r--  openssl-1.1.0h/crypto/modes/gcm128.c                 2301
-rw-r--r--  openssl-1.1.0h/crypto/modes/modes_lcl.h               185
-rw-r--r--  openssl-1.1.0h/crypto/modes/ocb128.c                  568
-rw-r--r--  openssl-1.1.0h/crypto/modes/ofb128.c                   74
-rw-r--r--  openssl-1.1.0h/crypto/modes/wrap128.c                 329
-rw-r--r--  openssl-1.1.0h/crypto/modes/xts128.c                  157
24 files changed, 13852 insertions, 0 deletions
diff --git a/openssl-1.1.0h/crypto/modes/asm/aesni-gcm-x86_64.pl b/openssl-1.1.0h/crypto/modes/asm/aesni-gcm-x86_64.pl
new file mode 100644
index 0000000..5ad62b3
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -0,0 +1,1106 @@
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# The OpenSSL GCM implementation is organized in such a way that its
+# performance is rather close to the sum of its streamed components,
+# in this context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible to
+# justify. This module is based on a combination of Intel submissions,
+# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
+# Locktyukhin of Intel Corp., who verified that it reduces shuffle
+# pressure with a notable relative improvement, achieving 1.0 cycle per
+# byte processed with a 128-bit key on Haswell, 0.74 on Broadwell and
+# 0.63 on Skylake... [The quoted results are raw profiled measurements
+# for a favourable packet size, one divisible by 96. Applications
+# using the EVP interface will observe a few percent worse
+# performance.]
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
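+# Conceptually, each iteration of the stitched loop below computes, in
+# C-like pseudo-code (names here are illustrative, not the actual
+# interface):
+#
+#	for (i=0;i<6;i++) keystream[i] = AES_k(counter++);	/* CTR	 */
+#	for (i=0;i<6;i++) out[i] = in[i] ^ keystream[i];
+#	for (i=0;i<6;i++) Xi = (Xi ^ ciphertext[i]) * H;	/* GHASH */
+#
+# with the multiplications in GF(2^128), except that the six GHASH
+# steps are folded into a single reduction using precomputed powers
+# H^1..H^6 (hence Htbl[9] in the Xip structure further below), and the
+# AES rounds are interleaved with the PCLMULQDQ instructions so that
+# both execution domains stay busy.
+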
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.20) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+if ($avx>1) {{{
+
+($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+
+($Ii,$T1,$T2,$Hkey,
+ $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
+
+($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
+
+($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
+
+$code=<<___;
+.text
+
+.type _aesni_ctr32_ghash_6x,\@abi-omnipotent
+.align 32
+_aesni_ctr32_ghash_6x:
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ sub \$6,$len
+ vpxor $Z0,$Z0,$Z0 # $Z0 = 0
+ vmovdqu 0x00-0x80($key),$rndkey
+ vpaddb $T2,$T1,$inout1
+ vpaddb $T2,$inout1,$inout2
+ vpaddb $T2,$inout2,$inout3
+ vpaddb $T2,$inout3,$inout4
+ vpaddb $T2,$inout4,$inout5
+ vpxor $rndkey,$T1,$inout0
+ vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
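+	# $counter caches the IV's last dword as loaded into a GPR, so the
+	# big-endian counter's least significant byte sits in bits 24-31.
+	# A carry out of the add below means that byte is about to wrap,
+	# in which case the byte-wise vpaddb increments would be wrong and
+	# .Lhandle_ctr32 re-derives the six counter values with
+	# byte-swapped 32-bit vpaddd arithmetic instead.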
+ add \$`6<<24`,$counter
+ jc .Lhandle_ctr32 # discard $inout[1-5]?
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpaddb $T2,$inout5,$T1 # next counter value
+ vpxor $rndkey,$inout1,$inout1
+ vpxor $rndkey,$inout2,$inout2
+
+.Lresume_ctr32:
+ vmovdqu $T1,($ivp) # save next counter value
+ vpclmulqdq \$0x10,$Hkey,$Z3,$Z1
+ vpxor $rndkey,$inout3,$inout3
+ vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey
+ vpclmulqdq \$0x01,$Hkey,$Z3,$Z2
+
+ # At this point, the current block of 96 (0x60) bytes has already been
+ # loaded into registers. Concurrently with processing it, we want to
+ # load the next 96 bytes of input for the next round. Obviously, we can
+ # only do this if there are at least 96 more bytes of input beyond the
+ # input we're currently processing, or else we'd read past the end of
+ # the input buffer. Here, we set |%r12| to 96 if there are at least 96
+ # bytes of input beyond the 96 bytes we're already processing, and we
+ # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
+ # we'll read in the next block so that it is in registers for the next
+ # loop iteration. In the case where we set |%r12| to 0, we'll re-read
+ # the current block and then ignore what we re-read.
+ #
+ # At this point, |$in0| points to the current (already read into
+ # registers) block, and |$end0| points to 2*96 bytes before the end of
+ # the input. Thus, |$in0| > |$end0| means that we do not have the next
+ # 96-byte block to read in, and |$in0| <= |$end0| means we do.
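+	#
+	# The selection is branchless: the xor/cmp/setnc/neg/and sequence
+	# interleaved below computes, in C terms,
+	#	%r12 = ($in0 <= $end0) ? 0x60 : 0;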
+ xor %r12,%r12
+ cmp $in0,$end0
+
+ vaesenc $T2,$inout0,$inout0
+ vmovdqu 0x30+8(%rsp),$Ii # I[4]
+ vpxor $rndkey,$inout4,$inout4
+ vpclmulqdq \$0x00,$Hkey,$Z3,$T1
+ vaesenc $T2,$inout1,$inout1
+ vpxor $rndkey,$inout5,$inout5
+ setnc %r12b
+ vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
+ vaesenc $T2,$inout2,$inout2
+ vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2
+ neg %r12
+ vaesenc $T2,$inout3,$inout3
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Z1
+ vpxor $Z0,$Xi,$Xi # modulo-scheduled
+ vaesenc $T2,$inout4,$inout4
+ vpxor $Z1,$T1,$Z0
+ and \$0x60,%r12
+ vmovups 0x20-0x80($key),$rndkey
+ vpclmulqdq \$0x10,$Hkey,$Ii,$T1
+ vaesenc $T2,$inout5,$inout5
+
+ vpclmulqdq \$0x01,$Hkey,$Ii,$T2
+ lea ($in0,%r12),$in0
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey
+ vmovdqu 0x40+8(%rsp),$Ii # I[3]
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x58($in0),%r13
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x50($in0),%r12
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x20+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x28+8(%rsp)
+ vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x30-0x80($key),$rndkey
+ vpxor $T1,$Z2,$Z2
+ vpclmulqdq \$0x00,$Z1,$Ii,$T1
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $T2,$Z2,$Z2
+ vpclmulqdq \$0x10,$Z1,$Ii,$T2
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor $Hkey,$Z3,$Z3
+ vpclmulqdq \$0x01,$Z1,$Ii,$Hkey
+ vaesenc $rndkey,$inout2,$inout2
+ vpclmulqdq \$0x11,$Z1,$Ii,$Z1
+ vmovdqu 0x50+8(%rsp),$Ii # I[2]
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor $T1,$Z0,$Z0
+ vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x40-0x80($key),$rndkey
+ vpxor $T2,$Z2,$Z2
+ vpclmulqdq \$0x00,$T1,$Ii,$T2
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Hkey,$Z2,$Z2
+ vpclmulqdq \$0x10,$T1,$Ii,$Hkey
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x48($in0),%r13
+ vpxor $Z1,$Z3,$Z3
+ vpclmulqdq \$0x01,$T1,$Ii,$Z1
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x40($in0),%r12
+ vpclmulqdq \$0x11,$T1,$Ii,$T1
+ vmovdqu 0x60+8(%rsp),$Ii # I[1]
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x30+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x38+8(%rsp)
+ vpxor $T2,$Z0,$Z0
+ vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x50-0x80($key),$rndkey
+ vpxor $Hkey,$Z2,$Z2
+ vpclmulqdq \$0x00,$T2,$Ii,$Hkey
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x10,$T2,$Ii,$Z1
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x38($in0),%r13
+ vpxor $T1,$Z3,$Z3
+ vpclmulqdq \$0x01,$T2,$Ii,$T1
+ vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0]
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x30($in0),%r12
+ vpclmulqdq \$0x11,$T2,$Ii,$T2
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x40+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x48+8(%rsp)
+ vpxor $Hkey,$Z0,$Z0
+ vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6
+ vaesenc $rndkey,$inout5,$inout5
+
+ vmovups 0x60-0x80($key),$rndkey
+ vpxor $Z1,$Z2,$Z2
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Z1
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $T1,$Z2,$Z2
+ vpclmulqdq \$0x01,$Hkey,$Xi,$T1
+ vaesenc $rndkey,$inout1,$inout1
+ movbe 0x28($in0),%r13
+ vpxor $T2,$Z3,$Z3
+ vpclmulqdq \$0x00,$Hkey,$Xi,$T2
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x20($in0),%r12
+ vpclmulqdq \$0x11,$Hkey,$Xi,$Xi
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r13,0x50+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ mov %r12,0x58+8(%rsp)
+ vpxor $Z1,$Z2,$Z2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor $T1,$Z2,$Z2
+
+ vmovups 0x70-0x80($key),$rndkey
+ vpslldq \$8,$Z2,$Z1
+ vpxor $T2,$Z0,$Z0
+ vmovdqu 0x10($const),$Hkey # .Lpoly
+
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor $Xi,$Z3,$Z3
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor $Z1,$Z0,$Z0
+ movbe 0x18($in0),%r13
+ vaesenc $rndkey,$inout2,$inout2
+ movbe 0x10($in0),%r12
+ vpalignr \$8,$Z0,$Z0,$Ii # 1st phase
+ vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
+ mov %r13,0x60+8(%rsp)
+ vaesenc $rndkey,$inout3,$inout3
+ mov %r12,0x68+8(%rsp)
+ vaesenc $rndkey,$inout4,$inout4
+ vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vmovups 0x90-0x80($key),$rndkey
+ vaesenc $T1,$inout1,$inout1
+ vpsrldq \$8,$Z2,$Z2
+ vaesenc $T1,$inout2,$inout2
+ vpxor $Z2,$Z3,$Z3
+ vaesenc $T1,$inout3,$inout3
+ vpxor $Ii,$Z0,$Z0
+ movbe 0x08($in0),%r13
+ vaesenc $T1,$inout4,$inout4
+ movbe 0x00($in0),%r12
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xa0-0x80($key),$T1
+ cmp \$11,$rounds
+ jb .Lenc_tail # 128-bit key
+
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vaesenc $T1,$inout1,$inout1
+ vaesenc $T1,$inout2,$inout2
+ vaesenc $T1,$inout3,$inout3
+ vaesenc $T1,$inout4,$inout4
+ vmovups 0xb0-0x80($key),$rndkey
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xc0-0x80($key),$T1
+ je .Lenc_tail # 192-bit key
+
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+
+ vaesenc $T1,$inout0,$inout0
+ vaesenc $T1,$inout1,$inout1
+ vaesenc $T1,$inout2,$inout2
+ vaesenc $T1,$inout3,$inout3
+ vaesenc $T1,$inout4,$inout4
+ vmovups 0xd0-0x80($key),$rndkey
+ vaesenc $T1,$inout5,$inout5
+ vmovups 0xe0-0x80($key),$T1
+ jmp .Lenc_tail # 256-bit key
+
+.align 32
+.Lhandle_ctr32:
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ vpshufb $Ii,$T1,$Z2 # byte-swap counter
+ vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
+ vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
+ vpaddd $Z1,$Z2,$inout2
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpaddd $Z1,$inout1,$inout3
+ vpshufb $Ii,$inout1,$inout1
+ vpaddd $Z1,$inout2,$inout4
+ vpshufb $Ii,$inout2,$inout2
+ vpxor $rndkey,$inout1,$inout1
+ vpaddd $Z1,$inout3,$inout5
+ vpshufb $Ii,$inout3,$inout3
+ vpxor $rndkey,$inout2,$inout2
+ vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
+ vpshufb $Ii,$inout4,$inout4
+ vpshufb $Ii,$inout5,$inout5
+ vpshufb $Ii,$T1,$T1 # next counter value
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc $rndkey,$inout0,$inout0
+ vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi
+ vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase
+ vaesenc $rndkey,$inout1,$inout1
+ vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
+ vpxor 0x00($inp),$T1,$T2
+ vaesenc $rndkey,$inout2,$inout2
+ vpxor 0x10($inp),$T1,$Ii
+ vaesenc $rndkey,$inout3,$inout3
+ vpxor 0x20($inp),$T1,$Z1
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor 0x30($inp),$T1,$Z2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor 0x40($inp),$T1,$Z3
+ vpxor 0x50($inp),$T1,$Hkey
+ vmovdqu ($ivp),$T1 # load next counter value
+
+ vaesenclast $T2,$inout0,$inout0
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ vaesenclast $Ii,$inout1,$inout1
+ vpaddb $T2,$T1,$Ii
+ mov %r13,0x70+8(%rsp)
+ lea 0x60($inp),$inp
+ vaesenclast $Z1,$inout2,$inout2
+ vpaddb $T2,$Ii,$Z1
+ mov %r12,0x78+8(%rsp)
+ lea 0x60($out),$out
+ vmovdqu 0x00-0x80($key),$rndkey
+ vaesenclast $Z2,$inout3,$inout3
+ vpaddb $T2,$Z1,$Z2
+ vaesenclast $Z3, $inout4,$inout4
+ vpaddb $T2,$Z2,$Z3
+ vaesenclast $Hkey,$inout5,$inout5
+ vpaddb $T2,$Z3,$Hkey
+
+ add \$0x60,$ret
+ sub \$0x6,$len
+ jc .L6x_done
+
+ vmovups $inout0,-0x60($out) # save output
+ vpxor $rndkey,$T1,$inout0
+ vmovups $inout1,-0x50($out)
+ vmovdqa $Ii,$inout1 # 0 latency
+ vmovups $inout2,-0x40($out)
+ vmovdqa $Z1,$inout2 # 0 latency
+ vmovups $inout3,-0x30($out)
+ vmovdqa $Z2,$inout3 # 0 latency
+ vmovups $inout4,-0x20($out)
+ vmovdqa $Z3,$inout4 # 0 latency
+ vmovups $inout5,-0x10($out)
+ vmovdqa $Hkey,$inout5 # 0 latency
+ vmovdqu 0x20+8(%rsp),$Z3 # I[5]
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled
+ vpxor $Z0,$Xi,$Xi # modulo-scheduled
+
+ ret
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+___
+######################################################################
+#
+# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
+# const AES_KEY *key, unsigned char iv[16],
+# struct { u128 Xi,H,Htbl[9]; } *Xip);
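+#
+# The return value is the number of input bytes processed, which is
+# always a multiple of 96; a hypothetical caller (C sketch, the
+# fallback name is illustrative) finishes the tail with the generic
+# gcm128.c code:
+#
+#	size_t done = aesni_gcm_encrypt(in, out, len, key, iv, Xip);
+#	if (done < len)
+#		generic_gcm_encrypt(in + done, out + done, len - done, ...);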
+$code.=<<___;
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,\@function,6
+.align 32
+aesni_gcm_decrypt:
+ xor $ret,$ret
+
+ # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
+ # bytes of input.
+ cmp \$0x60,$len # minimal accepted length
+ jb .Lgcm_dec_abort
+
+ lea (%rsp),%rax # save stack pointer
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,-0xd8(%rax)
+ movaps %xmm7,-0xc8(%rax)
+ movaps %xmm8,-0xb8(%rax)
+ movaps %xmm9,-0xa8(%rax)
+ movaps %xmm10,-0x98(%rax)
+ movaps %xmm11,-0x88(%rax)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+.Lgcm_dec_body:
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($ivp),$T1 # input counter value
+ add \$-128,%rsp
+ mov 12($ivp),$counter
+ lea .Lbswap_mask(%rip),$const
+ lea -0x80($key),$in0 # borrow $in0
+ mov \$0xf80,$end0 # borrow $end0
+ vmovdqu ($Xip),$Xi # load Xi
+ and \$-128,%rsp # ensure stack alignment
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ lea 0x80($key),$key # size optimization
+ lea 0x20+0x20($Xip),$Xip # size optimization
+ mov 0xf0-0x80($key),$rounds
+ vpshufb $Ii,$Xi,$Xi
+
+ and $end0,$in0
+ and %rsp,$end0
+ sub $in0,$end0
+ jc .Ldec_no_key_aliasing
+ cmp \$768,$end0
+ jnc .Ldec_no_key_aliasing
+ sub $end0,%rsp # avoid aliasing with key
+.Ldec_no_key_aliasing:
+
+ vmovdqu 0x50($inp),$Z3 # I[5]
+ lea ($inp),$in0
+ vmovdqu 0x40($inp),$Z0
+
+ # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
+ # bytes before the end of the input. Note, in particular, that this is
+ # correct even if |$len| is not an even multiple of 96 or 16. XXX: This
+ # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
+ # not be near the very beginning of the address space when |$len| < 2*96
+ # (0xc0).
+ lea -0xc0($inp,$len),$end0
+
+ vmovdqu 0x30($inp),$Z1
+ shr \$4,$len
+ xor $ret,$ret
+ vmovdqu 0x20($inp),$Z2
+ vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x
+ vmovdqu 0x10($inp),$T2
+ vpshufb $Ii,$Z0,$Z0
+ vmovdqu ($inp),$Hkey
+ vpshufb $Ii,$Z1,$Z1
+ vmovdqu $Z0,0x30(%rsp)
+ vpshufb $Ii,$Z2,$Z2
+ vmovdqu $Z1,0x40(%rsp)
+ vpshufb $Ii,$T2,$T2
+ vmovdqu $Z2,0x50(%rsp)
+ vpshufb $Ii,$Hkey,$Hkey
+ vmovdqu $T2,0x60(%rsp)
+ vmovdqu $Hkey,0x70(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups $inout0,-0x60($out) # save output
+ vmovups $inout1,-0x50($out)
+ vmovups $inout2,-0x40($out)
+ vmovups $inout3,-0x30($out)
+ vmovups $inout4,-0x20($out)
+ vmovups $inout5,-0x10($out)
+
+ vpshufb ($const),$Xi,$Xi # .Lbswap_mask
+ vmovdqu $Xi,-0x40($Xip) # output Xi
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lgcm_dec_abort:
+ mov $ret,%rax # return value
+ ret
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+
+$code.=<<___;
+.type _aesni_ctr32_6x,\@abi-omnipotent
+.align 32
+_aesni_ctr32_6x:
+ vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey
+ vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
+ lea -1($rounds),%r13
+ vmovups 0x10-0x80($key),$rndkey
+ lea 0x20-0x80($key),%r12
+ vpxor $Z0,$T1,$inout0
+ add \$`6<<24`,$counter
+ jc .Lhandle_ctr32_2
+ vpaddb $T2,$T1,$inout1
+ vpaddb $T2,$inout1,$inout2
+ vpxor $Z0,$inout1,$inout1
+ vpaddb $T2,$inout2,$inout3
+ vpxor $Z0,$inout2,$inout2
+ vpaddb $T2,$inout3,$inout4
+ vpxor $Z0,$inout3,$inout3
+ vpaddb $T2,$inout4,$inout5
+ vpxor $Z0,$inout4,$inout4
+ vpaddb $T2,$inout5,$T1
+ vpxor $Z0,$inout5,$inout5
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ vaesenc $rndkey,$inout0,$inout0
+ vaesenc $rndkey,$inout1,$inout1
+ vaesenc $rndkey,$inout2,$inout2
+ vaesenc $rndkey,$inout3,$inout3
+ vaesenc $rndkey,$inout4,$inout4
+ vaesenc $rndkey,$inout5,$inout5
+ vmovups (%r12),$rndkey
+ lea 0x10(%r12),%r12
+ dec %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),$Hkey # last round key
+ vaesenc $rndkey,$inout0,$inout0
+ vpxor 0x00($inp),$Hkey,$Z0
+ vaesenc $rndkey,$inout1,$inout1
+ vpxor 0x10($inp),$Hkey,$Z1
+ vaesenc $rndkey,$inout2,$inout2
+ vpxor 0x20($inp),$Hkey,$Z2
+ vaesenc $rndkey,$inout3,$inout3
+ vpxor 0x30($inp),$Hkey,$Xi
+ vaesenc $rndkey,$inout4,$inout4
+ vpxor 0x40($inp),$Hkey,$T2
+ vaesenc $rndkey,$inout5,$inout5
+ vpxor 0x50($inp),$Hkey,$Hkey
+ lea 0x60($inp),$inp
+
+ vaesenclast $Z0,$inout0,$inout0
+ vaesenclast $Z1,$inout1,$inout1
+ vaesenclast $Z2,$inout2,$inout2
+ vaesenclast $Xi,$inout3,$inout3
+ vaesenclast $T2,$inout4,$inout4
+ vaesenclast $Hkey,$inout5,$inout5
+ vmovups $inout0,0x00($out)
+ vmovups $inout1,0x10($out)
+ vmovups $inout2,0x20($out)
+ vmovups $inout3,0x30($out)
+ vmovups $inout4,0x40($out)
+ vmovups $inout5,0x50($out)
+ lea 0x60($out),$out
+
+ ret
+.align 32
+.Lhandle_ctr32_2:
+ vpshufb $Ii,$T1,$Z2 # byte-swap counter
+ vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
+ vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
+ vpaddd $Z1,$Z2,$inout2
+ vpaddd $Z1,$inout1,$inout3
+ vpshufb $Ii,$inout1,$inout1
+ vpaddd $Z1,$inout2,$inout4
+ vpshufb $Ii,$inout2,$inout2
+ vpxor $Z0,$inout1,$inout1
+ vpaddd $Z1,$inout3,$inout5
+ vpshufb $Ii,$inout3,$inout3
+ vpxor $Z0,$inout2,$inout2
+ vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
+ vpshufb $Ii,$inout4,$inout4
+ vpxor $Z0,$inout3,$inout3
+ vpshufb $Ii,$inout5,$inout5
+ vpxor $Z0,$inout4,$inout4
+ vpshufb $Ii,$T1,$T1 # next counter value
+ vpxor $Z0,$inout5,$inout5
+ jmp .Loop_ctr32
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,\@function,6
+.align 32
+aesni_gcm_encrypt:
+ xor $ret,$ret
+
+ # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
+ # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
+ # least 96 more bytes of input.
+ cmp \$0x60*3,$len # minimal accepted length
+ jb .Lgcm_enc_abort
+
+ lea (%rsp),%rax # save stack pointer
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,-0xd8(%rax)
+ movaps %xmm7,-0xc8(%rax)
+ movaps %xmm8,-0xb8(%rax)
+ movaps %xmm9,-0xa8(%rax)
+ movaps %xmm10,-0x98(%rax)
+ movaps %xmm11,-0x88(%rax)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+.Lgcm_enc_body:
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($ivp),$T1 # input counter value
+ add \$-128,%rsp
+ mov 12($ivp),$counter
+ lea .Lbswap_mask(%rip),$const
+ lea -0x80($key),$in0 # borrow $in0
+ mov \$0xf80,$end0 # borrow $end0
+ lea 0x80($key),$key # size optimization
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ and \$-128,%rsp # ensure stack alignment
+ mov 0xf0-0x80($key),$rounds
+
+ and $end0,$in0
+ and %rsp,$end0
+ sub $in0,$end0
+ jc .Lenc_no_key_aliasing
+ cmp \$768,$end0
+ jnc .Lenc_no_key_aliasing
+ sub $end0,%rsp # avoid aliasing with key
+.Lenc_no_key_aliasing:
+
+ lea ($out),$in0
+
+ # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
+ # bytes before the end of the input. Note, in particular, that this is
+ # correct even if |$len| is not an even multiple of 96 or 16. Unlike in
+ # the decryption case, there's no caveat that |$out| must not be near
+ # the very beginning of the address space, because we know that
+ # |$len| >= 3*96 from the check above, and so we know
+ # |$out| + |$len| >= 2*96 (0xc0).
+ lea -0xc0($out,$len),$end0
+
+ shr \$4,$len
+
+ call _aesni_ctr32_6x
+ vpshufb $Ii,$inout0,$Xi # save bswapped output on stack
+ vpshufb $Ii,$inout1,$T2
+ vmovdqu $Xi,0x70(%rsp)
+ vpshufb $Ii,$inout2,$Z0
+ vmovdqu $T2,0x60(%rsp)
+ vpshufb $Ii,$inout3,$Z1
+ vmovdqu $Z0,0x50(%rsp)
+ vpshufb $Ii,$inout4,$Z2
+ vmovdqu $Z1,0x40(%rsp)
+ vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x
+ vmovdqu $Z2,0x30(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu ($Xip),$Xi # load Xi
+ lea 0x20+0x20($Xip),$Xip # size optimization
+ sub \$12,$len
+ mov \$0x60*2,$ret
+ vpshufb $Ii,$Xi,$Xi
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 0x20(%rsp),$Z3 # I[5]
+ vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpunpckhqdq $Z3,$Z3,$T1
+ vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
+ vmovups $inout0,-0x60($out) # save output
+ vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy
+ vpxor $Z3,$T1,$T1
+ vmovups $inout1,-0x50($out)
+ vpshufb $Ii,$inout1,$inout1
+ vmovups $inout2,-0x40($out)
+ vpshufb $Ii,$inout2,$inout2
+ vmovups $inout3,-0x30($out)
+ vpshufb $Ii,$inout3,$inout3
+ vmovups $inout4,-0x20($out)
+ vpshufb $Ii,$inout4,$inout4
+ vmovups $inout5,-0x10($out)
+ vpshufb $Ii,$inout5,$inout5
+ vmovdqu $inout0,0x10(%rsp) # free $inout0
+___
+{ my ($HK,$T3)=($rndkey,$inout0);
+
+$code.=<<___;
+ vmovdqu 0x30(%rsp),$Z2 # I[4]
+ vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
+ vpunpckhqdq $Z2,$Z2,$T2
+ vpclmulqdq \$0x00,$Hkey,$Z3,$Z1
+ vpxor $Z2,$T2,$T2
+ vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
+ vpclmulqdq \$0x00,$HK,$T1,$T1
+
+ vmovdqu 0x40(%rsp),$T3 # I[3]
+ vpclmulqdq \$0x00,$Ii,$Z2,$Z0
+ vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $T3,$T3,$Z1
+ vpclmulqdq \$0x11,$Ii,$Z2,$Z2
+ vpxor $T3,$Z1,$Z1
+ vpxor $Z3,$Z2,$Z2
+ vpclmulqdq \$0x10,$HK,$T2,$T2
+ vmovdqu 0x50-0x20($Xip),$HK
+ vpxor $T1,$T2,$T2
+
+ vmovdqu 0x50(%rsp),$T1 # I[2]
+ vpclmulqdq \$0x00,$Hkey,$T3,$Z3
+ vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
+ vpxor $Z0,$Z3,$Z3
+ vpunpckhqdq $T1,$T1,$Z0
+ vpclmulqdq \$0x11,$Hkey,$T3,$T3
+ vpxor $T1,$Z0,$Z0
+ vpxor $Z2,$T3,$T3
+ vpclmulqdq \$0x00,$HK,$Z1,$Z1
+ vpxor $T2,$Z1,$Z1
+
+ vmovdqu 0x60(%rsp),$T2 # I[1]
+ vpclmulqdq \$0x00,$Ii,$T1,$Z2
+ vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
+ vpxor $Z3,$Z2,$Z2
+ vpunpckhqdq $T2,$T2,$Z3
+ vpclmulqdq \$0x11,$Ii,$T1,$T1
+ vpxor $T2,$Z3,$Z3
+ vpxor $T3,$T1,$T1
+ vpclmulqdq \$0x10,$HK,$Z0,$Z0
+ vmovdqu 0x80-0x20($Xip),$HK
+ vpxor $Z1,$Z0,$Z0
+
+ vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0]
+ vpclmulqdq \$0x00,$Hkey,$T2,$Z1
+ vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
+ vpunpckhqdq $Xi,$Xi,$T3
+ vpxor $Z2,$Z1,$Z1
+ vpclmulqdq \$0x11,$Hkey,$T2,$T2
+ vpxor $Xi,$T3,$T3
+ vpxor $T1,$T2,$T2
+ vpclmulqdq \$0x00,$HK,$Z3,$Z3
+ vpxor $Z0,$Z3,$Z0
+
+ vpclmulqdq \$0x00,$Ii,$Xi,$Z2
+ vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
+ vpunpckhqdq $inout5,$inout5,$T1
+ vpclmulqdq \$0x11,$Ii,$Xi,$Xi
+ vpxor $inout5,$T1,$T1
+ vpxor $Z1,$Z2,$Z1
+ vpclmulqdq \$0x10,$HK,$T3,$T3
+ vmovdqu 0x20-0x20($Xip),$HK
+ vpxor $T2,$Xi,$Z3
+ vpxor $Z0,$T3,$Z2
+
+ vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
+ vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing
+ vpclmulqdq \$0x00,$Hkey,$inout5,$Z0
+ vpxor $T3,$Z2,$Z2
+ vpunpckhqdq $inout4,$inout4,$T2
+ vpclmulqdq \$0x11,$Hkey,$inout5,$inout5
+ vpxor $inout4,$T2,$T2
+ vpslldq \$8,$Z2,$T3
+ vpclmulqdq \$0x00,$HK,$T1,$T1
+ vpxor $T3,$Z1,$Xi
+ vpsrldq \$8,$Z2,$Z2
+ vpxor $Z2,$Z3,$Z3
+
+ vpclmulqdq \$0x00,$Ii,$inout4,$Z1
+ vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
+ vpxor $Z0,$Z1,$Z1
+ vpunpckhqdq $inout3,$inout3,$T3
+ vpclmulqdq \$0x11,$Ii,$inout4,$inout4
+ vpxor $inout3,$T3,$T3
+ vpxor $inout5,$inout4,$inout4
+ vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase
+ vpclmulqdq \$0x10,$HK,$T2,$T2
+ vmovdqu 0x50-0x20($Xip),$HK
+ vpxor $T1,$T2,$T2
+
+ vpclmulqdq \$0x00,$Hkey,$inout3,$Z0
+ vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $inout2,$inout2,$T1
+ vpclmulqdq \$0x11,$Hkey,$inout3,$inout3
+ vpxor $inout2,$T1,$T1
+ vpxor $inout4,$inout3,$inout3
+ vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0
+ vpclmulqdq \$0x00,$HK,$T3,$T3
+ vpxor $T2,$T3,$T3
+
+ vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
+ vxorps $inout5,$Xi,$Xi
+
+ vpclmulqdq \$0x00,$Ii,$inout2,$Z1
+ vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
+ vpxor $Z0,$Z1,$Z1
+ vpunpckhqdq $inout1,$inout1,$T2
+ vpclmulqdq \$0x11,$Ii,$inout2,$inout2
+ vpxor $inout1,$T2,$T2
+ vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase
+ vpxor $inout3,$inout2,$inout2
+ vpclmulqdq \$0x10,$HK,$T1,$T1
+ vmovdqu 0x80-0x20($Xip),$HK
+ vpxor $T3,$T1,$T1
+
+ vxorps $Z3,$inout5,$inout5
+ vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
+ vxorps $inout5,$Xi,$Xi
+
+ vpclmulqdq \$0x00,$Hkey,$inout1,$Z0
+ vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
+ vpxor $Z1,$Z0,$Z0
+ vpunpckhqdq $Xi,$Xi,$T3
+ vpclmulqdq \$0x11,$Hkey,$inout1,$inout1
+ vpxor $Xi,$T3,$T3
+ vpxor $inout2,$inout1,$inout1
+ vpclmulqdq \$0x00,$HK,$T2,$T2
+ vpxor $T1,$T2,$T2
+
+ vpclmulqdq \$0x00,$Ii,$Xi,$Z1
+ vpclmulqdq \$0x11,$Ii,$Xi,$Z3
+ vpxor $Z0,$Z1,$Z1
+ vpclmulqdq \$0x10,$HK,$T3,$Z2
+ vpxor $inout1,$Z3,$Z3
+ vpxor $T2,$Z2,$Z2
+
+ vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing
+ vpxor $Z0,$Z2,$Z2
+ vpslldq \$8,$Z2,$T1
+ vmovdqu 0x10($const),$Hkey # .Lpoly
+ vpsrldq \$8,$Z2,$Z2
+ vpxor $T1,$Z1,$Xi
+ vpxor $Z2,$Z3,$Z3
+
+ vpalignr \$8,$Xi,$Xi,$T2 # 1st phase
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase
+ vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
+ vpxor $Z3,$T2,$T2
+ vpxor $T2,$Xi,$Xi
+___
+}
+$code.=<<___;
+ vpshufb ($const),$Xi,$Xi # .Lbswap_mask
+ vmovdqu $Xi,-0x40($Xip) # output Xi
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lgcm_enc_abort:
+ mov $ret,%rax # return value
+ ret
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+___
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+ .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___
+.extern __imp_RtlVirtualUnwind
+.type gcm_se_handler,\@abi-omnipotent
+.align 16
+gcm_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 120($context),%rax # pull context->Rax
+
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ mov %r15,240($context)
+ mov %r14,232($context)
+ mov %r13,224($context)
+ mov %r12,216($context)
+ mov %rbp,160($context)
+ mov %rbx,144($context)
+
+ lea -0xd8(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size gcm_se_handler,.-gcm_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_gcm_decrypt
+ .rva .LSEH_end_aesni_gcm_decrypt
+ .rva .LSEH_gcm_dec_info
+
+ .rva .LSEH_begin_aesni_gcm_encrypt
+ .rva .LSEH_end_aesni_gcm_encrypt
+ .rva .LSEH_gcm_enc_info
+.section .xdata
+.align 8
+.LSEH_gcm_dec_info:
+ .byte 9,0,0,0
+ .rva gcm_se_handler
+ .rva .Lgcm_dec_body,.Lgcm_dec_abort
+.LSEH_gcm_enc_info:
+ .byte 9,0,0,0
+ .rva gcm_se_handler
+ .rva .Lgcm_enc_body,.Lgcm_enc_abort
+___
+}
+}}} else {{{
+$code=<<___; # assembler is too old
+.text
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,\@abi-omnipotent
+aesni_gcm_encrypt:
+ xor %eax,%eax
+ ret
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,\@abi-omnipotent
+aesni_gcm_decrypt:
+ xor %eax,%eax
+ ret
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+}}}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-alpha.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-alpha.pl
new file mode 100644
index 0000000..ccf6b2b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/ghash-alpha.pl
@@ -0,0 +1,467 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Even though
+# loops are aggressively modulo-scheduled with respect to references to
+# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
+# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
+# scheduling "glitch," because uprofile(1) indicates uniform sample
+# distribution, as if all instruction bundles execute in 1.5 cycles.
+# Meaning that it could have been even faster, yet 12 cycles is ~60%
+# better than gcc-generated code and ~80% better than code generated
+# by the vendor compiler.
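+#
+# For reference, the "4-bit" algorithm being unrolled is, in simplified
+# C, roughly the following (a sketch derived from the generic
+# gcm_gmult_4bit in gcm128.c; rem_4bit entries are pre-shifted into the
+# top 16 bits, as in the table at the end of this file):
+#
+#	nlo = Xi[15] & 0xf;  nhi = Xi[15] >> 4;
+#	Z   = Htable[nlo];
+#	for (cnt = 15;;) {
+#		rem  = Z.lo & 0xf;
+#		Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+#		Z.hi = (Z.hi >> 4) ^ rem_4bit[rem] ^ Htable[nhi].hi;
+#		Z.lo ^= Htable[nhi].lo;
+#		if (--cnt < 0) break;
+#		nlo  = Xi[cnt] & 0xf;  nhi = Xi[cnt] >> 4;
+#		rem  = Z.lo & 0xf;
+#		Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+#		Z.hi = (Z.hi >> 4) ^ rem_4bit[rem] ^ Htable[nlo].hi;
+#		Z.lo ^= Htable[nlo].lo;
+#	}
+#	Xi = byte-swapped Z;	/* the srl/zapnot dance below */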
+
+$cnt="v0"; # $0
+$t0="t0";
+$t1="t1";
+$t2="t2";
+$Thi0="t3"; # $4
+$Tlo0="t4";
+$Thi1="t5";
+$Tlo1="t6";
+$rem="t7"; # $8
+#################
+$Xi="a0"; # $16, input argument block
+$Htbl="a1";
+$inp="a2";
+$len="a3";
+$nlo="a4"; # $20
+$nhi="a5";
+$Zhi="t8";
+$Zlo="t9";
+$Xhi="t10"; # $24
+$Xlo="t11";
+$remp="t12";
+$rem_4bit="AT"; # $28
+
+{ my $N;
+ sub loop() {
+
+ $N++;
+$code.=<<___;
+.align 4
+ extbl $Xlo,7,$nlo
+ and $nlo,0xf0,$nhi
+ sll $nlo,4,$nlo
+ and $nlo,0xf0,$nlo
+
+ addq $nlo,$Htbl,$nlo
+ ldq $Zlo,8($nlo)
+ addq $nhi,$Htbl,$nhi
+ ldq $Zhi,0($nlo)
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ lda $cnt,6(zero)
+ extbl $Xlo,6,$nlo
+
+ ldq $Tlo1,8($nhi)
+ s8addq $remp,$rem_4bit,$remp
+ ldq $Thi1,0($nhi)
+ srl $Zlo,4,$Zlo
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ and $nlo,0xf0,$nhi
+
+ xor $Tlo1,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+ xor $Thi1,$Zhi,$Zhi
+ and $nlo,0xf0,$nlo
+
+ addq $nlo,$Htbl,$nlo
+ ldq $Tlo0,8($nlo)
+ addq $nhi,$Htbl,$nhi
+ ldq $Thi0,0($nlo)
+
+.Looplo$N:
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ subq $cnt,1,$cnt
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xlo,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ bne $cnt,.Looplo$N
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ lda $cnt,7(zero)
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xhi,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ unop
+
+
+.Loophi$N:
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ subq $cnt,1,$cnt
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xhi,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ bne $cnt,.Loophi$N
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo0,$Zlo,$Zlo
+ xor $Thi0,$Zhi,$Zhi
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ xor $t0,$Zlo,$Zlo
+ xor $rem,$Zhi,$Zhi
+___
+}}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.globl gcm_gmult_4bit
+.align 4
+.ent gcm_gmult_4bit
+gcm_gmult_4bit:
+ .frame sp,0,ra
+ .prologue 0
+
+ ldq $Xlo,8($Xi)
+ ldq $Xhi,0($Xi)
+
+ bsr $t0,picmeup
+ nop
+___
+
+ &loop();
+
+$code.=<<___;
+ srl $Zlo,24,$t0 # byte swap
+ srl $Zlo,8,$t1
+
+ sll $Zlo,8,$t2
+ sll $Zlo,24,$Zlo
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+
+ zapnot $Zlo,0x88,$Zlo
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zlo,$t0,$Zlo
+ srl $Zhi,24,$t0
+ srl $Zhi,8,$t1
+
+ or $Zlo,$t2,$Zlo
+ sll $Zhi,8,$t2
+ sll $Zhi,24,$Zhi
+
+ srl $Zlo,32,$Xlo
+ sll $Zlo,32,$Zlo
+
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+ stq $Xlo,8($Xi)
+ stq $Xhi,0($Xi)
+
+ ret (ra)
+.end gcm_gmult_4bit
+___
+
+$inhi="s0";
+$inlo="s1";
+
+$code.=<<___;
+.globl gcm_ghash_4bit
+.align 4
+.ent gcm_ghash_4bit
+gcm_ghash_4bit:
+ lda sp,-32(sp)
+ stq ra,0(sp)
+ stq s0,8(sp)
+ stq s1,16(sp)
+ .mask 0x04000600,-32
+ .frame sp,32,ra
+ .prologue 0
+
+ ldq_u $inhi,0($inp)
+ ldq_u $Thi0,7($inp)
+ ldq_u $inlo,8($inp)
+ ldq_u $Tlo0,15($inp)
+ ldq $Xhi,0($Xi)
+ ldq $Xlo,8($Xi)
+
+ bsr $t0,picmeup
+ nop
+
+.Louter:
+ extql $inhi,$inp,$inhi
+ extqh $Thi0,$inp,$Thi0
+ or $inhi,$Thi0,$inhi
+ lda $inp,16($inp)
+
+ extql $inlo,$inp,$inlo
+ extqh $Tlo0,$inp,$Tlo0
+ or $inlo,$Tlo0,$inlo
+ subq $len,16,$len
+
+ xor $Xlo,$inlo,$Xlo
+ xor $Xhi,$inhi,$Xhi
+___
+
+ &loop();
+
+$code.=<<___;
+ srl $Zlo,24,$t0 # byte swap
+ srl $Zlo,8,$t1
+
+ sll $Zlo,8,$t2
+ sll $Zlo,24,$Zlo
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+
+ zapnot $Zlo,0x88,$Zlo
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zlo,$t0,$Zlo
+ srl $Zhi,24,$t0
+ srl $Zhi,8,$t1
+
+ or $Zlo,$t2,$Zlo
+ sll $Zhi,8,$t2
+ sll $Zhi,24,$Zhi
+
+ srl $Zlo,32,$Xlo
+ sll $Zlo,32,$Zlo
+ beq $len,.Ldone
+
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+ ldq_u $inhi,0($inp)
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+ ldq_u $Thi0,7($inp)
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+ ldq_u $inlo,8($inp)
+ ldq_u $Tlo0,15($inp)
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+ br zero,.Louter
+
+.Ldone:
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+
+ stq $Xlo,8($Xi)
+ stq $Xhi,0($Xi)
+
+ .set noreorder
+ /*ldq ra,0(sp)*/
+ ldq s0,8(sp)
+ ldq s1,16(sp)
+ lda sp,32(sp)
+ ret (ra)
+.end gcm_ghash_4bit
+
+.align 4
+.ent picmeup
+picmeup:
+ .frame sp,0,$t0
+ .prologue 0
+ br $rem_4bit,.Lpic
+.Lpic: lda $rem_4bit,12($rem_4bit)
+ ret ($t0)
+.end picmeup
+ nop
+rem_4bit:
+ .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
+ .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
+ .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
+ .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
+.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+
+___
+$output=pop and open STDOUT,">$output";
+print $code;
+close STDOUT;
+
diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-armv4.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-armv4.pl
new file mode 100644
index 0000000..7d880c9
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/ghash-armv4.pl
@@ -0,0 +1,554 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+32 bytes shared table]. There is no
+# experimental performance data available yet. The only approximation
+# that can be made at this point is based on code size. Inner loop is
+# 32 instructions long and on single-issue core should execute in <40
+# cycles. Having verified that gcc 3.4 didn't unroll corresponding
+# loop, this assembler loop body was found to be ~3x smaller than
+# compiler-generated one...
+#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
+# February 2011
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~23.5 cycles per byte.
+#
+# March 2011
+#
+# Add NEON implementation featuring polynomial multiplication, i.e. no
+# lookup tables involved. On Cortex A8 it was measured to process one
+# byte in 15 cycles or 55% faster than integer-only code.
+#
+# April 2014
+#
+# Switch to multiplication algorithm suggested in paper referred
+# below and combine it with reduction algorithm from x86 module.
+# Performance improvement over previous version varies from 65% on
+# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
+# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
+# Snapdragon S4 - in 9.33.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
+
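+# The multiplication itself is done Karatsuba-style over GF(2)[x]: with
+# Xi = Xh<<64|Xl and H = Hh<<64|Hl, three 64x64 carry-less products
+# suffice (all additions being xors):
+#
+#	Xi*H = Xl*Hl ^ (Xh*Hh)<<128
+#	     ^ ((Xl^Xh)*(Hl^Hh) ^ Xl*Hl ^ Xh*Hh)<<64
+#
+# and each 64x64 product is in turn synthesized from eight vmull.p8
+# (8x8-bit polynomial multiply) instructions in clmul64x64() below.
+#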
+# ====================================================================
+# Note about "528B" variant. In ARM case it makes lesser sense to
+# implement it for following reasons:
+#
+# - performance improvement won't be anywhere near 50%, because 128-
+# bit shift operation is neatly fused with 128-bit xor here, and
+# "538B" variant would eliminate only 4-5 instructions out of 32
+# in the inner loop (meaning that estimated improvement is ~15%);
+# - ARM-based systems are often embedded ones and extra memory
+# consumption might be unappreciated (for so little improvement);
+#
+# Byte order [in]dependence. =========================================
+#
+# Caller is expected to maintain specific *dword* order in Htable,
+# namely with *least* significant dword of 128-bit value at *lower*
+# address. This differs completely from C code and has everything to
+# do with ldm instruction and order in which dwords are "consumed" by
+# algorithm. *Byte* order within these dwords in turn is whatever
+# *native* byte order on current platform. See gcm128.c for working
+# example...
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+$Xi="r0"; # argument block
+$Htbl="r1";
+$inp="r2";
+$len="r3";
+
+$Zll="r4"; # variables
+$Zlh="r5";
+$Zhl="r6";
+$Zhh="r7";
+$Tll="r8";
+$Tlh="r9";
+$Thl="r10";
+$Thh="r11";
+$nlo="r12";
+################# r13 is stack pointer
+$nhi="r14";
+################# r15 is program counter
+
+$rem_4bit=$inp; # used in gcm_gmult_4bit
+$cnt=$len;
+
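+# Zsmash() emits the code that stores Zhh:Zhl:Zlh:Zll back to Xi[0..15]
+# in big-endian byte order: rev+str on little-endian ARMv7+, a plain
+# str on big-endian targets, and four strb's per word otherwise. Any
+# arguments are extra instructions scheduled one per store slot.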
+sub Zsmash() {
+ my $i=12;
+ my @args=@_;
+ for ($Zll,$Zlh,$Zhl,$Zhh) {
+ $code.=<<___;
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev $_,$_
+ str $_,[$Xi,#$i]
+#elif defined(__ARMEB__)
+ str $_,[$Xi,#$i]
+#else
+ mov $Tlh,$_,lsr#8
+ strb $_,[$Xi,#$i+3]
+ mov $Thl,$_,lsr#16
+ strb $Tlh,[$Xi,#$i+2]
+ mov $Thh,$_,lsr#24
+ strb $Thl,[$Xi,#$i+1]
+ strb $Thh,[$Xi,#$i]
+#endif
+___
+ $code.="\t".shift(@args)."\n";
+ $i-=4;
+ }
+}
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+#ifdef __clang__
+#define ldrplb ldrbpl
+#define ldrneb ldrbne
+#endif
+
+.type rem_4bit,%object
+.align 5
+rem_4bit:
+.short 0x0000,0x1C20,0x3840,0x2460
+.short 0x7080,0x6CA0,0x48C0,0x54E0
+.short 0xE100,0xFD20,0xD940,0xC560
+.short 0x9180,0x8DA0,0xA9C0,0xB5E0
+.size rem_4bit,.-rem_4bit
+
+.type rem_4bit_get,%function
+rem_4bit_get:
+#if defined(__thumb2__)
+ adr $rem_4bit,rem_4bit
+#else
+ sub $rem_4bit,pc,#8+32 @ &rem_4bit
+#endif
+ b .Lrem_4bit_got
+ nop
+ nop
+.size rem_4bit_get,.-rem_4bit_get
+
+.global gcm_ghash_4bit
+.type gcm_ghash_4bit,%function
+.align 4
+gcm_ghash_4bit:
+#if defined(__thumb2__)
+ adr r12,rem_4bit
+#else
+ sub r12,pc,#8+48 @ &rem_4bit
+#endif
+ add $len,$inp,$len @ $len to point at the end
+ stmdb sp!,{r3-r11,lr} @ save $len/end too
+
+ ldmia r12,{r4-r11} @ copy rem_4bit ...
+ stmdb sp!,{r4-r11} @ ... to stack
+
+ ldrb $nlo,[$inp,#15]
+ ldrb $nhi,[$Xi,#15]
+.Louter:
+ eor $nlo,$nlo,$nhi
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ mov $cnt,#14
+
+ add $Zhh,$Htbl,$nlo,lsl#4
+ ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
+ add $Thh,$Htbl,$nhi
+ ldrb $nlo,[$inp,#14]
+
+ and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ add $nhi,$nhi,$nhi
+ eor $Zll,$Tll,$Zll,lsr#4
+ ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ ldrb $nhi,[$Xi,#14]
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ eor $nlo,$nlo,$nhi
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16
+
+.Linner:
+ add $Thh,$Htbl,$nlo,lsl#4
+ and $nlo,$Zll,#0xf @ rem
+ subs $cnt,$cnt,#1
+ add $nlo,$nlo,$nlo
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
+ eor $Zhl,$Thl,$Zhl,lsr#4
+#ifdef __thumb2__
+ it pl
+#endif
+ ldrplb $nlo,[$inp,$cnt]
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ add $nhi,$nhi,$nhi
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ eor $Zll,$Tll,$Zll,lsr#4
+#ifdef __thumb2__
+ it pl
+#endif
+ ldrplb $Tll,[$Xi,$cnt]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ ldrh $Tlh,[sp,$nhi]
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+#ifdef __thumb2__
+ it pl
+#endif
+ eorpl $nlo,$nlo,$Tll
+ eor $Zhh,$Thh,$Zhh,lsr#4
+#ifdef __thumb2__
+ itt pl
+#endif
+ andpl $nhi,$nlo,#0xf0
+ andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Linner
+
+ ldr $len,[sp,#32] @ re-load $len/end
+ add $inp,$inp,#16
+ mov $nhi,$Zll
+___
+ &Zsmash("cmp\t$inp,$len","\n".
+ "#ifdef __thumb2__\n".
+ " it ne\n".
+ "#endif\n".
+ " ldrneb $nlo,[$inp,#15]");
+$code.=<<___;
+ bne .Louter
+
+ add sp,sp,#36
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global gcm_gmult_4bit
+.type gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+ stmdb sp!,{r4-r11,lr}
+ ldrb $nlo,[$Xi,#15]
+ b rem_4bit_get
+.Lrem_4bit_got:
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ mov $cnt,#14
+
+ add $Zhh,$Htbl,$nlo,lsl#4
+ ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
+ ldrb $nlo,[$Xi,#14]
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ add $nhi,$nhi,$nhi
+ eor $Zll,$Tll,$Zll,lsr#4
+ ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ and $nhi,$nlo,#0xf0
+ eor $Zhh,$Zhh,$Tll,lsl#16
+ and $nlo,$nlo,#0x0f
+
+.Loop:
+ add $Thh,$Htbl,$nlo,lsl#4
+ and $nlo,$Zll,#0xf @ rem
+ subs $cnt,$cnt,#1
+ add $nlo,$nlo,$nlo
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
+ eor $Zhl,$Thl,$Zhl,lsr#4
+#ifdef __thumb2__
+ it pl
+#endif
+ ldrplb $nlo,[$Xi,$cnt]
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ add $nhi,$nhi,$nhi
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+#ifdef __thumb2__
+ itt pl
+#endif
+ andpl $nhi,$nlo,#0xf0
+ andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Loop
+___
+ &Zsmash();
+$code.=<<___;
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
+my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
+
+sub clmul64x64 {
+my ($r,$a,$b)=@_;
+$code.=<<___;
+ vext.8 $t0#lo, $a, $a, #1 @ A1
+ vmull.p8 $t0, $t0#lo, $b @ F = A1*B
+ vext.8 $r#lo, $b, $b, #1 @ B1
+ vmull.p8 $r, $a, $r#lo @ E = A*B1
+ vext.8 $t1#lo, $a, $a, #2 @ A2
+ vmull.p8 $t1, $t1#lo, $b @ H = A2*B
+ vext.8 $t3#lo, $b, $b, #2 @ B2
+ vmull.p8 $t3, $a, $t3#lo @ G = A*B2
+ vext.8 $t2#lo, $a, $a, #3 @ A3
+ veor $t0, $t0, $r @ L = E + F
+ vmull.p8 $t2, $t2#lo, $b @ J = A3*B
+ vext.8 $r#lo, $b, $b, #3 @ B3
+ veor $t1, $t1, $t3 @ M = G + H
+ vmull.p8 $r, $a, $r#lo @ I = A*B3
+ veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
+ vand $t0#hi, $t0#hi, $k48
+ vext.8 $t3#lo, $b, $b, #4 @ B4
+ veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
+ vand $t1#hi, $t1#hi, $k32
+ vmull.p8 $t3, $a, $t3#lo @ K = A*B4
+ veor $t2, $t2, $r @ N = I + J
+ veor $t0#lo, $t0#lo, $t0#hi
+ veor $t1#lo, $t1#lo, $t1#hi
+ veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
+ vand $t2#hi, $t2#hi, $k16
+ vext.8 $t0, $t0, $t0, #15
+ veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 $t3#hi, #0
+ vext.8 $t1, $t1, $t1, #14
+ veor $t2#lo, $t2#lo, $t2#hi
+ vmull.p8 $r, $a, $b @ D = A*B
+ vext.8 $t3, $t3, $t3, #12
+ vext.8 $t2, $t2, $t2, #13
+ veor $t0, $t0, $t1
+ veor $t2, $t2, $t3
+ veor $r, $r, $t0
+ veor $r, $r, $t2
+___
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ vld1.64 $IN#hi,[r1]! @ load H
+ vmov.i8 $t0,#0xe1
+ vld1.64 $IN#lo,[r1]
+ vshl.i64 $t0#hi,#57
+ vshr.u64 $t0#lo,#63 @ t0=0xc2....01
+ vdup.8 $t1,$IN#hi[7]
+ vshr.u64 $Hlo,$IN#lo,#63
+ vshr.s8 $t1,#7 @ broadcast carry bit
+ vshl.i64 $IN,$IN,#1
+ vand $t0,$t0,$t1
+ vorr $IN#hi,$Hlo @ H<<<=1
+ veor $IN,$IN,$t0 @ twisted H
+ vstmia r0,{$IN}
+
+ ret @ bx lr
+.size gcm_init_neon,.-gcm_init_neon
+
+.global gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ vld1.64 $IN#hi,[$Xi]! @ load Xi
+ vld1.64 $IN#lo,[$Xi]!
+ vmov.i64 $k48,#0x0000ffffffffffff
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
+ vmov.i64 $k32,#0x00000000ffffffff
+#ifdef __ARMEL__
+ vrev64.8 $IN,$IN
+#endif
+ vmov.i64 $k16,#0x000000000000ffff
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
+ mov $len,#16
+ b .Lgmult_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.global gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ vld1.64 $Xl#hi,[$Xi]! @ load Xi
+ vld1.64 $Xl#lo,[$Xi]!
+ vmov.i64 $k48,#0x0000ffffffffffff
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
+ vmov.i64 $k32,#0x00000000ffffffff
+#ifdef __ARMEL__
+ vrev64.8 $Xl,$Xl
+#endif
+ vmov.i64 $k16,#0x000000000000ffff
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
+
+.Loop_neon:
+ vld1.64 $IN#hi,[$inp]! @ load inp
+ vld1.64 $IN#lo,[$inp]!
+#ifdef __ARMEL__
+ vrev64.8 $IN,$IN
+#endif
+ veor $IN,$Xl @ inp^=Xi
+.Lgmult_neon:
+___
+ &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
+$code.=<<___;
+ veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
+___
+ &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
+ &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
+$code.=<<___;
+ veor $Xm,$Xm,$Xl @ Karatsuba post-processing
+ veor $Xm,$Xm,$Xh
+ veor $Xl#hi,$Xl#hi,$Xm#lo
+ veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
+
+ @ equivalent of reduction_avx from ghash-x86_64.pl
+ vshl.i64 $t1,$Xl,#57 @ 1st phase
+ vshl.i64 $t2,$Xl,#62
+ veor $t2,$t2,$t1 @
+ vshl.i64 $t1,$Xl,#63
+ veor $t2, $t2, $t1 @
+ veor $Xl#hi,$Xl#hi,$t2#lo @
+ veor $Xh#lo,$Xh#lo,$t2#hi
+
+ vshr.u64 $t2,$Xl,#1 @ 2nd phase
+ veor $Xh,$Xh,$Xl
+ veor $Xl,$Xl,$t2 @
+ vshr.u64 $t2,$t2,#6
+ vshr.u64 $Xl,$Xl,#1 @
+ veor $Xl,$Xl,$Xh @
+ veor $Xl,$Xl,$t2 @
+
+ subs $len,#16
+ bne .Loop_neon
+
+#ifdef __ARMEL__
+ vrev64.8 $Xl,$Xl
+#endif
+ sub $Xi,#16
+ vst1.64 $Xl#hi,[$Xi]! @ write out Xi
+ vst1.64 $Xl#lo,[$Xi]
+
+ ret @ bx lr
+.size gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+close STDOUT; # enforce flush
diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-c64xplus.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-c64xplus.pl
new file mode 100644
index 0000000..3cadda3
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/ghash-c64xplus.pl
@@ -0,0 +1,247 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2011
+#
+# The module implements the GCM GHASH function and the underlying
+# single multiplication operation in GF(2^128). Even though the
+# subroutines have a _4bit suffix, they do not use any tables, but
+# rely on hardware Galois Field Multiply support. Streamed GHASH
+# processes one byte in ~7 cycles, which is >6x faster than "4-bit"
+# table-driven code compiled with TI's cl6x 6.0 with -mv6400+ -o2
+# flags. We are comparing apples vs. oranges, but the compiler surely
+# could have done better, because the theoretical [though not
+# necessarily achievable] estimate for a "4-bit" table-driven
+# implementation is ~12 cycles.
+
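+# For reference, the core operation implemented here is the
+# multiplication Xi·H in GF(2^128) modulo x^128+x^7+x^2+x+1 (GHASH
+# additionally folds each input block in with Xi ^= inp first). Below
+# is a minimal bit-serial model of that multiplication; a hypothetical
+# helper for illustration only, never called by this generator, and it
+# assumes 64-bit Perl. Operands are big-endian 64-bit halves.
+
+sub _gf128_mul_ref {
+	my ($Xhi,$Xlo,$Hhi,$Hlo) = @_;
+	my ($Zhi,$Zlo) = (0,0);
+	for my $i (0..127) {			# scan X from its MSB down
+	    my $bit = $i<64 ? ($Xhi>>(63-$i))&1 : ($Xlo>>(127-$i))&1;
+	    ($Zhi,$Zlo) = ($Zhi^$Hhi,$Zlo^$Hlo) if ($bit);
+	    my $carry = $Hlo & 1;		# V>>=1, reducing by the
+	    $Hlo = ($Hlo>>1) | (($Hhi&1)<<63);	# polynomial whenever a
+	    $Hhi = ($Hhi>>1) ^ ($carry ? 0xe100000000000000 : 0); # bit falls off
+	}
+	return ($Zhi,$Zlo);
+}
+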
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # arguments
+
+($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3,
+ $H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
+($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y,
+ $H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
+($FF000000,$E10000)=("B30","B31");
+($xip,$x0,$x1,$xib)=map("B$_",(6..9)); # $xip zaps $len
+ $xia="A9";
+($rem,$res)=("B4","B5"); # $rem zaps $Htable
+
+$code.=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .asg gcm_gmult_1bit,_gcm_gmult_1bit
+ .asg gcm_gmult_4bit,_gcm_gmult_4bit
+ .asg gcm_ghash_4bit,_gcm_ghash_4bit
+ .endif
+
+ .asg B3,RA
+
+ .if 0
+ .global _gcm_gmult_1bit
+_gcm_gmult_1bit:
+ ADDAD $Htable,2,$Htable
+ .endif
+ .global _gcm_gmult_4bit
+_gcm_gmult_4bit:
+ .asmfunc
+ LDDW *${Htable}[-1],$H1:$H0 ; H.lo
+ LDDW *${Htable}[-2],$H3:$H2 ; H.hi
+|| MV $Xip,${xip} ; reassign Xi
+|| MVK 15,B1 ; SPLOOPD constant
+
+ MVK 0xE1,$E10000
+|| LDBU *++${xip}[15],$x1 ; Xi[15]
+ MVK 0xFF,$FF000000
+|| LDBU *--${xip},$x0 ; Xi[14]
+ SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
+ SHL $FF000000,24,$FF000000 ; upper byte mask
+|| BNOP ghash_loop?
+|| MVK 1,B0 ; take a single spin
+
+ PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
+ AND $H2,$FF000000,$H2u ; H2's upper byte
+ AND $H3,$FF000000,$H3u ; H3's upper byte
+|| SHRU $H2u,8,$H2u
+ SHRU $H3u,8,$H3u
+|| ZERO $Z1:$Z0
+ SHRU2 $xia,8,$H01u
+|| ZERO $Z3:$Z2
+ .endasmfunc
+
+ .global _gcm_ghash_4bit
+_gcm_ghash_4bit:
+ .asmfunc
+ LDDW *${Htable}[-1],$H1:$H0 ; H.lo
+|| SHRU $len,4,B0 ; reassign len
+ LDDW *${Htable}[-2],$H3:$H2 ; H.hi
+|| MV $Xip,${xip} ; reassign Xi
+|| MVK 15,B1 ; SPLOOPD constant
+
+ MVK 0xE1,$E10000
+|| [B0] LDNDW *${inp}[1],$H1x:$H0x
+ MVK 0xFF,$FF000000
+|| [B0] LDNDW *${inp}++[2],$H3x:$H2x
+ SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
+|| LDDW *${xip}[1],$Z1:$Z0
+ SHL $FF000000,24,$FF000000 ; upper byte mask
+|| LDDW *${xip}[0],$Z3:$Z2
+
+ PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
+ AND $H2,$FF000000,$H2u ; H2's upper byte
+ AND $H3,$FF000000,$H3u ; H3's upper byte
+|| SHRU $H2u,8,$H2u
+ SHRU $H3u,8,$H3u
+ SHRU2 $xia,8,$H01u
+
+|| [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
+|| [B0] XOR $H1x,$Z1,$Z1
+ .if .LITTLE_ENDIAN
+ [B0] XOR $H2x,$Z2,$Z2
+|| [B0] XOR $H3x,$Z3,$Z3
+|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
+ STDW $Z1:$Z0,*${xip}[1]
+|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
+|| [B0] ZERO $Z1:$Z0
+ .else
+ [B0] XOR $H2x,$Z2,$Z2
+|| [B0] XOR $H3x,$Z3,$Z3
+|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
+ STDW $Z1:$Z0,*${xip}[1]
+|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
+|| [B0] ZERO $Z1:$Z0
+ .endif
+ STDW $Z3:$Z2,*${xip}[0]
+|| [B0] ZERO $Z3:$Z2
+|| [B0] MV $xia,$x1
+ [B0] ADDK 14,${xip}
+
+ghash_loop?:
+ SPLOOPD 6 ; 6*16+7
+|| MVC B1,ILC
+|| [B0] SUB B0,1,B0
+|| ZERO A0
+|| ADD $x1,$x1,$xib ; SHL $x1,1,$xib
+|| SHL $x1,1,$xia
+___
+
+########____________________________
+# 0 D2. M1 M2 |
+# 1 M1 |
+# 2 M1 M2 |
+# 3 D1. M1 M2 |
+# 4 S1. L1 |
+# 5 S2 S1x L1 D2 L2 |____________________________
+# 6/0 L1 S1 L2 S2x |D2. M1 M2 |
+# 7/1 L1 S1 D1x S2 M2 | M1 |
+# 8/2 S1 L1x S2 | M1 M2 |
+# 9/3 S1 L1x | D1. M1 M2 |
+# 10/4 D1x | S1. L1 |
+# 11/5 |S2 S1x L1 D2 L2 |____________
+# 12/6/0 D1x __| L1 S1 L2 S2x |D2. ....
+# 7/1 L1 S1 D1x S2 M2 | ....
+# 8/2 S1 L1x S2 | ....
+#####... ................|............
+$code.=<<___;
+ XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
+|| XORMPY $H01u,$xib,$H01y
+|| [A0] LDBU *--${xip},$x0
+ XORMPY $H1,$xia,$H1x ; 1
+ XORMPY $H2,$xia,$H2x ; 2
+|| XORMPY $H2u,$xib,$H2y
+ XORMPY $H3,$xia,$H3x ; 3
+|| XORMPY $H3u,$xib,$H3y
+||[!A0] MVK.D 15,A0 ; *--${xip} counter
+ XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
+|| [A0] SUB.S A0,1,A0
+ XOR.L $H1x,$Z1,$Z1 ; 5
+|| AND.D $H01y,$FF000000,$H0z
+|| SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y
+|| SHL $x0,1,$xib
+|| SHL $x0,1,$xia
+
+ XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue
+|| SHL $Z0,1,$rem ; ; rem=Z<<1
+|| SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8
+|| AND.L $H1y,$FF000000,$H1z
+ XOR.L $H3x,$Z3,$Z3 ; 7/1
+|| SHRMB.S $Z2,$Z1,$Z1
+|| XOR.D $H0z,$Z0,$Z0 ; merge upper byte products
+|| AND.S $H2y,$FF000000,$H2z
+|| XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE
+ XOR.L $H1z,$Z1,$Z1 ; 8/2
+|| SHRMB.S $Z3,$Z2,$Z2
+|| AND.S $H3y,$FF000000,$H3z
+ XOR.L $H2z,$Z2,$Z2 ; 9/3
+|| SHRU $Z3,8,$Z3
+ XOR.D $H3z,$Z3,$Z3 ; 10/4
+ NOP ; 11/5
+
+ SPKERNEL 0,2
+|| XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res
+
+ ; input pre-fetch is possible where D1 slot is available...
+ [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/-
+ [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/-
+ NOP ; 10/-
+ .if .LITTLE_ENDIAN
+ SWAP2 $Z0,$Z1 ; 11/-
+|| SWAP4 $Z1,$Z0
+ SWAP4 $Z1,$Z1 ; 12/-
+|| SWAP2 $Z0,$Z0
+ SWAP2 $Z2,$Z3
+|| SWAP4 $Z3,$Z2
+||[!B0] BNOP RA
+ SWAP4 $Z3,$Z3
+|| SWAP2 $Z2,$Z2
+|| [B0] BNOP ghash_loop?
+ [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
+|| [B0] XOR $H1x,$Z1,$Z1
+ [B0] XOR $H2x,$Z2,$Z2
+|| [B0] XOR $H3x,$Z3,$Z3
+|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
+ STDW $Z1:$Z0,*${xip}[1]
+|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
+|| [B0] ZERO $Z1:$Z0
+ .else
+ [!B0] BNOP RA ; 11/-
+ [B0] BNOP ghash_loop? ; 12/-
+ [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
+|| [B0] XOR $H1x,$Z1,$Z1
+ [B0] XOR $H2x,$Z2,$Z2
+|| [B0] XOR $H3x,$Z3,$Z3
+|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
+ STDW $Z1:$Z0,*${xip}[1]
+|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
+|| [B0] ZERO $Z1:$Z0
+ .endif
+ STDW $Z3:$Z2,*${xip}[0]
+|| [B0] ZERO $Z3:$Z2
+|| [B0] MV $xia,$x1
+ [B0] ADDK 14,${xip}
+ .endasmfunc
+
+ .sect .const
+ .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-ia64.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-ia64.pl
new file mode 100755
index 0000000..81e75f7
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/ghash-ia64.pl
@@ -0,0 +1,470 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
+# GHASH performance was measured to be 6.67 cycles per processed byte
+# on Itanium 2, which is >90% better than Microsoft compiler generated
+# code. To anchor to something else, the sha1-ia64.pl module processes one
+# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
+# byte.
+
+# September 2010
+#
+# It was originally thought that it made less sense to implement the
+# "528B" variant on Itanium 2, for the following reason. Because the
+# number of functional units is naturally limited, it appeared
+# impossible to implement the "528B" loop in 4 cycles, only in 5. This
+# would mean that the theoretical performance improvement couldn't be
+# more than 20%. But occasionally you prove yourself wrong:-) I figured
+# out a way to fold a couple of instructions and, having freed yet
+# another instruction slot by unrolling the loop, reached 4.45 cycles
+# per processed byte, 50% better than the "256B" version. On the
+# original Itanium performance should remain the same as with the
+# "256B" version, i.e. ~8.5 cycles.
+
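+# The per-nibble recurrence the subroutines implement is: shift Z
+# right by 4, fold the dropped bits back in via rem_4bit[], then xor in
+# Htable[nibble]. A compact Perl model of gcm_gmult_4bit follows; an
+# illustrative sketch only, never called by this generator, assuming
+# 64-bit Perl. $Xi is a ref to 16 bytes, $Htbl a ref to the 16 [hi,lo]
+# entries of the per-key table (as prepared by gcm_init_4bit in
+# gcm128.c).
+
+sub _gmult_4bit_ref {
+	my ($Xi,$Htbl) = @_;
+	my @rem_4bit = map { $_<<48 } (0x0000,0x1C20,0x3840,0x2460,
+				       0x7080,0x6CA0,0x48C0,0x54E0,
+				       0xE100,0xFD20,0xD940,0xC560,
+				       0x9180,0x8DA0,0xA9C0,0xB5E0);
+	my ($Zhi,$Zlo,$first) = (0,0,1);
+	for (my $cnt=15; $cnt>=0; $cnt--) {	# Xi tail to head
+	    for my $nib ($Xi->[$cnt]&0xf, $Xi->[$cnt]>>4) {
+		if (!$first) {			# Z>>=4, folding the four
+		    my $rem = $Zlo & 0xf;	# dropped bits back in
+		    $Zlo = ($Zlo>>4) | (($Zhi&0xf)<<60);
+		    $Zhi = ($Zhi>>4) ^ $rem_4bit[$rem];
+		}
+		$first = 0;
+		$Zhi ^= $Htbl->[$nib][0];	# Z ^= Htable[nib]
+		$Zlo ^= $Htbl->[$nib][1];
+	    }
+	}
+	return ($Zhi,$Zlo);			# new Xi
+}
+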
+$output=pop and (open STDOUT,">$output" or die "can't open $output: $!");
+
+if ($^O eq "hpux") {
+ $ADDP="addp4";
+ for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
+ $big_endian=0 if (/\-DL_ENDIAN/); }
+if (!defined($big_endian))
+ { $big_endian=(unpack('L',pack('N',1))==1); }
+
+sub loop() {
+my $label=shift;
+my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
+
+# The loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
+# in a scalable manner;-) Naturally assuming data in L1 cache...
+# Special note about the 'dep' instruction, which is used to construct
+# &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned on a 128-byte
+# boundary and the lower 7 bits of its address are guaranteed to be
+# zero, so depositing Zlo&0xf at bit position 3 yields exactly
+# rem_4bit + (Zlo&0xf)*8.
+$code.=<<___;
+$label:
+{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
+ (p19) dep rem=Zlo,rem_4bitp,3,4 }
+{ .mfi; (p19) xor Zhi=Zhi,Hhi
+ ($p17) xor xi[1]=xi[1],in[1] };;
+{ .mfi; (p18) ld8 Hhi=[Hi[1]]
+ (p19) shrp Zlo=Zhi,Zlo,4 }
+{ .mfi; (p19) ld8 rem=[rem]
+ (p18) and Hi[1]=mask0xf0,xi[2] };;
+{ .mmi; ($p16) ld1 in[0]=[inp],-1
+ (p18) xor Zlo=Zlo,Hlo
+ (p19) shr.u Zhi=Zhi,4 }
+{ .mib; (p19) xor Hhi=Hhi,rem
+ (p18) add Hi[1]=Htbl,Hi[1] };;
+
+{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
+ (p18) dep rem=Zlo,rem_4bitp,3,4 }
+{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
+ (p18) xor Zhi=Zhi,Hhi };;
+{ .mfi; (p18) ld8 Hhi=[Hi[1]]
+ (p18) shrp Zlo=Zhi,Zlo,4 }
+{ .mfi; (p18) ld8 rem=[rem]
+ (p17) and Hi[0]=mask0xf0,Hi[0] };;
+{ .mmi; (p16) ld1 xi[0]=[Xi],-1
+ (p18) xor Zlo=Zlo,Hlo
+ (p18) shr.u Zhi=Zhi,4 }
+{ .mib; (p18) xor Hhi=Hhi,rem
+ (p17) add Hi[0]=Htbl,Hi[0]
+ br.ctop.sptk $label };;
+___
+}
+
+$code=<<___;
+.explicit
+.text
+
+prevfs=r2; prevlc=r3; prevpr=r8;
+mask0xf0=r21;
+rem=r22; rem_4bitp=r23;
+Xi=r24; Htbl=r25;
+inp=r26; end=r27;
+Hhi=r28; Hlo=r29;
+Zhi=r30; Zlo=r31;
+
+.align 128
+.skip 16 // aligns loop body
+.global gcm_gmult_4bit#
+.proc gcm_gmult_4bit#
+gcm_gmult_4bit:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,2,6,0,8
+ $ADDP Xi=15,in0 // &Xi[15]
+ mov rem_4bitp=ip }
+{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+ .body
+ .rotr in[3],xi[3],Hi[2]
+
+{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
+ mov mask0xf0=0xf0
+ brp.loop.imp .Loop1,.Lend1-16};;
+{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
+ };;
+{ .mii; shladd Hi[1]=xi[2],4,r0
+ mov pr.rot=0x7<<16
+ mov ar.lc=13 };;
+{ .mii; and Hi[1]=mask0xf0,Hi[1]
+ mov ar.ec=3
+ xor Zlo=Zlo,Zlo };;
+{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
+ add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
+ xor Zhi=Zhi,Zhi };;
+___
+ &loop (".Loop1",1);
+$code.=<<___;
+.Lend1:
+{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
+{ .mib; mux1 Zlo=Zlo,\@rev };;
+{ .mib; mux1 Zhi=Zhi,\@rev };;
+{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
+ add Hhi=1,Xi };; // pipeline flush on Itanium
+{ .mib; st8 [Hlo]=Zlo
+ mov pr=prevpr,0x1ffff };;
+{ .mib; st8 [Hhi]=Zhi
+ mov ar.lc=prevlc
+ br.ret.sptk.many b0 };;
+.endp gcm_gmult_4bit#
+___
+
+######################################################################
+# "528B" (well, "512B" actually: both 256-byte tables, Htable and its
+# 4-bit-shifted copy Hshr4, are built on the stack below) streamed GHASH
+#
+$Xip="in0";
+$Htbl="in1";
+$inp="in2";
+$len="in3";
+$rem_8bit="loc0";
+$mask0xff="loc1";
+($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
+
+sub load_htable() {
+ for (my $i=0;$i<8;$i++) {
+ $code.=<<___;
+{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
+ ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
+{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
+ ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
+___
+ $code.=shift if (($i+$#_)==7);
+ $code.="\t};;\n"
+ }
+}
+
+$code.=<<___;
+prevsp=r3;
+
+.align 32
+.skip 16 // aligns loop body
+.global gcm_ghash_4bit#
+.proc gcm_ghash_4bit#
+gcm_ghash_4bit:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,4,2,0,0
+ .vframe prevsp
+ mov prevsp=sp
+ mov $rem_8bit=ip };;
+ .body
+{ .mfi; $ADDP r8=0+0,$Htbl
+ $ADDP r9=0+8,$Htbl }
+{ .mfi; $ADDP r10=128+0,$Htbl
+ $ADDP r11=128+8,$Htbl };;
+___
+ &load_htable(
+ " $ADDP $Xip=15,$Xip", # &Xi[15]
+ " $ADDP $len=$len,$inp", # &inp[len]
+ " $ADDP $inp=15,$inp", # &inp[15]
+ " mov $mask0xff=0xff",
+ " add sp=-512,sp",
+ " andcm sp=sp,$mask0xff", # align stack frame
+ " add r14=0,sp",
+ " add r15=8,sp");
+$code.=<<___;
+{ .mmi; $sum 1<<1 // go big-endian
+ add r8=256+0,sp
+ add r9=256+8,sp }
+{ .mmi; add r10=256+128+0,sp
+ add r11=256+128+8,sp
+ add $len=-17,$len };;
+___
+for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
+my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
+$code.=<<___;
+{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
+ st8 [r9]=$rhi,16 // Htable[$i].hi
+ shrp $rlo=$rhi,$rlo,4 }//;;
+{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
+ stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
+ shr.u $rhi=$rhi,4 };;
+{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
+ st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
+___
+}
+$code.=<<___;
+{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
+ ld8 r17=[r9],16 };; // Htable[8].hi
+{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
+ ld8 r19=[r9],16 } // Htable[9].hi
+{ .mmi; rum 1<<5 // clear um.mfh
+ shrp r16=r17,r16,4 };;
+___
+for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
+$code.=<<___;
+{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
+ ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
+ shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
+{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
+ st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
+ shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
+___
+}
+$code.=<<___;
+{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
+{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
+ st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
+ shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
+{ .mmi; add $Htbl=256,sp // &Htable[0]
+ add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
+ shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
+{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
+ st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
+___
+
+$in="r15";
+@xi=("r16","r17");
+@rem=("r18","r19");
+($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
+($Atbl,$Btbl)=("r26","r27");
+
+$code.=<<___; # (p16)
+{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
+ ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ cmp.eq p0,p6=r0,r0 };; // clear p6
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p16),(p17)
+{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
+ dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+.align 32
+.LOOP:
+{ .mmi;
+(p6) st8 [$Xip]=$Zhi,13
+ xor $Zlo=$Zlo,$Zlo
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p16),(p17),(p18)
+{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
+{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
+ xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
+{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+ ld1 $in=[$inp],-1 } //(p16) *inp--
+{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
+ mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
+ ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+for ($i=1;$i<14;$i++) {
+# The fragments above and below are derived from this one by removing
+# the (p??)-predicated instructions that are unsuitable at those stages.
+$code.=<<___; # (p16),(p17),(p18),(p19)
+{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+ shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
+{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+ dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
+{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
+ xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
+ xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+ ld1 $in=[$inp],-1 //(p16) *inp--
+ shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
+{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
+ xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
+ ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+}
+
+$code.=<<___; # (p17),(p18),(p19)
+{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+ shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
+{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+ dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
+{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
+ xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
+ xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+ shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
+{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
+ xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
+ shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p18),(p19)
+{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
+{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
+{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
+{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+ xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
+ shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
+{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
+ xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
+{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
+ shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
+{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p19)
+{ .mmi; cmp.ltu p6,p0=$inp,$len
+ add $inp=32,$inp
+ shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
+{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
+ add $Xip=9,$Xip };; // &Xi.lo
+{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+(p6) ld1 $in=[$inp],-1 //[p16] *inp--
+(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
+{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
+(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
+{ .mmi; st8 [$Xip]=$Zlo,-8
+(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
+ shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
+{ .mmi;
+(p6) ld1 $in=[$inp],-1 //[p16] *inp--
+ xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
+(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
+{ .mib;
+(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
+(p6) br.cond.dptk.many .LOOP };;
+
+{ .mib; st8 [$Xip]=$Zhi };;
+{ .mib; $rum 1<<1 // return to little-endian
+ .restore sp
+ mov sp=prevsp
+ br.ret.sptk.many b0 };;
+.endp gcm_ghash_4bit#
+___
+$code.=<<___;
+.align 128
+.type rem_4bit#,\@object
+rem_4bit:
+ data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+ data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+ data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+ data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.size rem_4bit#,128
+.type rem_8bit#,\@object
+rem_8bit:
+ data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
+ data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
+ data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
+ data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
+ data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
+ data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
+ data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
+ data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
+ data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
+ data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
+ data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
+ data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
+ data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
+ data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
+ data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
+ data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
+ data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
+ data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
+ data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
+ data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
+ data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
+ data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
+ data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
+ data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
+ data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
+ data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
+ data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
+ data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
+ data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
+ data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
+ data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
+ data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
+.size rem_8bit#,512
+stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-parisc.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-parisc.pl
new file mode 100644
index 0000000..1d62545
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/ghash-parisc.pl
@@ -0,0 +1,738 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
+# it processes one byte in 19.6 cycles, which is more than twice as
+# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
+# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
+# processed byte. This is ~2.2x faster than 64-bit code generated by
+# vendor compiler (which used to be very hard to beat:-).
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+ $NREGS =6;
+} else {
+ $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+ $NREGS =11;
+}
+
+$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
+ # [+ argument transfer]
+
+################# volatile registers
+$Xi="%r26"; # argument block
+$Htbl="%r25";
+$inp="%r24";
+$len="%r23";
+$Hhh=$Htbl; # variables
+$Hll="%r22";
+$Zhh="%r21";
+$Zll="%r20";
+$cnt="%r19";
+$rem_4bit="%r28";
+$rem="%r29";
+$mask0xf0="%r31";
+
+################# preserved registers
+$Thh="%r1";
+$Tll="%r2";
+$nlo="%r3";
+$nhi="%r4";
+$byte="%r5";
+if ($SIZE_T==4) {
+ $Zhl="%r6";
+ $Zlh="%r7";
+ $Hhl="%r8";
+ $Hlh="%r9";
+ $Thl="%r10";
+ $Tlh="%r11";
+}
+$rem2="%r6"; # used in PA-RISC 2.0 code
+
+$code.=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
+ .ALIGN 64
+gcm_gmult_4bit
+ .PROC
+ .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
+ blr %r0,$rem_4bit
+ ldi 3,$rem
+L\$pic_gmult
+ andcm $rem_4bit,$rem,$rem_4bit
+ addl $inp,$len,$len
+ ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
+ ldi 0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldi 31,$rem
+ mtctl $rem,%cr11
+ extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
+ b L\$parisc1_gmult
+ nop
+___
+
+$code.=<<___;
+ ldb 15($Xi),$nlo
+ ldo 8($Htbl),$Hll
+
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ ldd $nlo($Hll),$Zll
+ ldd $nlo($Hhh),$Zhh
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldb 14($Xi),$nlo
+
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+ b L\$oop_gmult_pa2
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_gmult_pa2
+ xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
+ depd,z $Zll,60,4,$rem
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem
+ ldbx $cnt($Xi),$nlo
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+ ldd $rem($rem_4bit),$rem
+
+ xor $Tll,$Zll,$Zll
+ addib,uv -1,$cnt,L\$oop_gmult_pa2
+ xor $Thh,$Zhh,$Zhh
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ std $Zll,8($Xi)
+ std $Zhh,0($Xi)
+___
+
+$code.=<<___ if ($SIZE_T==4);
+ b L\$done_gmult
+ nop
+
+L\$parisc1_gmult
+ ldb 15($Xi),$nlo
+ ldo 12($Htbl),$Hll
+ ldo 8($Htbl),$Hlh
+ ldo 4($Htbl),$Hhl
+
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ ldwx $nlo($Hll),$Zll
+ ldwx $nlo($Hlh),$Zlh
+ ldwx $nlo($Hhl),$Zhl
+ ldwx $nlo($Hhh),$Zhh
+ zdep $Zll,28,4,$rem
+ ldb 14($Xi),$nlo
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ extru $Zhh,27,28,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ xor $rem,$Zhh,$Zhh
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $Thl,$Zhl,$Zhl
+ b L\$oop_gmult_pa1
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_gmult_pa1
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldbx $cnt($Xi),$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $rem,$Zhh,$Zhh
+ zdep $Zll,28,4,$rem
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zhl,$Zlh,4,$Zlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ and $mask0xf0,$nlo,$nhi
+ extru $Zhh,27,28,$Zhh
+ zdep $nlo,27,4,$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $rem,$Zhh,$Zhh
+ addib,uv -1,$cnt,L\$oop_gmult_pa1
+ xor $Thl,$Zhl,$Zhl
+
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $rem,$Zhh,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ zdep $Zll,28,4,$rem
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ extru $Zhh,27,28,$Zhh
+ xor $Tll,$Zll,$Zll
+ xor $Tlh,$Zlh,$Zlh
+ xor $rem,$Zhh,$Zhh
+ stw $Zll,12($Xi)
+ xor $Thl,$Zhl,$Zhl
+ stw $Zlh,8($Xi)
+ xor $Thh,$Zhh,$Zhh
+ stw $Zhl,4($Xi)
+ stw $Zhh,0($Xi)
+___
+$code.=<<___;
+L\$done_gmult
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+ .ALIGN 64
+gcm_ghash_4bit
+ .PROC
+ .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
+ blr %r0,$rem_4bit
+ ldi 3,$rem
+L\$pic_ghash
+ andcm $rem_4bit,$rem,$rem_4bit
+ addl $inp,$len,$len
+ ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
+ ldi 0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldi 31,$rem
+ mtctl $rem,%cr11
+ extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
+ b L\$parisc1_ghash
+ nop
+___
+
+$code.=<<___;
+ ldb 15($Xi),$nlo
+ ldo 8($Htbl),$Hll
+
+L\$outer_ghash_pa2
+ ldb 15($inp),$nhi
+ xor $nhi,$nlo,$nlo
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ ldd $nlo($Hll),$Zll
+ ldd $nlo($Hhh),$Zhh
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldb 14($Xi),$nlo
+ ldb 14($inp),$byte
+
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+ xor $byte,$nlo,$nlo
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+ b L\$oop_ghash_pa2
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_ghash_pa2
+ xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
+ depd,z $Zll,60,4,$rem2
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldbx $cnt($Xi),$nlo
+ ldbx $cnt($inp),$byte
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ ldd $rem2($rem_4bit),$rem2
+
+ xor $rem2,$Zhh,$Zhh
+ xor $byte,$nlo,$nlo
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ extrd,u $Zhh,59,60,$Zhh
+ xor $Tll,$Zll,$Zll
+
+ ldd $rem($rem_4bit),$rem
+ addib,uv -1,$cnt,L\$oop_ghash_pa2
+ xor $Thh,$Zhh,$Zhh
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem2
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ ldd $rem2($rem_4bit),$rem2
+
+ xor $rem2,$Zhh,$Zhh
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ extrd,u $Zhh,59,60,$Zhh
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ std $Zll,8($Xi)
+ ldo 16($inp),$inp
+ std $Zhh,0($Xi)
+ cmpb,*<> $inp,$len,L\$outer_ghash_pa2
+ copy $Zll,$nlo
+___
+
+$code.=<<___ if ($SIZE_T==4);
+ b L\$done_ghash
+ nop
+
+L\$parisc1_ghash
+ ldb 15($Xi),$nlo
+ ldo 12($Htbl),$Hll
+ ldo 8($Htbl),$Hlh
+ ldo 4($Htbl),$Hhl
+
+L\$outer_ghash_pa1
+ ldb 15($inp),$byte
+ xor $byte,$nlo,$nlo
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ ldwx $nlo($Hll),$Zll
+ ldwx $nlo($Hlh),$Zlh
+ ldwx $nlo($Hhl),$Zhl
+ ldwx $nlo($Hhh),$Zhh
+ zdep $Zll,28,4,$rem
+ ldb 14($Xi),$nlo
+ ldb 14($inp),$byte
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ extru $Zhh,27,28,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ xor $byte,$nlo,$nlo
+ xor $rem,$Zhh,$Zhh
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $Thl,$Zhl,$Zhl
+ b L\$oop_ghash_pa1
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_ghash_pa1
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldbx $cnt($Xi),$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ ldbx $cnt($inp),$byte
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $rem,$Zhh,$Zhh
+ zdep $Zll,28,4,$rem
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zhl,$Zlh,4,$Zlh
+ xor $byte,$nlo,$nlo
+ shrpw $Zhh,$Zhl,4,$Zhl
+ and $mask0xf0,$nlo,$nhi
+ extru $Zhh,27,28,$Zhh
+ zdep $nlo,27,4,$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $rem,$Zhh,$Zhh
+ addib,uv -1,$cnt,L\$oop_ghash_pa1
+ xor $Thl,$Zhl,$Zhl
+
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $rem,$Zhh,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ zdep $Zll,28,4,$rem
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ extru $Zhh,27,28,$Zhh
+ xor $Tll,$Zll,$Zll
+ xor $Tlh,$Zlh,$Zlh
+ xor $rem,$Zhh,$Zhh
+ stw $Zll,12($Xi)
+ xor $Thl,$Zhl,$Zhl
+ stw $Zlh,8($Xi)
+ xor $Thh,$Zhh,$Zhh
+ stw $Zhl,4($Xi)
+ ldo 16($inp),$inp
+ stw $Zhh,0($Xi)
+ comb,<> $inp,$len,L\$outer_ghash_pa1
+ copy $Zll,$nlo
+___
+$code.=<<___;
+L\$done_ghash
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 64
+L\$rem_4bit
+ .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+ .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+ .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+ .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+	.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+ .ALIGN 64
+___
+
+# Explicitly encode the PA-RISC 2.0 instructions used in this module,
+# so that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this if the GNU assembler understood the
+# .ALLOW 2.0 directive...
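+# For example, "std %r20,8(%r26)", which occurs in the code above, is
+# turned by the $std handler below into ".WORD 0x73540010", i.e. the
+# 2.0 opcode emitted as raw data that a 1.0-level assembler accepts
+# (the hex value follows from the encoding formula in $std).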
+
+my $ldd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "ldd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
+ { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
+ { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+ $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $std = sub {
+ my ($mod,$args) = @_;
+ my $orig = "std$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
+ { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $extrd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "extrd$mod\t$args";
+
+ # I only have ",u" completer, it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
+ { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+ my $len=32-$3;
+ $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
+ { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+ my $len=32-$2;
+ $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
+ $opcode |= (1<<13) if ($mod =~ /,\**=/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "shrpd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
+ { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+ my $cpos=63-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
+ { sprintf "\t.WORD\t0x%08x\t; %s",
+ (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $depd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "depd$mod\t$args";
+
+	# I only have ",z" completer, it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
+ { my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
+ my $cpos=63-$2;
+ my $len=32-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+sub assemble {
+ my ($mnemonic,$mod,$args)=@_;
+ my $opcode = eval("\$$mnemonic");
+
+ ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ if ($SIZE_T==4) {
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
+ s/cmpb,\*/comb,/;
+ s/,\*/,/;
+ }
+ s/\bbv\b/bve/ if ($SIZE_T==8);
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-s390x.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-s390x.pl
new file mode 100644
index 0000000..6e628d8
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/ghash-s390x.pl
@@ -0,0 +1,258 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# September 2010.
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Performance
+# was measured to be ~18 cycles per processed byte on z10, which is
+# almost 40% better than gcc-generated code. It should be noted that
+# 18 cycles is worse result than expected: loop is scheduled for 12
+# and the result should be close to 12. In the lack of instruction-
+# level profiling data it's impossible to tell why...
+
+# November 2010.
+#
+# Adapt for -m31 build. If the kernel supports what's called the
+# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
+# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
+# legacy application context. The feature is not specific to any
+# particular processor, as long as it's a "z-CPU". The latter implies
+# that the code remains z/Architecture specific. On z990 it was measured
+# to perform 2.8x better than 32-bit code generated by gcc 4.3.
+
+# March 2011.
+#
+# Support for hardware KIMD-GHASH is verified to produce correct
+# results and is therefore engaged. On z196 it was measured to process
+# an 8KB buffer ~7x faster than the software implementation. It's not
+# as impressive for smaller buffer sizes, and for the smallest 16-byte
+# buffer it's actually almost 2 times slower, which is the reason why
+# KIMD-GHASH is not used in gcm_gmult_4bit.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$softonly=0;
+
+$Zhi="%r0";
+$Zlo="%r1";
+
+$Xi="%r2"; # argument block
+$Htbl="%r3";
+$inp="%r4";
+$len="%r5";
+
+$rem0="%r6"; # variables
+$rem1="%r7";
+$nlo="%r8";
+$nhi="%r9";
+$xi="%r10";
+$cnt="%r11";
+$tmp="%r12";
+$x78="%r13";
+$rem_4bit="%r14";
+
+$sp="%r15";
+
+$code.=<<___;
+.text
+
+.globl gcm_gmult_4bit
+.align 32
+gcm_gmult_4bit:
+___
+$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
+ larl %r1,OPENSSL_s390xcap_P
+ lghi %r0,0
+ lg %r1,24(%r1) # load second word of kimd capabilities vector
+ tmhh %r1,0x4000 # check for function 65
+ jz .Lsoft_gmult
+ stg %r0,16($sp) # arrange 16 bytes of zero input
+ stg %r0,24($sp)
+ lghi %r0,65 # function 65
+ la %r1,0($Xi) # H lies right after Xi in gcm128_context
+ la $inp,16($sp)
+ lghi $len,16
+ .long 0xb93e0004 # kimd %r0,$inp
+ brc 1,.-4 # pay attention to "partial completion"
+ br %r14
+.align 32
+.Lsoft_gmult:
+___
+$code.=<<___;
+ stm${g} %r6,%r14,6*$SIZE_T($sp)
+
+ aghi $Xi,-1
+ lghi $len,1
+ lghi $x78,`0xf<<3`
+ larl $rem_4bit,rem_4bit
+
+ lg $Zlo,8+1($Xi) # Xi
+ j .Lgmult_shortcut
+.type gcm_gmult_4bit,\@function
+.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
+
+.globl gcm_ghash_4bit
+.align 32
+gcm_ghash_4bit:
+___
+$code.=<<___ if(!$softonly);
+ larl %r1,OPENSSL_s390xcap_P
+ lg %r0,24(%r1) # load second word of kimd capabilities vector
+ tmhh %r0,0x4000 # check for function 65
+ jz .Lsoft_ghash
+ lghi %r0,65 # function 65
+ la %r1,0($Xi) # H lies right after Xi in gcm128_context
+ .long 0xb93e0004 # kimd %r0,$inp
+ brc 1,.-4 # pay attention to "partial completion"
+ br %r14
+.align 32
+.Lsoft_ghash:
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+ llgfr $len,$len
+___
+$code.=<<___;
+ stm${g} %r6,%r14,6*$SIZE_T($sp)
+
+ aghi $Xi,-1
+ srlg $len,$len,4
+ lghi $x78,`0xf<<3`
+ larl $rem_4bit,rem_4bit
+
+ lg $Zlo,8+1($Xi) # Xi
+ lg $Zhi,0+1($Xi)
+ lghi $tmp,0
+.Louter:
+ xg $Zhi,0($inp) # Xi ^= inp
+ xg $Zlo,8($inp)
+ xgr $Zhi,$tmp
+ stg $Zlo,8+1($Xi)
+ stg $Zhi,0+1($Xi)
+
+.Lgmult_shortcut:
+ lghi $tmp,0xf0
+ sllg $nlo,$Zlo,4
+ srlg $xi,$Zlo,8 # extract second byte
+ ngr $nlo,$tmp
+ lgr $nhi,$Zlo
+ lghi $cnt,14
+ ngr $nhi,$tmp
+
+ lg $Zlo,8($nlo,$Htbl)
+ lg $Zhi,0($nlo,$Htbl)
+
+ sllg $nlo,$xi,4
+ sllg $rem0,$Zlo,3
+ ngr $nlo,$tmp
+ ngr $rem0,$x78
+ ngr $xi,$tmp
+
+ sllg $tmp,$Zhi,60
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nhi,$Htbl)
+ xg $Zhi,0($nhi,$Htbl)
+ lgr $nhi,$xi
+ sllg $rem1,$Zlo,3
+ xgr $Zlo,$tmp
+ ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
+ j .Lghash_inner
+.align 16
+.Lghash_inner:
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nlo,$Htbl)
+ llgc $xi,0($cnt,$Xi)
+ xg $Zhi,0($nlo,$Htbl)
+ sllg $nlo,$xi,4
+ xg $Zhi,0($rem0,$rem_4bit)
+ nill $nlo,0xf0
+ sllg $rem0,$Zlo,3
+ xgr $Zlo,$tmp
+ ngr $rem0,$x78
+ nill $xi,0xf0
+
+ sllg $tmp,$Zhi,60
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nhi,$Htbl)
+ xg $Zhi,0($nhi,$Htbl)
+ lgr $nhi,$xi
+ xg $Zhi,0($rem1,$rem_4bit)
+ sllg $rem1,$Zlo,3
+ xgr $Zlo,$tmp
+ ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
+ brct $cnt,.Lghash_inner
+
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nlo,$Htbl)
+ xg $Zhi,0($nlo,$Htbl)
+ sllg $xi,$Zlo,3
+ xg $Zhi,0($rem0,$rem_4bit)
+ xgr $Zlo,$tmp
+ ngr $xi,$x78
+
+ sllg $tmp,$Zhi,60
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nhi,$Htbl)
+ xg $Zhi,0($nhi,$Htbl)
+ xgr $Zlo,$tmp
+ xg $Zhi,0($rem1,$rem_4bit)
+
+ lg $tmp,0($xi,$rem_4bit)
+ la $inp,16($inp)
+ sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
+ brctg $len,.Louter
+
+ xgr $Zhi,$tmp
+ stg $Zlo,8+1($Xi)
+ stg $Zhi,0+1($Xi)
+ lm${g} %r6,%r14,6*$SIZE_T($sp)
+ br %r14
+.type gcm_ghash_4bit,\@function
+.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
+
+.align 64
+rem_4bit:
+ .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
+ .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
+ .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
+ .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
+.type rem_4bit,\@object
+.size rem_4bit,(.-rem_4bit)
+.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-sparcv9.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-sparcv9.pl
new file mode 100644
index 0000000..c4eb3b1
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/ghash-sparcv9.pl
@@ -0,0 +1,581 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Performance
+# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
+# and are expressed in cycles per processed byte, less is better:
+#
+# gcc 3.3.x cc 5.2 this assembler
+#
+# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
+# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
+#
+# Here is data collected on UltraSPARC T1 system running Linux:
+#
+# gcc 4.4.1 this assembler
+#
+# 32-bit build 566 50 (+1000%)
+# 64-bit build 56 50 (+12%)
+#
+# I don't quite understand why the difference between 32-bit and 64-bit
+# compiler-generated code is so big. Compilers *were* instructed to
+# generate code for UltraSPARC and should have used 64-bit registers
+# for the Z vector (see C code) even in the 32-bit build... Oh well, it
+# only means more impressive improvement coefficients for this
+# assembler module;-) Loops are aggressively modulo-scheduled with
+# respect to references to input data and Z.hi updates, to achieve the
+# 12-cycle timing. To anchor to something else, sha1-sparcv9.pl spends
+# 11.6
+# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
+#
+# October 2012
+#
+# Add VIS3 lookup-table-free implementation using polynomial
+# multiplication xmulx[hi] and extended addition addxc[cc]
+# instructions. This yields a 4.52/7.63x improvement on T3/T4, or in
+# absolute terms 7.90/2.14 cycles per byte. On T4 the multi-process
+# benchmark saturates at ~15.5x the single-process result on an 8-core
+# processor, or ~20.5GBps per 2.85GHz socket.
+
+$output=pop;
+open STDOUT,">$output";
+
+$frame="STACK_FRAME";
+$bias="STACK_BIAS";
+
+$Zhi="%o0"; # 64-bit values
+$Zlo="%o1";
+$Thi="%o2";
+$Tlo="%o3";
+$rem="%o4";
+$tmp="%o5";
+
+$nhi="%l0"; # small values and pointers
+$nlo="%l1";
+$xi0="%l2";
+$xi1="%l3";
+$rem_4bit="%l4";
+$remi="%l5";
+$Htblo="%l6";
+$cnt="%l7";
+
+$Xi="%i0"; # input argument block
+$Htbl="%i1";
+$inp="%i2";
+$len="%i3";
+
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef __arch64__
+.register %g2,#scratch
+.register %g3,#scratch
+#endif
+
+.section ".text",#alloc,#execinstr
+
+.align 64
+rem_4bit:
+ .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+ .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+ .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+ .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+.type rem_4bit,#object
+.size rem_4bit,(.-rem_4bit)
+
+.globl gcm_ghash_4bit
+.align 32
+gcm_ghash_4bit:
+ save %sp,-$frame,%sp
+ ldub [$inp+15],$nlo
+ ldub [$Xi+15],$xi0
+ ldub [$Xi+14],$xi1
+ add $len,$inp,$len
+ add $Htbl,8,$Htblo
+
+1: call .+8
+ add %o7,rem_4bit-1b,$rem_4bit
+
+.Louter:
+ xor $xi0,$nlo,$nlo
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ sll $nlo,4,$nlo
+ ldx [$Htblo+$nlo],$Zlo
+ ldx [$Htbl+$nlo],$Zhi
+
+ ldub [$inp+14],$nlo
+
+ ldx [$Htblo+$nhi],$Tlo
+ and $Zlo,0xf,$remi
+ ldx [$Htbl+$nhi],$Thi
+ sll $remi,3,$remi
+ ldx [$rem_4bit+$remi],$rem
+ srlx $Zlo,4,$Zlo
+ mov 13,$cnt
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+
+ xor $xi1,$nlo,$nlo
+ and $Zlo,0xf,$remi
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ ba .Lghash_inner
+ sll $nlo,4,$nlo
+.align 32
+.Lghash_inner:
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ ldub [$inp+$cnt],$nlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ ldub [$Xi+$cnt],$xi1
+ xor $Thi,$Zhi,$Zhi
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $xi1,$nlo,$nlo
+ srlx $Zhi,4,$Zhi
+ and $nlo,0xf0,$nhi
+ addcc $cnt,-1,$cnt
+ xor $Zlo,$tmp,$Zlo
+ and $nlo,0x0f,$nlo
+ xor $Tlo,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+ blu .Lghash_inner
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+
+ add $inp,16,$inp
+ cmp $inp,$len
+ be,pn SIZE_T_CC,.Ldone
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ ldub [$inp+15],$nlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ stx $Zlo,[$Xi+8]
+ xor $rem,$Zhi,$Zhi
+ stx $Zhi,[$Xi]
+ srl $Zlo,8,$xi1
+ and $Zlo,0xff,$xi0
+ ba .Louter
+ and $xi1,0xff,$xi1
+.align 32
+.Ldone:
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ stx $Zlo,[$Xi+8]
+ xor $rem,$Zhi,$Zhi
+ stx $Zhi,[$Xi]
+
+ ret
+ restore
+.type gcm_ghash_4bit,#function
+.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
+___
+
+undef $inp;
+undef $len;
+
+$code.=<<___;
+.globl gcm_gmult_4bit
+.align 32
+gcm_gmult_4bit:
+ save %sp,-$frame,%sp
+ ldub [$Xi+15],$nlo
+ add $Htbl,8,$Htblo
+
+1: call .+8
+ add %o7,rem_4bit-1b,$rem_4bit
+
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ sll $nlo,4,$nlo
+ ldx [$Htblo+$nlo],$Zlo
+ ldx [$Htbl+$nlo],$Zhi
+
+ ldub [$Xi+14],$nlo
+
+ ldx [$Htblo+$nhi],$Tlo
+ and $Zlo,0xf,$remi
+ ldx [$Htbl+$nhi],$Thi
+ sll $remi,3,$remi
+ ldx [$rem_4bit+$remi],$rem
+ srlx $Zlo,4,$Zlo
+ mov 13,$cnt
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+
+ and $Zlo,0xf,$remi
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ ba .Lgmult_inner
+ sll $nlo,4,$nlo
+.align 32
+.Lgmult_inner:
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ ldub [$Xi+$cnt],$nlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ srlx $Zhi,4,$Zhi
+ and $nlo,0xf0,$nhi
+ addcc $cnt,-1,$cnt
+ xor $Zlo,$tmp,$Zlo
+ and $nlo,0x0f,$nlo
+ xor $Tlo,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+ blu .Lgmult_inner
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ stx $Zlo,[$Xi+8]
+ xor $rem,$Zhi,$Zhi
+ stx $Zhi,[$Xi]
+
+ ret
+ restore
+.type gcm_gmult_4bit,#function
+.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
+___
+
+{{{
+# Straightforward 128x128-bit multiplication using the Karatsuba
+# algorithm followed by a pair of 64-bit reductions [with a shortcut
+# in the first one, which allows breaking the dependency between the
+# reductions and removing one multiplication from the critical path].
+# While it might be suboptimal with regard to the sheer number of
+# multiplications, other methods [such as aggregate reduction] would
+# require more 64-bit registers, which we don't have in a 32-bit
+# application context.
+
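+# As a reminder, Karatsuba assembles the 128x128-bit carry-less
+# product from three 64x64-bit ones:
+#
+#	X·H = (Xhi·Hhi)<<128 ^ (Xlo·Hlo)
+#	    ^ [(Xhi^Xlo)·(Hhi^Hlo) ^ Xhi·Hhi ^ Xlo·Hlo]<<64
+#
+# which is the identity behind the "Karatsuba pre-processing" and
+# "post-processing" xors around the xmulx/xmulxhi instructions below.
+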
+($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
+
+($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
+ (map("%o$_",(0..5,7)),map("%g$_",(1..5)));
+
+($shl,$shr)=map("%l$_",(0..7));
+
+# For details regarding "twisted H" see ghash-x86.pl.
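+# (Roughly: at key setup H is shifted left by one bit and reduced
+# modulo the GCM polynomial - the conditional xor with 1 and
+# 0xE1<<57 in gcm_init_vis3 below - so that the extra shift which a
+# bit-reflected carry-less product would otherwise require after
+# every multiplication is paid for only once, at setup.)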
+$code.=<<___;
+.globl gcm_init_vis3
+.align 32
+gcm_init_vis3:
+ save %sp,-$frame,%sp
+
+ ldx [%i1+0],$Hhi
+ ldx [%i1+8],$Hlo
+ mov 0xE1,$Xhi
+ mov 1,$Xlo
+ sllx $Xhi,57,$Xhi
+ srax $Hhi,63,$C0 ! broadcast carry
+ addcc $Hlo,$Hlo,$Hlo ! H<<=1
+ addxc $Hhi,$Hhi,$Hhi
+ and $C0,$Xlo,$Xlo
+ and $C0,$Xhi,$Xhi
+ xor $Xlo,$Hlo,$Hlo
+ xor $Xhi,$Hhi,$Hhi
+ stx $Hlo,[%i0+8] ! save twisted H
+ stx $Hhi,[%i0+0]
+
+ sethi %hi(0xA0406080),$V
+ sethi %hi(0x20C0E000),%l0
+ or $V,%lo(0xA0406080),$V
+ or %l0,%lo(0x20C0E000),%l0
+ sllx $V,32,$V
+ or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
+ stx $V,[%i0+16]
+
+ ret
+ restore
+.type gcm_init_vis3,#function
+.size gcm_init_vis3,.-gcm_init_vis3
+
+.globl gcm_gmult_vis3
+.align 32
+gcm_gmult_vis3:
+ save %sp,-$frame,%sp
+
+ ldx [$Xip+8],$Xlo ! load Xi
+ ldx [$Xip+0],$Xhi
+ ldx [$Htable+8],$Hlo ! load twisted H
+ ldx [$Htable+0],$Hhi
+
+ mov 0xE1,%l7
+ sllx %l7,57,$xE1 ! 57 is not a typo
+ ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
+
+ xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
+ xmulx $Xlo,$Hlo,$C0
+ xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
+ xmulx $C2,$Hhl,$C1
+ xmulxhi $Xlo,$Hlo,$Xlo
+ xmulxhi $C2,$Hhl,$C2
+ xmulxhi $Xhi,$Hhi,$C3
+ xmulx $Xhi,$Hhi,$Xhi
+
+ sll $C0,3,$sqr
+ srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
+ xor $C0,$sqr,$sqr
+ sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
+
+ xor $C0,$C1,$C1 ! Karatsuba post-processing
+ xor $Xlo,$C2,$C2
+ xor $sqr,$Xlo,$Xlo ! real destination is $C1
+ xor $C3,$C2,$C2
+ xor $Xlo,$C1,$C1
+ xor $Xhi,$C2,$C2
+ xor $Xhi,$C1,$C1
+
+ xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
+ xor $C0,$C2,$C2
+ xmulx $C1,$xE1,$C0
+ xor $C1,$C3,$C3
+ xmulxhi $C1,$xE1,$C1
+
+ xor $Xlo,$C2,$C2
+ xor $C0,$C2,$C2
+ xor $C1,$C3,$C3
+
+ stx $C2,[$Xip+8] ! save Xi
+ stx $C3,[$Xip+0]
+
+ ret
+ restore
+.type gcm_gmult_vis3,#function
+.size gcm_gmult_vis3,.-gcm_gmult_vis3
+
+.globl gcm_ghash_vis3
+.align 32
+gcm_ghash_vis3:
+ save %sp,-$frame,%sp
+ nop
+ srln $len,0,$len ! needed on v8+, "nop" on v9
+
+ ldx [$Xip+8],$C2 ! load Xi
+ ldx [$Xip+0],$C3
+ ldx [$Htable+8],$Hlo ! load twisted H
+ ldx [$Htable+0],$Hhi
+
+ mov 0xE1,%l7
+ sllx %l7,57,$xE1 ! 57 is not a typo
+ ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
+
+ and $inp,7,$shl
+ andn $inp,7,$inp
+ sll $shl,3,$shl
+ prefetch [$inp+63], 20
+ sub %g0,$shl,$shr
+
+ xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
+.Loop:
+ ldx [$inp+8],$Xlo
+ brz,pt $shl,1f
+ ldx [$inp+0],$Xhi
+
+ ldx [$inp+16],$C1 ! align data
+ srlx $Xlo,$shr,$C0
+ sllx $Xlo,$shl,$Xlo
+ sllx $Xhi,$shl,$Xhi
+ srlx $C1,$shr,$C1
+ or $C0,$Xhi,$Xhi
+ or $C1,$Xlo,$Xlo
+1:
+ add $inp,16,$inp
+ sub $len,16,$len
+ xor $C2,$Xlo,$Xlo
+ xor $C3,$Xhi,$Xhi
+ prefetch [$inp+63], 20
+
+ xmulx $Xlo,$Hlo,$C0
+ xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
+ xmulx $C2,$Hhl,$C1
+ xmulxhi $Xlo,$Hlo,$Xlo
+ xmulxhi $C2,$Hhl,$C2
+ xmulxhi $Xhi,$Hhi,$C3
+ xmulx $Xhi,$Hhi,$Xhi
+
+ sll $C0,3,$sqr
+ srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
+ xor $C0,$sqr,$sqr
+ sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
+
+ xor $C0,$C1,$C1 ! Karatsuba post-processing
+ xor $Xlo,$C2,$C2
+ xor $sqr,$Xlo,$Xlo ! real destination is $C1
+ xor $C3,$C2,$C2
+ xor $Xlo,$C1,$C1
+ xor $Xhi,$C2,$C2
+ xor $Xhi,$C1,$C1
+
+ xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
+ xor $C0,$C2,$C2
+ xmulx $C1,$xE1,$C0
+ xor $C1,$C3,$C3
+ xmulxhi $C1,$xE1,$C1
+
+ xor $Xlo,$C2,$C2
+ xor $C0,$C2,$C2
+ brnz,pt $len,.Loop
+ xor $C1,$C3,$C3
+
+ stx $C2,[$Xip+8] ! save Xi
+ stx $C3,[$Xip+0]
+
+ ret
+ restore
+.type gcm_ghash_vis3,#function
+.size gcm_ghash_vis3,.-gcm_ghash_vis3
+___
+}}}
+$code.=<<___;
+.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+
+# The purpose of this subroutine is to encode VIS instructions
+# explicitly, so that one can compile the module without having to
+# specify VIS extensions on the compiler command line, e.g. -xarch=v9
+# vs. -xarch=v9a. The idea is to preserve the option of producing a
+# "universal" binary and letting the program detect at run-time
+# whether the current CPU is VIS-capable.
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = ( "addxc" => 0x011,
+ "addxccc" => 0x013,
+ "xmulx" => 0x115,
+ "xmulxhi" => 0x116 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%([goli])([0-9])/);
+ $_=$bias{$1}+$2;
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
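+
+# For instance, plugging rs1=8 (%o0), rs2=9 (%o1), rd=10 (%o2) and
+# opf=0x115 into the .word formula above shows that
+# "xmulx %o0,%o1,%o2" would be emitted as ".word 0x95b222a9".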
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unvis3($1,$2,$3,$4)
+ /ge;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-x86.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-x86.pl
new file mode 100644
index 0000000..cd84582
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/ghash-x86.pl
@@ -0,0 +1,1405 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, May, June 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [+64/128 bytes of fixed
+# table]. It has two code paths: vanilla x86 and vanilla SSE. The
+# former is executed on 486 and Pentium, the latter on all others.
+# SSE GHASH features a so-called "528B" variant of the "4-bit" method,
+# utilizing an additional 256+16 bytes of per-key storage [+512 bytes
+# of shared table]. Performance results are for the streamed GHASH
+# subroutine and are expressed in cycles per processed byte, less is
+# better:
+#
+# gcc 2.95.3(*) SSE assembler x86 assembler
+#
+# Pentium 105/111(**) - 50
+# PIII 68 /75 12.2 24
+# P4 125/125 17.8 84(***)
+# Opteron 66 /70 10.1 30
+# Core2 54 /67 8.4 18
+# Atom 105/105 16.8 53
+# VIA Nano 69 /71 13.0 27
+#
+# (*) gcc 3.4.x was observed to generate a few percent slower code,
+# which is one of the reasons why 2.95.3 results were chosen;
+# another reason is the lack of 3.4.x results for older CPUs;
+# comparison with SSE results is not completely fair, because C
+# results are for the vanilla "256B" implementation, while
+# assembler results are for "528B";-)
+# (**) second number is the result for code compiled with the -fPIC
+# flag, which is actually more relevant, because the assembler
+# code is position-independent;
+# (***) see comment in the non-MMX routine for further details;
+#
+# To summarize, it's >2-5 times faster than gcc-generated code. To
+# anchor it to something else: SHA1 assembler processes one byte in
+# ~7 cycles on contemporary x86 cores. As for the choice of MMX/SSE
+# in particular, see the comment at the end of the file...
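+#
+# For orientation, the per-nibble recurrence that every "4-bit" code
+# path below implements is, in rough scalar pseudo-code (gcm128.c
+# carries the reference C version; Hlo/Hhi denote the 64-bit halves
+# of the Htable entries, processed from the last nibble of Xi down):
+#
+#	rem  = Zlo & 0xf;
+#	Zlo  = (Zhi<<60) | (Zlo>>4);
+#	Zhi  = Zhi>>4;
+#	Zhi ^= rem_4bit[rem];
+#	Zlo ^= Hlo[nibble]; Zhi ^= Hhi[nibble];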
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
+# The question is how close is it to theoretical limit? The pclmulqdq
+# instruction latency appears to be 14 cycles and there can't be more
+# than 2 of them executing at any given time. This means that single
+# Karatsuba multiplication would take 28 cycles *plus* few cycles for
+# pre- and post-processing. Then multiplication has to be followed by
+# modulo-reduction. Given that aggregated reduction method [see
+# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
+# white paper by Intel] allows you to perform reduction only once in
+# a while we can assume that asymptotic performance can be estimated
+# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
+# and Naggr is the aggregation factor.
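+#
+# For example, with the Tmod~13 and Naggr=2 estimated below for this
+# implementation, the expression evaluates to (28+13/2)/16 = 34.5/16,
+# i.e. ~2.16 cycles per processed byte.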
+#
+# Before we proceed to this implementation, let's have a closer look
+# at the best-performing code suggested by Intel in their white paper.
+# By tracing inter-register dependencies, Tmod is estimated as ~19
+# cycles and the Naggr chosen by Intel is 4, resulting in 2.05 cycles
+# per processed byte. As implied, this is a quite optimistic estimate,
+# because it does not account for Karatsuba pre- and post-processing,
+# which for a single multiplication is ~5 cycles. Unfortunately Intel
+# does not provide performance data for GHASH alone. But benchmarking
+# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
+# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note
+# that the result accounts even for pre-computing of the degrees of
+# the hash key H, but its portion is negligible at 16KB buffer size.
+#
+# Moving on to the implementation in question. Tmod is estimated as
+# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
+# 2.16. How is it possible that measured performance is better than
+# the optimistic theoretical estimate? There is one thing Intel failed
+# to recognize. By serializing GHASH with CTR in the same subroutine,
+# the former's performance is really limited by the above
+# (Tmul + Tmod/Naggr) equation. But if the GHASH procedure is
+# detached, the modulo-reduction can be interleaved with Naggr-1
+# multiplications at instruction level and under ideal conditions even
+# disappear from the equation. So the optimistic theoretical estimate
+# for this implementation is ... 28/16=1.75, and not 2.16. Well, that
+# is probably way too optimistic, at least for such a small Naggr. I'd
+# argue that (28+Tproc/Naggr)/16, where Tproc is the time required for
+# Karatsuba pre- and post-processing, is a more realistic estimate. In
+# this case it gives ... 1.91 cycles. Or in other words, depending on
+# how well we can interleave the reduction and one of the two
+# multiplications, the performance should be between 1.91 and 2.16. As
+# already mentioned, this implementation processes one byte out of an
+# 8KB buffer in 2.10 cycles, while the x86_64 counterpart - in 2.02.
+# x86_64 performance is better, because the larger register bank
+# allows reduction and multiplication to be interleaved better.
+#
+# Does it make sense to increase Naggr? To start with, it's virtually
+# impossible in 32-bit mode, because of the limited register bank
+# capacity. Otherwise the improvement has to be weighed against slower
+# setup, as well as code size and complexity increase. As even an
+# optimistic estimate doesn't promise a 30% performance improvement,
+# there are currently no plans to increase Naggr.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+# January 2010
+#
+# Tweaked to optimize transitions between integer and FP operations
+# on the same XMM register, the PCLMULQDQ subroutine was measured to
+# process one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on
+# Westmere. The minor regression on Westmere is outweighed by the ~15%
+# improvement on Sandy Bridge. Strangely enough, an attempt to modify
+# the 64-bit code in a similar manner resulted in an almost 20%
+# degradation on Sandy Bridge, where the original 64-bit code
+# processes one byte in 1.95 cycles.
+
+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
+# February 2013
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9. Resulting performance is 1.96 cycles per byte on
+# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
+$inp = "edi";
+$Htbl = "esi";
+
+$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
+ # than unrolled, which has to be weighed against
+ # 2.5x x86-specific code size reduction.
+
+sub x86_loop {
+ my $off = shift;
+ my $rem = "eax";
+
+ &mov ($Zhh,&DWP(4,$Htbl,$Zll));
+ &mov ($Zhl,&DWP(0,$Htbl,$Zll));
+ &mov ($Zlh,&DWP(12,$Htbl,$Zll));
+ &mov ($Zll,&DWP(8,$Htbl,$Zll));
+ &xor ($rem,$rem); # avoid partial register stalls on PIII
+
+ # shrd practically kills P4, 2.5x deterioration, but P4 has
+ # MMX code-path to execute. shrd runs a tad faster [than twice
+ # the shifts, moves and ors] on pre-MMX Pentium (as well as
+ # PIII and Core2), *but* it minimizes code size, spares a
+ # register and thus allows the loop to be folded...
+ if (!$unroll) {
+ my $cnt = $inp;
+ &mov ($cnt,15);
+ &jmp (&label("x86_loop"));
+ &set_label("x86_loop",16);
+ for($i=1;$i<=2;$i++) {
+ &mov (&LB($rem),&LB($Zll));
+ &shrd ($Zll,$Zlh,4);
+ &and (&LB($rem),0xf);
+ &shrd ($Zlh,$Zhl,4);
+ &shrd ($Zhl,$Zhh,4);
+ &shr ($Zhh,4);
+ &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
+
+ &mov (&LB($rem),&BP($off,"esp",$cnt));
+ if ($i&1) {
+ &and (&LB($rem),0xf0);
+ } else {
+ &shl (&LB($rem),4);
+ }
+
+ &xor ($Zll,&DWP(8,$Htbl,$rem));
+ &xor ($Zlh,&DWP(12,$Htbl,$rem));
+ &xor ($Zhl,&DWP(0,$Htbl,$rem));
+ &xor ($Zhh,&DWP(4,$Htbl,$rem));
+
+ if ($i&1) {
+ &dec ($cnt);
+ &js (&label("x86_break"));
+ } else {
+ &jmp (&label("x86_loop"));
+ }
+ }
+ &set_label("x86_break",16);
+ } else {
+ for($i=1;$i<32;$i++) {
+ &comment($i);
+ &mov (&LB($rem),&LB($Zll));
+ &shrd ($Zll,$Zlh,4);
+ &and (&LB($rem),0xf);
+ &shrd ($Zlh,$Zhl,4);
+ &shrd ($Zhl,$Zhh,4);
+ &shr ($Zhh,4);
+ &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
+
+ if ($i&1) {
+ &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
+ &and (&LB($rem),0xf0);
+ } else {
+ &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
+ &shl (&LB($rem),4);
+ }
+
+ &xor ($Zll,&DWP(8,$Htbl,$rem));
+ &xor ($Zlh,&DWP(12,$Htbl,$rem));
+ &xor ($Zhl,&DWP(0,$Htbl,$rem));
+ &xor ($Zhh,&DWP(4,$Htbl,$rem));
+ }
+ }
+ &bswap ($Zll);
+ &bswap ($Zlh);
+ &bswap ($Zhl);
+ if (!$x86only) {
+ &bswap ($Zhh);
+ } else {
+ &mov ("eax",$Zhh);
+ &bswap ("eax");
+ &mov ($Zhh,"eax");
+ }
+}
+
+if ($unroll) {
+ &function_begin_B("_x86_gmult_4bit_inner");
+ &x86_loop(4);
+ &ret ();
+ &function_end_B("_x86_gmult_4bit_inner");
+}
+
+sub deposit_rem_4bit {
+ my $bias = shift;
+
+ &mov (&DWP($bias+0, "esp"),0x0000<<16);
+ &mov (&DWP($bias+4, "esp"),0x1C20<<16);
+ &mov (&DWP($bias+8, "esp"),0x3840<<16);
+ &mov (&DWP($bias+12,"esp"),0x2460<<16);
+ &mov (&DWP($bias+16,"esp"),0x7080<<16);
+ &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
+ &mov (&DWP($bias+24,"esp"),0x48C0<<16);
+ &mov (&DWP($bias+28,"esp"),0x54E0<<16);
+ &mov (&DWP($bias+32,"esp"),0xE100<<16);
+ &mov (&DWP($bias+36,"esp"),0xFD20<<16);
+ &mov (&DWP($bias+40,"esp"),0xD940<<16);
+ &mov (&DWP($bias+44,"esp"),0xC560<<16);
+ &mov (&DWP($bias+48,"esp"),0x9180<<16);
+ &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
+ &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
+ &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
+}
+
+$suffix = $x86only ? "" : "_x86";
+
+&function_begin("gcm_gmult_4bit".$suffix);
+ &stack_push(16+4+1); # +1 for stack alignment
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+
+ &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
+ &mov ($Zhl,&DWP(4,$inp));
+ &mov ($Zlh,&DWP(8,$inp));
+ &mov ($Zll,&DWP(12,$inp));
+
+ &deposit_rem_4bit(16);
+
+ &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
+ &mov (&DWP(4,"esp"),$Zhl);
+ &mov (&DWP(8,"esp"),$Zlh);
+ &mov (&DWP(12,"esp"),$Zll);
+ &shr ($Zll,20);
+ &and ($Zll,0xf0);
+
+ if ($unroll) {
+ &call ("_x86_gmult_4bit_inner");
+ } else {
+ &x86_loop(0);
+ &mov ($inp,&wparam(0));
+ }
+
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(0,$inp),$Zhh);
+ &stack_pop(16+4+1);
+&function_end("gcm_gmult_4bit".$suffix);
+
+&function_begin("gcm_ghash_4bit".$suffix);
+ &stack_push(16+4+1); # +1 for 64-bit alignment
+ &mov ($Zll,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+ &mov ($inp,&wparam(2)); # load in
+ &mov ("ecx",&wparam(3)); # load len
+ &add ("ecx",$inp);
+ &mov (&wparam(3),"ecx");
+
+ &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
+ &mov ($Zhl,&DWP(4,$Zll));
+ &mov ($Zlh,&DWP(8,$Zll));
+ &mov ($Zll,&DWP(12,$Zll));
+
+ &deposit_rem_4bit(16);
+
+ &set_label("x86_outer_loop",16);
+ &xor ($Zll,&DWP(12,$inp)); # xor with input
+ &xor ($Zlh,&DWP(8,$inp));
+ &xor ($Zhl,&DWP(4,$inp));
+ &xor ($Zhh,&DWP(0,$inp));
+ &mov (&DWP(12,"esp"),$Zll); # dump it on stack
+ &mov (&DWP(8,"esp"),$Zlh);
+ &mov (&DWP(4,"esp"),$Zhl);
+ &mov (&DWP(0,"esp"),$Zhh);
+
+ &shr ($Zll,20);
+ &and ($Zll,0xf0);
+
+ if ($unroll) {
+ &call ("_x86_gmult_4bit_inner");
+ } else {
+ &x86_loop(0);
+ &mov ($inp,&wparam(2));
+ }
+ &lea ($inp,&DWP(16,$inp));
+ &cmp ($inp,&wparam(3));
+ &mov (&wparam(2),$inp) if (!$unroll);
+ &jb (&label("x86_outer_loop"));
+
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(0,$inp),$Zhh);
+ &stack_pop(16+4+1);
+&function_end("gcm_ghash_4bit".$suffix);
+
+if (!$x86only) {{{
+
+&static_label("rem_4bit");
+
+if (!$sse2) {{ # pure-MMX "May" version...
+
+$S=12; # shift factor for rem_4bit
+
+&function_begin_B("_mmx_gmult_4bit_inner");
+# The MMX version performs 3.5 times better on P4 (see comment in the
+# non-MMX routine for further details), 100% better on Opteron, ~70%
+# better on Core2 and PIII... In other words the effort is considered
+# well spent... Since the initial release the loop was unrolled in
+# order to "liberate" the register previously used as loop counter.
+# Instead it's used to optimize the critical path in
+# 'Z.hi ^= rem_4bit[Z.lo&0xf]'. The path involves a move of Z.lo from
+# MMX to an integer register, effective address calculation and
+# finally a merge of the value into Z.hi. The reference to rem_4bit is
+# scheduled so late that I had to >>4 the rem_4bit elements. This
+# resulted in a 20-45% improvement on contemporary µ-archs.
+{
+ my $cnt;
+ my $rem_4bit = "eax";
+ my @rem = ($Zhh,$Zll);
+ my $nhi = $Zhl;
+ my $nlo = $Zlh;
+
+ my ($Zlo,$Zhi) = ("mm0","mm1");
+ my $tmp = "mm2";
+
+ &xor ($nlo,$nlo); # avoid partial register stalls on PIII
+ &mov ($nhi,$Zll);
+ &mov (&LB($nlo),&LB($nhi));
+ &shl (&LB($nlo),4);
+ &and ($nhi,0xf0);
+ &movq ($Zlo,&QWP(8,$Htbl,$nlo));
+ &movq ($Zhi,&QWP(0,$Htbl,$nlo));
+ &movd ($rem[0],$Zlo);
+
+ for ($cnt=28;$cnt>=-2;$cnt--) {
+ my $odd = $cnt&1;
+ my $nix = $odd ? $nlo : $nhi;
+
+ &shl (&LB($nlo),4) if ($odd);
+ &psrlq ($Zlo,4);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nix));
+ &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
+ &psllq ($tmp,60);
+ &and ($nhi,0xf0) if ($odd);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
+ &and ($rem[0],0xf);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nix));
+ &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
+ &movd ($rem[1],$Zlo);
+ &pxor ($Zlo,$tmp);
+
+ push (@rem,shift(@rem)); # "rotate" registers
+ }
+
+ &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
+
+ &psrlq ($Zlo,32); # lower part of Zlo is already there
+ &movd ($Zhl,$Zhi);
+ &psrlq ($Zhi,32);
+ &movd ($Zlh,$Zlo);
+ &movd ($Zhh,$Zhi);
+ &shl ($inp,4); # compensate for rem_4bit[i] being >>4
+
+ &bswap ($Zll);
+ &bswap ($Zhl);
+ &bswap ($Zlh);
+ &xor ($Zhh,$inp);
+ &bswap ($Zhh);
+
+ &ret ();
+}
+&function_end_B("_mmx_gmult_4bit_inner");
+
+&function_begin("gcm_gmult_4bit_mmx");
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("eax");
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+ &movz ($Zll,&BP(15,$inp));
+
+ &call ("_mmx_gmult_4bit_inner");
+
+ &mov ($inp,&wparam(0)); # load Xi
+ &emms ();
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+# Streamed version performs 20% better on P4, 7% on Opteron,
+# 10% on Core2 and PIII...
+&function_begin("gcm_ghash_4bit_mmx");
+ &mov ($Zhh,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+ &mov ($inp,&wparam(2)); # load in
+ &mov ($Zlh,&wparam(3)); # load len
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("eax");
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+ &add ($Zlh,$inp);
+ &mov (&wparam(3),$Zlh); # len to point at the end of input
+ &stack_push(4+1); # +1 for stack alignment
+
+ &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
+ &mov ($Zhl,&DWP(4,$Zhh));
+ &mov ($Zlh,&DWP(8,$Zhh));
+ &mov ($Zhh,&DWP(0,$Zhh));
+ &jmp (&label("mmx_outer_loop"));
+
+ &set_label("mmx_outer_loop",16);
+ &xor ($Zll,&DWP(12,$inp));
+ &xor ($Zhl,&DWP(4,$inp));
+ &xor ($Zlh,&DWP(8,$inp));
+ &xor ($Zhh,&DWP(0,$inp));
+ &mov (&wparam(2),$inp);
+ &mov (&DWP(12,"esp"),$Zll);
+ &mov (&DWP(4,"esp"),$Zhl);
+ &mov (&DWP(8,"esp"),$Zlh);
+ &mov (&DWP(0,"esp"),$Zhh);
+
+ &mov ($inp,"esp");
+ &shr ($Zll,24);
+
+ &call ("_mmx_gmult_4bit_inner");
+
+ &mov ($inp,&wparam(2));
+ &lea ($inp,&DWP(16,$inp));
+ &cmp ($inp,&wparam(3));
+ &jb (&label("mmx_outer_loop"));
+
+ &mov ($inp,&wparam(0)); # load Xi
+ &emms ();
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(0,$inp),$Zhh);
+
+ &stack_pop(4+1);
+&function_end("gcm_ghash_4bit_mmx");
+
+}} else {{ # "June" MMX version...
+ # ... which has the slower "April" gcm_gmult_4bit_mmx
+ # with a folded loop. This is done to conserve code size...
+$S=16; # shift factor for rem_4bit
+
+sub mmx_loop() {
+# The MMX version performs 2.8 times better on P4 (see comment in the
+# non-MMX routine for further details), 40% better on Opteron and
+# Core2, 50% better on PIII... In other words the effort is considered
+# well spent...
+ my $inp = shift;
+ my $rem_4bit = shift;
+ my $cnt = $Zhh;
+ my $nhi = $Zhl;
+ my $nlo = $Zlh;
+ my $rem = $Zll;
+
+ my ($Zlo,$Zhi) = ("mm0","mm1");
+ my $tmp = "mm2";
+
+ &xor ($nlo,$nlo); # avoid partial register stalls on PIII
+ &mov ($nhi,$Zll);
+ &mov (&LB($nlo),&LB($nhi));
+ &mov ($cnt,14);
+ &shl (&LB($nlo),4);
+ &and ($nhi,0xf0);
+ &movq ($Zlo,&QWP(8,$Htbl,$nlo));
+ &movq ($Zhi,&QWP(0,$Htbl,$nlo));
+ &movd ($rem,$Zlo);
+ &jmp (&label("mmx_loop"));
+
+ &set_label("mmx_loop",16);
+ &psrlq ($Zlo,4);
+ &and ($rem,0xf);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
+ &mov (&LB($nlo),&BP(0,$inp,$cnt));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &dec ($cnt);
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
+ &mov ($nhi,$nlo);
+ &pxor ($Zlo,$tmp);
+ &js (&label("mmx_break"));
+
+ &shl (&LB($nlo),4);
+ &and ($rem,0xf);
+ &psrlq ($Zlo,4);
+ &and ($nhi,0xf0);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
+ &pxor ($Zlo,$tmp);
+ &jmp (&label("mmx_loop"));
+
+ &set_label("mmx_break",16);
+ &shl (&LB($nlo),4);
+ &and ($rem,0xf);
+ &psrlq ($Zlo,4);
+ &and ($nhi,0xf0);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
+ &pxor ($Zlo,$tmp);
+
+ &psrlq ($Zlo,4);
+ &and ($rem,0xf);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
+ &pxor ($Zlo,$tmp);
+
+ &psrlq ($Zlo,32); # lower part of Zlo is already there
+ &movd ($Zhl,$Zhi);
+ &psrlq ($Zhi,32);
+ &movd ($Zlh,$Zlo);
+ &movd ($Zhh,$Zhi);
+
+ &bswap ($Zll);
+ &bswap ($Zhl);
+ &bswap ($Zlh);
+ &bswap ($Zhh);
+}
+
+&function_begin("gcm_gmult_4bit_mmx");
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("eax");
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+ &movz ($Zll,&BP(15,$inp));
+
+ &mmx_loop($inp,"eax");
+
+ &emms ();
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+######################################################################
+# The subroutine below is the "528B" variant of the "4-bit" GCM GHASH
+# function (see gcm128.c for details). It provides a further 20-40%
+# performance improvement over the above-mentioned "May" version.
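+# ("528" counts the per-key image built on the stack below: 256 bytes
+# of decomposed Htable, 256 bytes of Htable[]>>4 and 16 bytes of
+# (u8)(Htable[]<<4).)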
+
+&static_label("rem_8bit");
+
+&function_begin("gcm_ghash_4bit_mmx");
+{ my ($Zlo,$Zhi) = ("mm7","mm6");
+ my $rem_8bit = "esi";
+ my $Htbl = "ebx";
+
+ # parameter block
+ &mov ("eax",&wparam(0)); # Xi
+ &mov ("ebx",&wparam(1)); # Htable
+ &mov ("ecx",&wparam(2)); # inp
+ &mov ("edx",&wparam(3)); # len
+ &mov ("ebp","esp"); # original %esp
+ &call (&label("pic_point"));
+ &set_label ("pic_point");
+ &blindpop ($rem_8bit);
+ &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
+
+ &sub ("esp",512+16+16); # allocate stack frame...
+ &and ("esp",-64); # ...and align it
+ &sub ("esp",16); # place for (u8)(H[]<<4)
+
+ &add ("edx","ecx"); # pointer to the end of input
+ &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
+ &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
+ &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
+
+ { my @lo = ("mm0","mm1","mm2");
+ my @hi = ("mm3","mm4","mm5");
+ my @tmp = ("mm6","mm7");
+ my ($off1,$off2,$i) = (0,0,);
+
+ &add ($Htbl,128); # optimize for size
+ &lea ("edi",&DWP(16+128,"esp"));
+ &lea ("ebp",&DWP(16+256+128,"esp"));
+
+ # decompose Htable (low and high parts are kept separately),
+ # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
+ for ($i=0;$i<18;$i++) {
+
+ &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
+ &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
+ &psllq ($tmp[1],60) if ($i>1);
+ &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
+ &por ($lo[2],$tmp[1]) if ($i>1);
+ &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
+ &psrlq ($lo[1],4) if ($i>0 && $i<17);
+ &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
+ &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
+ &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
+ &psrlq ($hi[1],4) if ($i>0 && $i<17);
+ &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
+ &shl ("edx",4) if ($i<16);
+ &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
+
+ unshift (@lo,pop(@lo)); # "rotate" registers
+ unshift (@hi,pop(@hi));
+ unshift (@tmp,pop(@tmp));
+ $off1 += 8 if ($i>0);
+ $off2 += 8 if ($i>1);
+ }
+ }
+
+ &movq ($Zhi,&QWP(0,"eax"));
+ &mov ("ebx",&DWP(8,"eax"));
+ &mov ("edx",&DWP(12,"eax")); # load Xi
+
+&set_label("outer",16);
+ { my $nlo = "eax";
+ my $dat = "edx";
+ my @nhi = ("edi","ebp");
+ my @rem = ("ebx","ecx");
+ my @red = ("mm0","mm1","mm2");
+ my $tmp = "mm3";
+
+ &xor ($dat,&DWP(12,"ecx")); # merge input data
+ &xor ("ebx",&DWP(8,"ecx"));
+ &pxor ($Zhi,&QWP(0,"ecx"));
+ &lea ("ecx",&DWP(16,"ecx")); # inp+=16
+ #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
+ &mov (&DWP(528+8,"esp"),"ebx");
+ &movq (&QWP(528+0,"esp"),$Zhi);
+ &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
+
+ &xor ($nlo,$nlo);
+ &rol ($dat,8);
+ &mov (&LB($nlo),&LB($dat));
+ &mov ($nhi[1],$nlo);
+ &and (&LB($nlo),0x0f);
+ &shr ($nhi[1],4);
+ &pxor ($red[0],$red[0]);
+ &rol ($dat,8); # next byte
+ &pxor ($red[1],$red[1]);
+ &pxor ($red[2],$red[2]);
+
+ # Just like in the "May" version, modulo-schedule for the critical
+ # path in 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. The
+ # final 'pxor' is scheduled so late that rem_8bit[] has to be
+ # shifted *right* by 16, which is why the last argument to pinsrw
+ # is 2, which corresponds to <<32=<<48>>16...
+ for ($j=11,$i=0;$i<15;$i++) {
+
+ if ($i>0) {
+ &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
+ &rol ($dat,8); # next byte
+ &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
+
+ &pxor ($Zlo,$tmp);
+ &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+ &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
+ } else {
+ &movq ($Zlo,&QWP(16,"esp",$nlo,8));
+ &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
+ }
+
+ &mov (&LB($nlo),&LB($dat));
+ &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
+
+ &movd ($rem[0],$Zlo);
+ &movz ($rem[1],&LB($rem[1])) if ($i>0);
+ &psrlq ($Zlo,8); # Z>>=8
+
+ &movq ($tmp,$Zhi);
+ &mov ($nhi[0],$nlo);
+ &psrlq ($Zhi,8);
+
+ &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
+ &and (&LB($nlo),0x0f);
+ &psllq ($tmp,56);
+
+ &pxor ($Zhi,$red[1]) if ($i>1);
+ &shr ($nhi[0],4);
+ &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
+
+ unshift (@red,pop(@red)); # "rotate" registers
+ unshift (@rem,pop(@rem));
+ unshift (@nhi,pop(@nhi));
+ }
+
+ &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
+ &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
+ &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
+
+ &pxor ($Zlo,$tmp);
+ &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+ &movz ($rem[1],&LB($rem[1]));
+
+ &pxor ($red[2],$red[2]); # clear 2nd word
+ &psllq ($red[1],4);
+
+ &movd ($rem[0],$Zlo);
+ &psrlq ($Zlo,4); # Z>>=4
+
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &shl ($rem[0],4); # rem<<4
+
+ &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
+ &psllq ($tmp,60);
+ &movz ($rem[0],&LB($rem[0]));
+
+ &pxor ($Zlo,$tmp);
+ &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
+
+ &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
+ &pxor ($Zhi,$red[1]);
+
+ &movd ($dat,$Zlo);
+ &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
+
+ &psllq ($red[0],12); # correct by <<16>>4
+ &pxor ($Zhi,$red[0]);
+ &psrlq ($Zlo,32);
+ &pxor ($Zhi,$red[2]);
+
+ &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
+ &movd ("ebx",$Zlo);
+ &movq ($tmp,$Zhi); # 01234567
+ &psllw ($Zhi,8); # 1.3.5.7.
+ &psrlw ($tmp,8); # .0.2.4.6
+ &por ($Zhi,$tmp); # 10325476
+ &bswap ($dat);
+ &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
+ &bswap ("ebx");
+
+ &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
+ &jne (&label("outer"));
+ }
+
+ &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
+ &mov (&DWP(12,"eax"),"edx");
+ &mov (&DWP(8,"eax"),"ebx");
+ &movq (&QWP(0,"eax"),$Zhi);
+
+ &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
+ &emms ();
+}
+&function_end("gcm_ghash_4bit_mmx");
+}}
+
+if ($sse2) {{
+######################################################################
+# PCLMULQDQ version.
+
+$Xip="eax";
+$Htbl="edx";
+$const="ecx";
+$inp="esi";
+$len="ebx";
+
+($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
+($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
+($Xn,$Xhn)=("xmm6","xmm7");
+
+&static_label("bswap");
+
+sub clmul64x64_T2 { # minimal "register" pressure
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+ &movdqa ($Xhi,$Xi); #
+ &pshufd ($T1,$Xi,0b01001110);
+ &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
+ &pxor ($T1,$Xi); #
+ &pxor ($T2,$Hkey) if (!defined($HK));
+ $HK=$T2 if (!defined($HK));
+
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T1,$HK,0x00); #######
+ &xorps ($T1,$Xi); #
+ &xorps ($T1,$Xhi); #
+
+ &movdqa ($T2,$T1); #
+ &psrldq ($T1,8);
+ &pslldq ($T2,8); #
+ &pxor ($Xhi,$T1);
+ &pxor ($Xi,$T2); #
+}
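+# (The three pclmulqdq above compute Xi.lo·Hkey.lo, Xi.hi·Hkey.hi and
+# (Xi.lo^Xi.hi)·(Hkey.lo^Hkey.hi); the two xorps reduce the latter to
+# the Karatsuba middle term, which the psrldq/pslldq pair then splits
+# between the two 128-bit output halves.)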
+
+sub clmul64x64_T3 {
+# Even though this subroutine offers visually better ILP, it
+# was empirically found to be a tad slower than the version above,
+# at least in the gcm_ghash_clmul context. But it's just as well,
+# because loop modulo-scheduling is possible only thanks to
+# minimized "register" pressure...
+my ($Xhi,$Xi,$Hkey)=@_;
+
+ &movdqa ($T1,$Xi); #
+ &movdqa ($Xhi,$Xi);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pshufd ($T2,$T1,0b01001110); #
+ &pshufd ($T3,$Hkey,0b01001110);
+ &pxor ($T2,$T1); #
+ &pxor ($T3,$Hkey);
+ &pclmulqdq ($T2,$T3,0x00); #######
+ &pxor ($T2,$Xi); #
+ &pxor ($T2,$Xhi); #
+
+ &movdqa ($T3,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T3,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T3); #
+}
+
+if (1) { # Algorithm 9 with <<1 twist.
+ # Reduction is shorter and uses only two
+ # temporary registers, which makes it a better
+ # candidate for interleaving with the 64x64
+ # multiplication. The pre-modulo-scheduled loop
+ # was found to be ~20% faster than Algorithm 5
+ # below. Algorithm 9 was therefore chosen for
+ # further optimization...
+
+sub reduction_alg9 { # 17/11 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+ # 1st phase
+ &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
+ &psllq ($Xi,1);
+ &pxor ($Xi,$T1); #
+ &psllq ($Xi,57); #
+ &movdqa ($T1,$Xi); #
+ &pslldq ($Xi,8);
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
+
+ # 2nd phase
+ &movdqa ($T2,$Xi);
+ &psrlq ($Xi,1);
+ &pxor ($Xhi,$T2); #
+ &pxor ($T2,$Xi);
+ &psrlq ($Xi,5);
+ &pxor ($Xi,$T2); #
+ &psrlq ($Xi,1); #
+ &pxor ($Xi,$Xhi) #
+}
+
+&function_begin_B("gcm_init_clmul");
+ &mov ($Htbl,&wparam(0));
+ &mov ($Xip,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Hkey,&QWP(0,$Xip));
+ &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
+
+ # <<1 twist
+ &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
+ &movdqa ($T1,$Hkey);
+ &psllq ($Hkey,1);
+ &pxor ($T3,$T3); #
+ &psrlq ($T1,63);
+ &pcmpgtd ($T3,$T2); # broadcast carry bit
+ &pslldq ($T1,8);
+ &por ($Hkey,$T1); # H<<=1
+
+ # magic reduction
+ &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
+ &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
+
+ # calculate H^2
+ &movdqa ($Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &reduction_alg9 ($Xhi,$Xi);
+
+ &pshufd ($T1,$Hkey,0b01001110);
+ &pshufd ($T2,$Xi,0b01001110);
+ &pxor ($T1,$Hkey); # Karatsuba pre-processing
+ &movdqu (&QWP(0,$Htbl),$Hkey); # save H
+ &pxor ($T2,$Xi); # Karatsuba pre-processing
+ &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
+ &palignr ($T2,$T1,8); # low part is H.lo^H.hi
+ &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
+
+ &ret ();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($T3,&QWP(0,$const));
+ &movups ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$T3);
+ &movups ($T2,&QWP(32,$Htbl));
+
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
+ &reduction_alg9 ($Xhi,$Xi);
+
+ &pshufb ($Xi,$T3);
+ &movdqu (&QWP(0,$Xip),$Xi);
+
+ &ret ();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+ &mov ($inp,&wparam(2));
+ &mov ($len,&wparam(3));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($T3,&QWP(0,$const));
+ &movdqu ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$T3);
+
+ &sub ($len,0x10);
+ &jz (&label("odd_tail"));
+
+ #######
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+ #
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pshufb ($T1,$T3);
+ &pshufb ($Xn,$T3);
+ &movdqu ($T3,&QWP(32,$Htbl));
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &pshufd ($T1,$Xn,0b01001110); # H*Ii+1
+ &movdqa ($Xhn,$Xn);
+ &pxor ($T1,$Xn); #
+ &lea ($inp,&DWP(32,$inp)); # i+=2
+
+ &pclmulqdq ($Xn,$Hkey,0x00); #######
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
+ &pclmulqdq ($T1,$T3,0x00); #######
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &nop ();
+
+ &sub ($len,0x20);
+ &jbe (&label("even_tail"));
+ &jmp (&label("mod_loop"));
+
+&set_label("mod_loop",32);
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
+ &nop ();
+
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
+
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &movdqa ($T3,&QWP(0,$const));
+ &xorps ($Xhi,$Xhn);
+ &movdqu ($Xhn,&QWP(0,$inp)); # Ii
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pxor ($T1,$Xhi); #
+
+ &pshufb ($Xhn,$T3);
+ &pxor ($T2,$T1); #
+
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
+ &pshufb ($Xn,$T3);
+ &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
+
+ &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
+ &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
+ &psllq ($Xi,1);
+ &pxor ($Xi,$T1); #
+ &pclmulqdq ($Xn,$Hkey,0x00); #######
+ &movups ($T3,&QWP(32,$Htbl));
+ &psllq ($Xi,57); #
+ &movdqa ($T1,$Xi); #
+ &pslldq ($Xi,8);
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
+ &pshufd ($T1,$Xhn,0b01001110);
+ &movdqa ($T2,$Xi); # 2nd phase
+ &psrlq ($Xi,1);
+ &pxor ($T1,$Xhn);
+ &pxor ($Xhi,$T2); #
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &pxor ($T2,$Xi);
+ &psrlq ($Xi,5);
+ &pxor ($Xi,$T2); #
+ &psrlq ($Xi,1); #
+ &pxor ($Xi,$Xhi); #
+ &pclmulqdq ($T1,$T3,0x00); #######
+
+ &lea ($inp,&DWP(32,$inp));
+ &sub ($len,0x20);
+ &ja (&label("mod_loop"));
+
+&set_label("even_tail");
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
+
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movdqa ($T3,&QWP(0,$const));
+
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &xorps ($Xhi,$Xhn);
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &pxor ($T1,$Xhi); #
+
+ &pxor ($T2,$T1); #
+
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
+
+ &reduction_alg9 ($Xhi,$Xi);
+
+ &test ($len,$len);
+ &jnz (&label("done"));
+
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
+&set_label("odd_tail");
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &pshufb ($T1,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &reduction_alg9 ($Xhi,$Xi);
+
+&set_label("done");
+ &pshufb ($Xi,$T3);
+ &movdqu (&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+} else { # Algorithm 5. Kept for reference purposes.
+
+sub reduction_alg5 { # 19/16 times faster than Intel version
+my ($Xhi,$Xi)=@_;
+
+ # <<1
+ &movdqa ($T1,$Xi); #
+ &movdqa ($T2,$Xhi);
+ &pslld ($Xi,1);
+ &pslld ($Xhi,1); #
+ &psrld ($T1,31);
+ &psrld ($T2,31); #
+ &movdqa ($T3,$T1);
+ &pslldq ($T1,4);
+ &psrldq ($T3,12); #
+ &pslldq ($T2,4);
+ &por ($Xhi,$T3); #
+ &por ($Xi,$T1);
+ &por ($Xhi,$T2); #
+
+ # 1st phase
+ &movdqa ($T1,$Xi);
+ &movdqa ($T2,$Xi);
+ &movdqa ($T3,$Xi); #
+ &pslld ($T1,31);
+ &pslld ($T2,30);
+ &pslld ($Xi,25); #
+ &pxor ($T1,$T2);
+ &pxor ($T1,$Xi); #
+ &movdqa ($T2,$T1); #
+ &pslldq ($T1,12);
+ &psrldq ($T2,4); #
+ &pxor ($T3,$T1);
+
+ # 2nd phase
+ &pxor ($Xhi,$T3); #
+ &movdqa ($Xi,$T3);
+ &movdqa ($T1,$T3);
+ &psrld ($Xi,1); #
+ &psrld ($T1,2);
+ &psrld ($T3,7); #
+ &pxor ($Xi,$T1);
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T3); #
+ &pxor ($Xi,$Xhi); #
+}
+
+&function_begin_B("gcm_init_clmul");
+ &mov ($Htbl,&wparam(0));
+ &mov ($Xip,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Hkey,&QWP(0,$Xip));
+ &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
+
+ # calculate H^2
+ &movdqa ($Xi,$Hkey);
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &movdqu (&QWP(0,$Htbl),$Hkey); # save H
+ &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
+
+ &ret ();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($Xn,&QWP(0,$const));
+ &movdqu ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$Xn);
+
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &pshufb ($Xi,$Xn);
+ &movdqu (&QWP(0,$Xip),$Xi);
+
+ &ret ();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+ &mov ($inp,&wparam(2));
+ &mov ($len,&wparam(3));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($T3,&QWP(0,$const));
+ &movdqu ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$T3);
+
+ &sub ($len,0x10);
+ &jz (&label("odd_tail"));
+
+ #######
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+ #
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pshufb ($T1,$T3);
+ &pshufb ($Xn,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+ &sub ($len,0x20);
+ &lea ($inp,&DWP(32,$inp)); # i+=2
+ &jbe (&label("even_tail"));
+
+&set_label("mod_loop");
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+ &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
+
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &pxor ($Xhi,$Xhn);
+
+ &reduction_alg5 ($Xhi,$Xi);
+
+ #######
+ &movdqa ($T3,&QWP(0,$const));
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pshufb ($T1,$T3);
+ &pshufb ($Xn,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+ &sub ($len,0x20);
+ &lea ($inp,&DWP(32,$inp));
+ &ja (&label("mod_loop"));
+
+&set_label("even_tail");
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &pxor ($Xhi,$Xhn);
+
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &movdqa ($T3,&QWP(0,$const));
+ &test ($len,$len);
+ &jnz (&label("done"));
+
+ &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
+&set_label("odd_tail");
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &pshufb ($T1,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &movdqa ($T3,&QWP(0,$const));
+&set_label("done");
+ &pshufb ($Xi,$T3);
+ &movdqu (&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+}
+
+&set_label("bswap",64);
+ &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+ &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
+&set_label("rem_8bit",64);
+ &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
+ &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
+ &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
+ &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
+ &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
+ &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
+ &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
+ &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
+ &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
+ &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
+ &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
+ &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
+ &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
+ &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
+ &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
+ &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
+ &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
+ &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
+ &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
+ &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
+ &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
+ &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
+ &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
+ &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
+ &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
+ &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
+ &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
+ &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
+ &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
+ &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
+ &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
+ &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}} # $sse2
+
+&set_label("rem_4bit",64);
+ &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+ &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+ &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+ &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
+}}} # !$x86only
+
+&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
+&asm_finish();
+
+close STDOUT;
+
+# A question was raised about the choice of vanilla MMX. Or rather,
+# why wasn't SSE2 chosen instead? In addition to the fact that MMX
+# runs on legacy CPUs such as PIII, the "4-bit" MMX version was
+# observed to provide better performance than the *corresponding* SSE2
+# one even on contemporary CPUs. The SSE2 results were provided by
+# Peter-Michael Hager. He maintains an SSE2 implementation featuring a
+# full range of lookup-table sizes, but with per-invocation lookup
+# table setup. The latter means that the table size is chosen
+# depending on how much data is to be hashed in any given call: more
+# data - larger table. The best reported result for Core2 is ~4 cycles
+# per processed byte out of a 64KB block. This number accounts even
+# for the 64KB table setup overhead. As discussed in gcm128.c, we
+# choose to be more conservative with respect to lookup table sizes,
+# but how do the results compare? The minimalistic "256B" MMX version
+# delivers ~11 cycles on the same platform. As also discussed in
+# gcm128.c, the next-in-line "8-bit Shoup's" or "4KB" method should
+# deliver twice the performance of the "256B" one, in other words not
+# worse than ~6 cycles per byte. It should also be noted that in the
+# SSE2 case the improvement can be "super-linear," i.e. more than
+# twice, mostly because >>8 maps to a single instruction on an SSE2
+# register. This is unlike the "4-bit" case, where >>4 maps to the
+# same amount of instructions in both MMX and SSE2 cases. The bottom
+# line is that a switch to SSE2 is considered justifiable only in case
+# we choose to implement the "8-bit" method...
diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-x86_64.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-x86_64.pl
new file mode 100644
index 0000000..387e3f8
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/ghash-x86_64.pl
@@ -0,0 +1,1762 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [+128 bytes of shared
+# table]. The GHASH function features a so-called "528B" variant
+# utilizing an additional 256+16 bytes of per-key storage [+512 bytes
+# of shared table]. Performance results are for this streamed GHASH
+# subroutine and are expressed in cycles per processed byte, less is
+# better:
+#
+# gcc 3.4.x(*) assembler
+#
+# P4 28.6 14.0 +100%
+# Opteron 19.3 7.7 +150%
+# Core2 17.8 8.1(**) +120%
+# Atom 31.6 16.8 +88%
+# VIA Nano 21.8 10.1 +115%
+#
+# (*) comparison is not completely fair, because C results are
+# for the vanilla "256B" implementation, while assembler results
+# are for "528B";-)
+# (**) it's a mystery [to me] why the Core2 result is not the same
+# as for Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter: ghash-x86.pl discusses why it makes less sense to
+# increase the aggregate factor there. Then why increase it here? The
+# critical path consists of 3 independent pclmulqdq instructions,
+# Karatsuba post-processing and reduction. "On top" of this we lay
+# down aggregated multiplication operations, triplets of independent
+# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
+# less sense to aggregate more multiplications than it takes to
+# perform the remaining non-multiplication operations. 2x is a
+# near-optimal coefficient for contemporary Intel CPUs (hence the
+# modest improvement coefficients below), but not for Bulldozer. The
+# latter is because logical SIMD operations are twice as slow there in
+# comparison to Intel, so that the critical path is longer. A CPU with
+# a higher pclmulqdq issue rate would also benefit from a higher
+# aggregate factor...
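+#
+# With the 4x aggregate factor a single reduction serves four
+# multiplications; by the same expansion as shown for the 2x case in
+# ghash-x86.pl:
+#
+#	Xi+4 = [H^4·(Ii+Xi) + H^3·Ii+1 + H^2·Ii+2 + H·Ii+3] mod P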
+#
+# Westmere 1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge 1.80(+7%)
+# Haswell 0.55(+93%) (if system doesn't support AVX)
+# Broadwell 0.45(+110%)(if system doesn't support AVX)
+# Skylake 0.44(+110%)(if system doesn't support AVX)
+# Bulldozer 1.49(+27%)
+# Silvermont 2.88(+13%)
+# Goldmont 1.08(+24%)
+
+# March 2013
+#
+# ... the 8x aggregate factor AVX code path uses the reduction
+# algorithm suggested by Shay Gueron[1]. Even though contemporary
+# AVX-capable CPUs such as Sandy and Ivy Bridge can execute it, the
+# code performs sub-optimally on them in comparison to the above
+# mentioned version. But thanks to Ilya Albrekht and Max Locktyukhin
+# of Intel Corp. we knew that it performs in 0.41 cycles per byte on
+# a Haswell processor, in 0.29 on Broadwell, and in 0.36 on Skylake.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.20) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$do4xaggr=1;
+
+# common register layout
+$nlo="%rax";
+$nhi="%rbx";
+$Zlo="%r8";
+$Zhi="%r9";
+$tmp="%r10";
+$rem_4bit = "%r11";
+
+$Xi="%rdi";
+$Htbl="%rsi";
+
+# per-function register layout
+$cnt="%rcx";
+$rem="%rdx";
+
+sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
+ $r =~ s/%[er]([sd]i)/%\1l/ or
+ $r =~ s/%[er](bp)/%\1l/ or
+ $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
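+# (e.g. LB("%rax") yields "%al", LB("%rsi") yields "%sil" and
+# LB("%r9") yields "%r9b")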
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
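+# (e.g. &mov($Zlo,"8($Xi)"), i.e. &mov("%r8","8(%rdi)"), appends
+# "\tmov\t8(%rdi),%r8\n" to $code - note the AT&T-style operand
+# reversal - while a purely numeric argument gets a "\$" prefix.)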
+
+{ my $N;
+ sub loop() {
+ my $inp = shift;
+
+ $N++;
+$code.=<<___;
+ xor $nlo,$nlo
+ xor $nhi,$nhi
+ mov `&LB("$Zlo")`,`&LB("$nlo")`
+ mov `&LB("$Zlo")`,`&LB("$nhi")`
+ shl \$4,`&LB("$nlo")`
+ mov \$14,$cnt
+ mov 8($Htbl,$nlo),$Zlo
+ mov ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ mov $Zlo,$rem
+ jmp .Loop$N
+
+.align 16
+.Loop$N:
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ mov ($inp,$cnt),`&LB("$nlo")`
+ shr \$4,$Zhi
+ xor 8($Htbl,$nhi),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nhi),$Zhi
+ mov `&LB("$nlo")`,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ shl \$4,`&LB("$nlo")`
+ xor $tmp,$Zlo
+ dec $cnt
+ js .Lbreak$N
+
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nlo),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ xor $tmp,$Zlo
+ jmp .Loop$N
+
+.align 16
+.Lbreak$N:
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nlo),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ xor $tmp,$Zlo
+
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nhi),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nhi),$Zhi
+ xor $tmp,$Zlo
+ xor ($rem_4bit,$rem,8),$Zhi
+
+ bswap $Zlo
+ bswap $Zhi
+___
+}}
+
+$code=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+
+.globl gcm_gmult_4bit
+.type gcm_gmult_4bit,\@function,2
+.align 16
+gcm_gmult_4bit:
+ push %rbx
+ push %rbp # %rbp and %r12 are pushed exclusively in
+ push %r12 # order to reuse Win64 exception handler...
+.Lgmult_prologue:
+
+ movzb 15($Xi),$Zlo
+ lea .Lrem_4bit(%rip),$rem_4bit
+___
+ &loop ($Xi);
+$code.=<<___;
+ mov $Zlo,8($Xi)
+ mov $Zhi,($Xi)
+
+ mov 16(%rsp),%rbx
+ lea 24(%rsp),%rsp
+.Lgmult_epilogue:
+ ret
+.size gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+
+# per-function register layout
+$inp="%rdx";
+$len="%rcx";
+$rem_8bit=$rem_4bit;
+
+$code.=<<___;
+.globl gcm_ghash_4bit
+.type gcm_ghash_4bit,\@function,4
+.align 16
+gcm_ghash_4bit:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ sub \$280,%rsp
+.Lghash_prologue:
+ mov $inp,%r14 # reassign couple of args
+ mov $len,%r15
+___
+{ my $inp="%r14";
+ my $dat="%edx";
+ my $len="%r15";
+ my @nhi=("%ebx","%ecx");
+ my @rem=("%r12","%r13");
+ my $Hshr4="%rbp";
+
+ &sub ($Htbl,-128); # size optimization
+ &lea ($Hshr4,"16+128(%rsp)");
+ { my @lo =($nlo,$nhi);
+ my @hi =($Zlo,$Zhi);
+
+ &xor ($dat,$dat);
+ for ($i=0,$j=-2;$i<18;$i++,$j++) {
+ &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
+ &or ($lo[0],$tmp) if ($i>1);
+ &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
+ &shr ($lo[1],4) if ($i>0 && $i<17);
+ &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
+ &shr ($hi[1],4) if ($i>0 && $i<17);
+ &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
+ &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
+ &shl (&LB($dat),4) if ($i>0 && $i<17);
+ &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
+ &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
+ &shl ($tmp,60) if ($i>0 && $i<17);
+
+ push (@lo,shift(@lo));
+ push (@hi,shift(@hi));
+ }
+ }
+ &add ($Htbl,-128);
+ &mov ($Zlo,"8($Xi)");
+ &mov ($Zhi,"0($Xi)");
+ &add ($len,$inp); # pointer to the end of data
+ &lea ($rem_8bit,".Lrem_8bit(%rip)");
+ &jmp (".Louter_loop");
+
+$code.=".align 16\n.Louter_loop:\n";
+ &xor ($Zhi,"($inp)");
+ &mov ("%rdx","8($inp)");
+ &lea ($inp,"16($inp)");
+ &xor ("%rdx",$Zlo);
+ &mov ("($Xi)",$Zhi);
+ &mov ("8($Xi)","%rdx");
+ &shr ("%rdx",32);
+
+ &xor ($nlo,$nlo);
+ &rol ($dat,8);
+ &mov (&LB($nlo),&LB($dat));
+ &movz ($nhi[0],&LB($dat));
+ &shl (&LB($nlo),4);
+ &shr ($nhi[0],4);
+
+ for ($j=11,$i=0;$i<15;$i++) {
+ &rol ($dat,8);
+ &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
+ &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
+ &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
+ &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
+
+ &mov (&LB($nlo),&LB($dat));
+ &xor ($Zlo,$tmp) if ($i>0);
+ &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
+
+ &movz ($nhi[1],&LB($dat));
+ &shl (&LB($nlo),4);
+ &movzb ($rem[0],"(%rsp,$nhi[0])");
+
+ &shr ($nhi[1],4) if ($i<14);
+ &and ($nhi[1],0xf0) if ($i==14);
+ &shl ($rem[1],48) if ($i>0);
+ &xor ($rem[0],$Zlo);
+
+ &mov ($tmp,$Zhi);
+ &xor ($Zhi,$rem[1]) if ($i>0);
+ &shr ($Zlo,8);
+
+ &movz ($rem[0],&LB($rem[0]));
+ &mov ($dat,"$j($Xi)") if (--$j%4==0);
+ &shr ($Zhi,8);
+
+ &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
+ &shl ($tmp,56);
+ &xor ($Zhi,"($Hshr4,$nhi[0],8)");
+
+ unshift (@nhi,pop(@nhi)); # "rotate" registers
+ unshift (@rem,pop(@rem));
+ }
+ &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
+ &xor ($Zlo,"8($Htbl,$nlo)");
+ &xor ($Zhi,"($Htbl,$nlo)");
+
+ &shl ($rem[1],48);
+ &xor ($Zlo,$tmp);
+
+ &xor ($Zhi,$rem[1]);
+ &movz ($rem[0],&LB($Zlo));
+ &shr ($Zlo,4);
+
+ &mov ($tmp,$Zhi);
+ &shl (&LB($rem[0]),4);
+ &shr ($Zhi,4);
+
+ &xor ($Zlo,"8($Htbl,$nhi[0])");
+ &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
+ &shl ($tmp,60);
+
+ &xor ($Zhi,"($Htbl,$nhi[0])");
+ &xor ($Zlo,$tmp);
+ &shl ($rem[0],48);
+
+ &bswap ($Zlo);
+ &xor ($Zhi,$rem[0]);
+
+ &bswap ($Zhi);
+ &cmp ($inp,$len);
+ &jb (".Louter_loop");
+}
+$code.=<<___;
+ mov $Zlo,8($Xi)
+ mov $Zhi,($Xi)
+
+ lea 280(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lghash_epilogue:
+ ret
+.size gcm_ghash_4bit,.-gcm_ghash_4bit
+___
+
+######################################################################
+# PCLMULQDQ version.
+
+@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
+($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
+
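+# clmul64x64_T2 emits a 64x64-bit carry-less multiplication of $Xi by
+# $Hkey using three PCLMULQDQ instructions via Karatsuba:
+#
+#	X*H = Xh*Hh*x^128 ^ ((Xh^Xl)*(Hh^Hl) ^ Xh*Hh ^ Xl*Hl)*x^64 ^ Xl*Hl
+#
+# over GF(2), where addition is XOR. An optional pre-computed $HK
+# holding Hh^Hl saves the per-call pre-processing of $Hkey.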
+sub clmul64x64_T2 { # minimal register pressure
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pshufd \$0b01001110,$Hkey,$T2
+ pxor $Xi,$T1 #
+ pxor $Hkey,$T2
+___
+} else {
+$code.=<<___;
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1 #
+___
+}
+$code.=<<___;
+ pclmulqdq \$0x00,$Hkey,$Xi #######
+ pclmulqdq \$0x11,$Hkey,$Xhi #######
+ pclmulqdq \$0x00,$HK,$T1 #######
+ pxor $Xi,$T1 #
+ pxor $Xhi,$T1 #
+
+ movdqa $T1,$T2 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
+___
+}
+
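+# reduction_alg9 emits reduction of the 256-bit product $Xhi:$Xi
+# modulo the GHASH polynomial x^128+x^7+x^2+x+1. Since the operands
+# are bit-reflected, the polynomial acts through its reflected image
+# (the 0xc2...01 constant used elsewhere in this file), and the
+# reduction is carried out as two phases of shifts and XORs.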
+sub reduction_alg9 { # 17/11 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+ # 1st phase
+ movdqa $Xi,$T2 #
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pxor $Xi,$T1 #
+ psllq \$1,$Xi
+ pxor $T1,$Xi #
+ psllq \$57,$Xi #
+ movdqa $Xi,$T1 #
+ pslldq \$8,$Xi
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
+
+ # 2nd phase
+ movdqa $Xi,$T2
+ psrlq \$1,$Xi
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
+ psrlq \$5,$Xi
+ pxor $T2,$Xi #
+ psrlq \$1,$Xi #
+ pxor $Xhi,$Xi #
+___
+}
+
+{ my ($Htbl,$Xip)=@_4args;
+ my $HK="%xmm6";
+
+$code.=<<___;
+.globl gcm_init_clmul
+.type gcm_init_clmul,\@abi-omnipotent
+.align 16
+gcm_init_clmul:
+.L_init_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_clmul:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
+ movdqu ($Xip),$Hkey
+ pshufd \$0b01001110,$Hkey,$Hkey # dword swap
+
+ # <<1 twist
+ pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
+ movdqa $Hkey,$T1
+ psllq \$1,$Hkey
+ pxor $T3,$T3 #
+ psrlq \$63,$T1
+ pcmpgtd $T2,$T3 # broadcast carry bit
+ pslldq \$8,$T1
+ por $T1,$Hkey # H<<=1
+
+ # magic reduction
+ pand .L0x1c2_polynomial(%rip),$T3
+ pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
+
+ # calculate H^2
+ pshufd \$0b01001110,$Hkey,$HK
+ movdqa $Hkey,$Xi
+ pxor $Hkey,$HK
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ pshufd \$0b01001110,$Hkey,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $Hkey,$T1 # Karatsuba pre-processing
+ movdqu $Hkey,0x00($Htbl) # save H
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x10($Htbl) # save H^2
+ palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
+ movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
+___
+if ($do4xaggr) {
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ movdqa $Xi,$T3
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ pshufd \$0b01001110,$T3,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $T3,$T1 # Karatsuba pre-processing
+ movdqu $T3,0x30($Htbl) # save H^3
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x40($Htbl) # save H^4
+ palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
+ movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
+___
+}
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
+___
+$code.=<<___;
+ ret
+.size gcm_init_clmul,.-gcm_init_clmul
+___
+}
+
+{ my ($Xip,$Htbl)=@_4args;
+
+$code.=<<___;
+.globl gcm_gmult_clmul
+.type gcm_gmult_clmul,\@abi-omnipotent
+.align 16
+gcm_gmult_clmul:
+.L_gmult_clmul:
+ movdqu ($Xip),$Xi
+ movdqa .Lbswap_mask(%rip),$T3
+ movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$T2
+ pshufb $T3,$Xi
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
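+# NB: the "0 || (...&&0)" guard below is always false, so the
+# experimental heredoc body is never emitted; evaluating the guard
+# still calls &reduction_alg9, which appends the live reduction code.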
+$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
+	# experimental alternative. The special thing about it is that
+	# there is no dependency between the two multiplications...
+ mov \$`0xE1<<1`,%eax
+ mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
+ mov \$0x07,%r11d
+ movq %rax,$T1
+ movq %r10,$T2
+ movq %r11,$T3 # borrow $T3
+ pand $Xi,$T3
+ pshufb $T3,$T2 # ($Xi&7)·0xE0
+ movq %rax,$T3
+ pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
+ pxor $Xi,$T2
+ pslldq \$15,$T2
+ paddd $T2,$T2 # <<(64+56+1)
+ pxor $T2,$Xi
+ pclmulqdq \$0x01,$T3,$Xi
+ movdqa .Lbswap_mask(%rip),$T3 # reload $T3
+ psrldq \$1,$T1
+ pxor $T1,$Xhi
+ pslldq \$7,$Xi
+ pxor $Xhi,$Xi
+___
+$code.=<<___;
+ pshufb $T3,$Xi
+ movdqu $Xi,($Xip)
+ ret
+.size gcm_gmult_clmul,.-gcm_gmult_clmul
+___
+}
+
+{ my ($Xip,$Htbl,$inp,$len)=@_4args;
+ my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+ my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
+
+$code.=<<___;
+.globl gcm_ghash_clmul
+.type gcm_ghash_clmul,\@abi-omnipotent
+.align 32
+gcm_ghash_clmul:
+.L_ghash_clmul:
+___
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_clmul:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+ movdqa .Lbswap_mask(%rip),$T3
+
+ movdqu ($Xip),$Xi
+ movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$HK
+ pshufb $T3,$Xi
+
+ sub \$0x10,$len
+ jz .Lodd_tail
+
+ movdqu 0x10($Htbl),$Hkey2
+___
+if ($do4xaggr) {
+my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
+
+$code.=<<___;
+ mov OPENSSL_ia32cap_P+4(%rip),%eax
+ cmp \$0x30,$len
+ jb .Lskip4x
+
+ and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
+ cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
+ je .Lskip4x
+
+ sub \$0x30,$len
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
+ movdqu 0x30($Htbl),$Hkey3
+ movdqu 0x40($Htbl),$Hkey4
+
+ #######
+ # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
+ #
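+	# i.e. four blocks are multiplied by H^1..H^4 (the newest by H,
+	# the oldest together with the running hash Xi by H^4), summed,
+	# and reduced modulo P only once -- "4x aggregated reduction"
+	# amortizes the reduction cost over four blocks.
+	#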
+ movdqu 0x30($inp),$Xln
+ movdqu 0x20($inp),$Xl
+ pshufb $T3,$Xln
+ pshufb $T3,$Xl
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$Xmn
+
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey2,$Xl
+ pclmulqdq \$0x11,$Hkey2,$Xh
+ pclmulqdq \$0x10,$HK,$Xm
+ xorps $Xl,$Xln
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ xorps $Xm,$Xmn
+
+ movdqu 0x10($inp),$Xl
+ movdqu 0($inp),$T1
+ pshufb $T3,$Xl
+ pshufb $T3,$T1
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T1,$Xi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xl,$Xln
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+.align 32
+.Lmod4_loop:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ xorps $Xm,$Xmn
+ movdqu 0x30($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ xorps $Xln,$Xi
+ movdqu 0x20($inp),$Xln
+ movdqa $Xl,$Xh
+ pclmulqdq \$0x10,$HK,$T1
+ pshufd \$0b01001110,$Xl,$Xm
+ xorps $Xhn,$Xhi
+ pxor $Xl,$Xm
+ pshufb $T3,$Xln
+ movups 0x20($Htbl),$HK
+ xorps $Xmn,$T1
+ pclmulqdq \$0x00,$Hkey,$Xl
+ pshufd \$0b01001110,$Xln,$Xmn
+
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ movdqa $Xln,$Xhn
+ pxor $Xhi,$T1 #
+ pxor $Xln,$Xmn
+ movdqa $T1,$T2 #
+ pclmulqdq \$0x11,$Hkey,$Xh
+ pslldq \$8,$T1
+ psrldq \$8,$T2 #
+ pxor $T1,$Xi
+ movdqa .L7_mask(%rip),$T1
+ pxor $T2,$Xhi #
+ movq %rax,$T2
+
+ pand $Xi,$T1 # 1st phase
+ pshufb $T1,$T2 #
+ pxor $Xi,$T2 #
+ pclmulqdq \$0x00,$HK,$Xm
+ psllq \$57,$T2 #
+ movdqa $T2,$T1 #
+ pslldq \$8,$T2
+ pclmulqdq \$0x00,$Hkey2,$Xln
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
+ movdqu 0($inp),$T1
+
+ movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhn
+ xorps $Xl,$Xln
+ movdqu 0x10($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x10,$HK,$Xmn
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ pshufb $T3,$T1
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
+ psrlq \$5,$Xi
+
+ movdqa $Xl,$Xh
+ pxor $Xm,$Xmn
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T2,$Xi #
+ pxor $T1,$Xhi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ psrlq \$1,$Xi #
+ pxor $Xhi,$Xi #
+ movdqa $Xi,$Xhi
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ xorps $Xl,$Xln
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jnc .Lmod4_loop
+
+.Ltail4x:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ pclmulqdq \$0x10,$HK,$T1
+ xorps $Xm,$Xmn
+ xorps $Xln,$Xi
+ xorps $Xhn,$Xhi
+ pxor $Xi,$Xhi # aggregated Karatsuba post-processing
+ pxor $Xmn,$T1
+
+ pxor $Xhi,$T1 #
+ pxor $Xi,$Xhi
+
+ movdqa $T1,$T2 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
+___
+ &reduction_alg9($Xhi,$Xi);
+$code.=<<___;
+ add \$0x40,$len
+ jz .Ldone
+ movdqu 0x20($Htbl),$HK
+ sub \$0x10,$len
+ jz .Lodd_tail
+.Lskip4x:
+___
+}
+$code.=<<___;
+ #######
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+ #
+ movdqu ($inp),$T1 # Ii
+ movdqu 16($inp),$Xln # Ii+1
+ pshufb $T3,$T1
+ pshufb $T3,$Xln
+ pxor $T1,$Xi # Ii+Xi
+
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$Xmn
+
+ lea 32($inp),$inp # i+=2
+ nop
+ sub \$0x20,$len
+ jbe .Leven_tail
+ nop
+ jmp .Lmod_loop
+
+.align 32
+.Lmod_loop:
+ movdqa $Xi,$Xhi
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
+
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$Xmn
+
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ pxor $Xhn,$Xhi
+ movdqu ($inp),$T2 # Ii
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ pshufb $T3,$T2
+ movdqu 16($inp),$Xln # Ii+1
+
+ pxor $Xhi,$T1
+ pxor $T2,$Xhi # "Ii+Xi", consume early
+ pxor $T1,$Xmn
+ pshufb $T3,$Xln
+ movdqa $Xmn,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$Xmn #
+ pxor $T1,$Xhi
+ pxor $Xmn,$Xi #
+
+ movdqa $Xln,$Xhn #
+
+ movdqa $Xi,$T2 # 1st phase
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pxor $Xi,$T1 #
+ pclmulqdq \$0x00,$Hkey,$Xln #######
+ psllq \$1,$Xi
+ pxor $T1,$Xi #
+ psllq \$57,$Xi #
+ movdqa $Xi,$T1 #
+ pslldq \$8,$Xi
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pshufd \$0b01001110,$Xhn,$Xmn
+ pxor $T1,$Xhi #
+ pxor $Xhn,$Xmn #
+
+ movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey,$Xhn #######
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
+ psrlq \$5,$Xi
+ pxor $T2,$Xi #
+ lea 32($inp),$inp
+ psrlq \$1,$Xi #
+ pclmulqdq \$0x00,$HK,$Xmn #######
+ pxor $Xhi,$Xi #
+
+ sub \$0x20,$len
+ ja .Lmod_loop
+
+.Leven_tail:
+ movdqa $Xi,$Xhi
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
+
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$Xmn
+
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ pxor $Xhn,$Xhi
+ pxor $Xi,$T1
+ pxor $Xhi,$T1
+ pxor $T1,$Xmn
+ movdqa $Xmn,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$Xmn #
+ pxor $T1,$Xhi
+ pxor $Xmn,$Xi #
+___
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ test $len,$len
+ jnz .Ldone
+
+.Lodd_tail:
+ movdqu ($inp),$T1 # Ii
+ pshufb $T3,$T1
+ pxor $T1,$Xi # Ii+Xi
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+.Ldone:
+ pshufb $T3,$Xi
+ movdqu $Xi,($Xip)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_clmul:
+___
+$code.=<<___;
+ ret
+.size gcm_ghash_clmul,.-gcm_ghash_clmul
+___
+}
+
+$code.=<<___;
+.globl gcm_init_avx
+.type gcm_init_avx,\@abi-omnipotent
+.align 32
+gcm_init_avx:
+___
+if ($avx) {
+my ($Htbl,$Xip)=@_4args;
+my $HK="%xmm6";
+
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Hkey
+ vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
+
+ # <<1 twist
+ vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
+ vpsrlq \$63,$Hkey,$T1
+ vpsllq \$1,$Hkey,$Hkey
+ vpxor $T3,$T3,$T3 #
+ vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
+ vpslldq \$8,$T1,$T1
+ vpor $T1,$Hkey,$Hkey # H<<=1
+
+ # magic reduction
+ vpand .L0x1c2_polynomial(%rip),$T3,$T3
+ vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
+
+ vpunpckhqdq $Hkey,$Hkey,$HK
+ vmovdqa $Hkey,$Xi
+ vpxor $Hkey,$HK,$HK
+ mov \$4,%r10 # up to H^8
+ jmp .Linit_start_avx
+___
+
+sub clmul64x64_avx {
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpunpckhqdq $Hkey,$Hkey,$T2
+ vpxor $Xi,$T1,$T1 #
+ vpxor $Hkey,$T2,$T2
+___
+} else {
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpxor $Xi,$T1,$T1 #
+___
+}
+$code.=<<___;
+ vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
+ vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
+ vpclmulqdq \$0x00,$HK,$T1,$T1 #######
+ vpxor $Xi,$Xhi,$T2 #
+ vpxor $T2,$T1,$T1 #
+
+ vpslldq \$8,$T1,$T2 #
+ vpsrldq \$8,$T1,$T1
+ vpxor $T2,$Xi,$Xi #
+ vpxor $T1,$Xhi,$Xhi
+___
+}
+
+sub reduction_avx {
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+ vpsllq \$57,$Xi,$T1 # 1st phase
+ vpsllq \$62,$Xi,$T2
+ vpxor $T1,$T2,$T2 #
+ vpsllq \$63,$Xi,$T1
+ vpxor $T1,$T2,$T2 #
+ vpslldq \$8,$T2,$T1 #
+ vpsrldq \$8,$T2,$T2
+ vpxor $T1,$Xi,$Xi #
+ vpxor $T2,$Xhi,$Xhi
+
+ vpsrlq \$1,$Xi,$T2 # 2nd phase
+ vpxor $Xi,$Xhi,$Xhi
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$5,$T2,$T2
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$1,$Xi,$Xi #
+ vpxor $Xhi,$Xi,$Xi #
+___
+}
+
+$code.=<<___;
+.align 32
+.Linit_loop_avx:
+ vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
+ vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+.Linit_start_avx:
+ vmovdqa $Xi,$T3
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+ vpshufd \$0b01001110,$T3,$T1
+ vpshufd \$0b01001110,$Xi,$T2
+ vpxor $T3,$T1,$T1 # Karatsuba pre-processing
+ vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
+ vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
+ vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
+ lea 0x30($Htbl),$Htbl
+ sub \$1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
+ vmovdqu $T3,-0x10($Htbl)
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_init_avx,.-gcm_init_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_init_clmul
+.size gcm_init_avx,.-gcm_init_avx
+___
+}
+
+$code.=<<___;
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,\@abi-omnipotent
+.align 32
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+.size gcm_gmult_avx,.-gcm_gmult_avx
+___
+
+$code.=<<___;
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,\@abi-omnipotent
+.align 32
+gcm_ghash_avx:
+___
+if ($avx) {
+my ($Xip,$Htbl,$inp,$len)=@_4args;
+my ($Xlo,$Xhi,$Xmi,
+ $Zlo,$Zhi,$Zmi,
+ $Hkey,$HK,$T1,$T2,
+ $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
+
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Xi # load $Xi
+ lea .L0x1c2_polynomial(%rip),%r10
+ lea 0x40($Htbl),$Htbl # size optimization
+ vmovdqu .Lbswap_mask(%rip),$bswap
+ vpshufb $bswap,$Xi,$Xi
+ cmp \$0x80,$len
+ jb .Lshort_avx
+ sub \$0x80,$len
+
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpshufb $bswap,$Ii,$Ii
+ vmovdqu 0x20-0x40($Htbl),$HK
+
+ vpunpckhqdq $Ii,$Ii,$T2
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Ii,$T2,$T2
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpxor $Ii,$T2,$T2
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpxor $Xmi,$Zmi,$Zmi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x10,$HK,$T2,$Xmi
+
+ lea 0x80($inp),$inp
+ cmp \$0x80,$len
+ jb .Ltail_avx
+
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+ sub \$0x80,$len
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpxor $Ij,$T1,$T1
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Tred
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Zlo,$Xi,$Xi # collect result
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vxorps $Zhi,$Xo,$Xo
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Zmi,$Tred,$Tred
+ vxorps $Ij,$T1,$T1
+
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Xo,$Tred,$Tred
+ vpslldq \$8,$Tred,$T2
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vpsrldq \$8,$Tred,$Tred
+ vpxor $T2, $Xi, $Xi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ii
+ vxorps $Tred,$Xo, $Xo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vxorps $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+ vxorps $Tred,$Xi,$Xi
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vxorps $Xo,$Tred,$Tred
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Tred,$Ij,$Ij
+ vpclmulqdq \$0x10,$HK, $T2,$Xmi
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jnc .Loop8x_avx
+
+ add \$0x80,$len
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -0x10($inp,$len),$Ii # very last word
+ lea ($inp,$len),$inp
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpshufb $bswap,$Ii,$Ij
+
+ vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
+ vmovdqa $Xhi,$Zhi # $Zhi and
+ vmovdqa $Xmi,$Zmi # $Zmi
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x20($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x30($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x40($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x50($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x60($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x70($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovq 0xb8-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+.Ltail_no_xor_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+
+ vmovdqu (%r10),$Tred
+
+ vpxor $Xlo,$Zlo,$Xi
+ vpxor $Xhi,$Zhi,$Xo
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
+ vpxor $Xo, $Zmi,$Zmi
+ vpslldq \$8, $Zmi,$T2
+ vpsrldq \$8, $Zmi,$Zmi
+ vpxor $T2, $Xi, $Xi
+ vpxor $Zmi,$Xo, $Xo
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $Xo,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ cmp \$0,$len
+ jne .Lshort_avx
+
+ vpshufb $bswap,$Xi,$Xi
+ vmovdqu $Xi,($Xip)
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_ghash_clmul
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+}
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+ .long 7,0,7,0
+.L7_mask_poly:
+ .long 7,0,`0xE1<<1`,0
+.align 64
+.type .Lrem_4bit,\@object
+.Lrem_4bit:
+ .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
+ .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
+ .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
+ .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
+.type .Lrem_8bit,\@object
+.Lrem_8bit:
+ .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+ .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+ .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+ .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+ .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+ .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+ .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+ .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+ .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+ .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+ .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+ .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+ .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+ .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+ .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+ .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+ .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+ .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+ .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+ .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+ .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+ .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+ .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+ .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+ .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+ .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+ .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+ .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+ .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+ .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+ .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+ .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ lea 24(%rax),%rax # adjust "rsp"
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_gcm_gmult_4bit
+ .rva .LSEH_end_gcm_gmult_4bit
+ .rva .LSEH_info_gcm_gmult_4bit
+
+ .rva .LSEH_begin_gcm_ghash_4bit
+ .rva .LSEH_end_gcm_ghash_4bit
+ .rva .LSEH_info_gcm_ghash_4bit
+
+ .rva .LSEH_begin_gcm_init_clmul
+ .rva .LSEH_end_gcm_init_clmul
+ .rva .LSEH_info_gcm_init_clmul
+
+ .rva .LSEH_begin_gcm_ghash_clmul
+ .rva .LSEH_end_gcm_ghash_clmul
+ .rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_gcm_init_avx
+ .rva .LSEH_end_gcm_init_avx
+ .rva .LSEH_info_gcm_init_clmul
+
+ .rva .LSEH_begin_gcm_ghash_avx
+ .rva .LSEH_end_gcm_ghash_avx
+ .rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_gcm_gmult_4bit:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
+.LSEH_info_gcm_ghash_4bit:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
+.LSEH_info_gcm_init_clmul:
+ .byte 0x01,0x08,0x03,0x00
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
+.LSEH_info_gcm_ghash_clmul:
+ .byte 0x01,0x33,0x16,0x00
+ .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
+ .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
+ .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
+ .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
+ .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
+ .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
+ .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
+ .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
+ .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl b/openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl
new file mode 100755
index 0000000..f0598cb
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl
@@ -0,0 +1,670 @@
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for PowerISA v2.07.
+#
+# July 2014
+#
+# Accurate performance measurements are problematic, because it's
+# always a virtualized setup with a possibly throttled processor.
+# Relative comparison is therefore more informative. This initial
+# version is ~2.1x slower than hardware-assisted AES-128-CTR and ~12x
+# faster than "4-bit" integer-only compiler-generated 64-bit code.
+# "Initial version" means that there is room for further improvement.
+
+# May 2016
+#
+# 2x aggregated reduction improves performance by 50% (resulting
+# performance on POWER8 is 1 cycle per processed byte), and 4x
+# aggregated reduction by 170% or 2.7x (resulting in 0.55 cpb).
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T=8;
+ $LRSAVE=2*$SIZE_T;
+ $STU="stdu";
+ $POP="ld";
+ $PUSH="std";
+ $UCMP="cmpld";
+ $SHRI="srdi";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T=4;
+ $LRSAVE=$SIZE_T;
+ $STU="stwu";
+ $POP="lwz";
+ $PUSH="stw";
+ $UCMP="cmplw";
+ $SHRI="srwi";
+} else { die "nonsense $flavour"; }
+
+$sp="r1";
+$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
+
+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
+
+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
+my $vrsave="r12";
+
+$code=<<___;
+.machine "any"
+
+.text
+
+.globl .gcm_init_p8
+.align 5
+.gcm_init_p8:
+ li r0,-4096
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $H,0,r4 # load H
+
+ vspltisb $xC2,-16 # 0xf0
+ vspltisb $t0,1 # one
+ vaddubm $xC2,$xC2,$xC2 # 0xe0
+ vxor $zero,$zero,$zero
+ vor $xC2,$xC2,$t0 # 0xe1
+ vsldoi $xC2,$xC2,$zero,15 # 0xe1...
+ vsldoi $t1,$zero,$t0,1 # ...1
+ vaddubm $xC2,$xC2,$xC2 # 0xc2...
+ vspltisb $t2,7
+ vor $xC2,$xC2,$t1 # 0xc2....01
+ vspltb $t1,$H,0 # most significant byte
+ vsl $H,$H,$t0 # H<<=1
+ vsrab $t1,$t1,$t2 # broadcast carry bit
+ vand $t1,$t1,$xC2
+ vxor $IN,$H,$t1 # twisted H
+
+ vsldoi $H,$IN,$IN,8 # twist even more ...
+ vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
+ vsldoi $Hl,$zero,$H,8 # ... and split
+ vsldoi $Hh,$H,$zero,8
+
+ stvx_u $xC2,0,r3 # save pre-computed table
+ stvx_u $Hl,r8,r3
+ li r8,0x40
+ stvx_u $H, r9,r3
+ li r9,0x50
+ stvx_u $Hh,r10,r3
+ li r10,0x60
+
+ vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
+ vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
+ vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $IN1,$Xl,$t1
+
+ vsldoi $H2,$IN1,$IN1,8
+ vsldoi $H2l,$zero,$H2,8
+ vsldoi $H2h,$H2,$zero,8
+
+ stvx_u $H2l,r8,r3 # save H^2
+ li r8,0x70
+ stvx_u $H2,r9,r3
+ li r9,0x80
+ stvx_u $H2h,r10,r3
+ li r10,0x90
+___
+{
+my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
+$code.=<<___;
+ vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
+ vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
+ vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
+ vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
+ vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
+ vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+ vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vsldoi $t4,$Xm1,$zero,8
+ vsldoi $t5,$zero,$Xm1,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+ vxor $Xl1,$Xl1,$t4
+ vxor $Xh1,$Xh1,$t5
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vsldoi $Xl1,$Xl1,$Xl1,8
+ vxor $Xl,$Xl,$t2
+ vxor $Xl1,$Xl1,$t6
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ vpmsumd $Xl1,$Xl1,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $t5,$t5,$Xh1
+ vxor $Xl,$Xl,$t1
+ vxor $Xl1,$Xl1,$t5
+
+ vsldoi $H,$Xl,$Xl,8
+ vsldoi $H2,$Xl1,$Xl1,8
+ vsldoi $Hl,$zero,$H,8
+ vsldoi $Hh,$H,$zero,8
+ vsldoi $H2l,$zero,$H2,8
+ vsldoi $H2h,$H2,$zero,8
+
+ stvx_u $Hl,r8,r3 # save H^3
+ li r8,0xa0
+ stvx_u $H,r9,r3
+ li r9,0xb0
+ stvx_u $Hh,r10,r3
+ li r10,0xc0
+ stvx_u $H2l,r8,r3 # save H^4
+ stvx_u $H2,r9,r3
+ stvx_u $H2h,r10,r3
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_init_p8,.-.gcm_init_p8
+___
+}
+$code.=<<___;
+.globl .gcm_gmult_p8
+.align 5
+.gcm_gmult_p8:
+ lis r0,0xfff8
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $IN,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $zero,$zero,$zero
+
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $Xl,$Xl,$t1
+
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .gcm_gmult_p8,.-.gcm_gmult_p8
+
+.globl .gcm_ghash_p8
+.align 5
+.gcm_ghash_p8:
+ li r0,-4096
+ li r8,0x10
+ mfspr $vrsave,256
+ li r9,0x20
+ mtspr 256,r0
+ li r10,0x30
+ lvx_u $Xl,0,$Xip # load Xi
+
+ lvx_u $Hl,r8,$Htbl # load pre-computed table
+ li r8,0x40
+ le?lvsl $lemask,r0,r0
+ lvx_u $H, r9,$Htbl
+ li r9,0x50
+ le?vspltisb $t0,0x07
+ lvx_u $Hh,r10,$Htbl
+ li r10,0x60
+ le?vxor $lemask,$lemask,$t0
+ lvx_u $xC2,0,$Htbl
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ vxor $zero,$zero,$zero
+
+ ${UCMP}i $len,64
+ bge Lgcm_ghash_p8_4x
+
+ lvx_u $IN,0,$inp
+ addi $inp,$inp,16
+ subic. $len,$len,16
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $IN,$IN,$Xl
+ beq Lshort
+
+ lvx_u $H2l,r8,$Htbl # load H^2
+ li r8,16
+ lvx_u $H2, r9,$Htbl
+ add r9,$inp,$len # end of input
+ lvx_u $H2h,r10,$Htbl
+ be?b Loop_2x
+
+.align 5
+Loop_2x:
+ lvx_u $IN1,0,$inp
+ le?vperm $IN1,$IN1,$IN1,$lemask
+
+ subic $len,$len,32
+ vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
+ vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
+ subfe r0,r0,r0 # borrow?-1:0
+ vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
+ vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
+ and r0,r0,$len
+ vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
+ vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
+ add $inp,$inp,r0
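+	# if subic borrowed above, r0 equals the now-negative $len and
+	# steps $inp back, so subsequent loads stay within inp[len]; the
+	# tail block is then loaded twice but processed only once, as in
+	# the ARMv8 module's $inc trick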
+
+ vxor $Xl,$Xl,$Xl1
+ vxor $Xm,$Xm,$Xm1
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xh,$Xh,$Xh1
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+ lvx_u $IN,r8,$inp
+ addi $inp,$inp,32
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ le?vperm $IN,$IN,$IN,$lemask
+ vxor $t1,$t1,$Xh
+ vxor $IN,$IN,$t1
+ vxor $IN,$IN,$Xl
+ $UCMP r9,$inp
+ bgt Loop_2x # done yet?
+
+ cmplwi $len,0
+ bne Leven
+
+Lshort:
+ vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
+ vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
+ vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+
+Leven:
+ vxor $Xl,$Xl,$t1
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
+___
+{
+my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
+ $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
+my $IN0=$IN;
+my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
+
+$code.=<<___;
+.align 5
+.gcm_ghash_p8_4x:
+Lgcm_ghash_p8_4x:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ li r10,0x60
+ stvx v31,r11,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME-4`($sp) # save vrsave
+ mtspr 256,r0 # preserve all AltiVec registers
+
+ lvsl $t0,0,r8 # 0x0001..0e0f
+ #lvx_u $H2l,r8,$Htbl # load H^2
+ li r8,0x70
+ lvx_u $H2, r9,$Htbl
+ li r9,0x80
+ vspltisb $t1,8 # 0x0808..0808
+ #lvx_u $H2h,r10,$Htbl
+ li r10,0x90
+ lvx_u $H3l,r8,$Htbl # load H^3
+ li r8,0xa0
+ lvx_u $H3, r9,$Htbl
+ li r9,0xb0
+ lvx_u $H3h,r10,$Htbl
+ li r10,0xc0
+ lvx_u $H4l,r8,$Htbl # load H^4
+ li r8,0x10
+ lvx_u $H4, r9,$Htbl
+ li r9,0x20
+ lvx_u $H4h,r10,$Htbl
+ li r10,0x30
+
+ vsldoi $t2,$zero,$t1,8 # 0x0000..0808
+ vaddubm $hiperm,$t0,$t2 # 0x0001..1617
+ vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f
+
+	$SHRI	$len,$len,4		# this allows the sign bit to
+					# be used as carry
+ lvx_u $IN0,0,$inp # load input
+ lvx_u $IN1,r8,$inp
+ subic. $len,$len,8
+ lvx_u $IN2,r9,$inp
+ lvx_u $IN3,r10,$inp
+ addi $inp,$inp,0x40
+ le?vperm $IN0,$IN0,$IN0,$lemask
+ le?vperm $IN1,$IN1,$IN1,$lemask
+ le?vperm $IN2,$IN2,$IN2,$lemask
+ le?vperm $IN3,$IN3,$IN3,$lemask
+
+ vxor $Xh,$IN0,$Xl
+
+ vpmsumd $Xl1,$IN1,$H3l
+ vpmsumd $Xm1,$IN1,$H3
+ vpmsumd $Xh1,$IN1,$H3h
+
+ vperm $H21l,$H2,$H,$hiperm
+ vperm $t0,$IN2,$IN3,$loperm
+ vperm $H21h,$H2,$H,$loperm
+ vperm $t1,$IN2,$IN3,$hiperm
+ vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
+ vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
+ vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
+ vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
+
+ vxor $Xm2,$Xm2,$Xm1
+ vxor $Xl3,$Xl3,$Xl1
+ vxor $Xm3,$Xm3,$Xm2
+ vxor $Xh3,$Xh3,$Xh1
+
+ blt Ltail_4x
+
+Loop_4x:
+ lvx_u $IN0,0,$inp
+ lvx_u $IN1,r8,$inp
+ subic. $len,$len,4
+ lvx_u $IN2,r9,$inp
+ lvx_u $IN3,r10,$inp
+ addi $inp,$inp,0x40
+ le?vperm $IN1,$IN1,$IN1,$lemask
+ le?vperm $IN2,$IN2,$IN2,$lemask
+ le?vperm $IN3,$IN3,$IN3,$lemask
+ le?vperm $IN0,$IN0,$IN0,$lemask
+
+ vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
+ vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
+ vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
+ vpmsumd $Xl1,$IN1,$H3l
+ vpmsumd $Xm1,$IN1,$H3
+ vpmsumd $Xh1,$IN1,$H3h
+
+ vxor $Xl,$Xl,$Xl3
+ vxor $Xm,$Xm,$Xm3
+ vxor $Xh,$Xh,$Xh3
+ vperm $t0,$IN2,$IN3,$loperm
+ vperm $t1,$IN2,$IN3,$hiperm
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+ vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
+ vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
+ vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
+ vpmsumd $Xl,$Xl,$xC2
+
+ vxor $Xl3,$Xl3,$Xl1
+ vxor $Xh3,$Xh3,$Xh1
+ vxor $Xh,$Xh,$IN0
+ vxor $Xm2,$Xm2,$Xm1
+ vxor $Xh,$Xh,$t1
+ vxor $Xm3,$Xm3,$Xm2
+ vxor $Xh,$Xh,$Xl
+ bge Loop_4x
+
+Ltail_4x:
+ vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
+ vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
+ vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
+
+ vxor $Xl,$Xl,$Xl3
+ vxor $Xm,$Xm,$Xm3
+
+ vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
+
+ vsldoi $t0,$Xm,$zero,8
+ vsldoi $t1,$zero,$Xm,8
+ vxor $Xh,$Xh,$Xh3
+ vxor $Xl,$Xl,$t0
+ vxor $Xh,$Xh,$t1
+
+ vsldoi $Xl,$Xl,$Xl,8
+ vxor $Xl,$Xl,$t2
+
+ vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
+ vpmsumd $Xl,$Xl,$xC2
+ vxor $t1,$t1,$Xh
+ vxor $Xl,$Xl,$t1
+
+ addic. $len,$len,4
+ beq Ldone_4x
+
+ lvx_u $IN0,0,$inp
+ ${UCMP}i $len,2
+ li $len,-4
+ blt Lone
+ lvx_u $IN1,r8,$inp
+ beq Ltwo
+
+Lthree:
+ lvx_u $IN2,r9,$inp
+ le?vperm $IN0,$IN0,$IN0,$lemask
+ le?vperm $IN1,$IN1,$IN1,$lemask
+ le?vperm $IN2,$IN2,$IN2,$lemask
+
+ vxor $Xh,$IN0,$Xl
+ vmr $H4l,$H3l
+ vmr $H4, $H3
+ vmr $H4h,$H3h
+
+ vperm $t0,$IN1,$IN2,$loperm
+ vperm $t1,$IN1,$IN2,$hiperm
+ vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
+ vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
+ vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
+ vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
+
+ vxor $Xm3,$Xm3,$Xm2
+ b Ltail_4x
+
+.align 4
+Ltwo:
+ le?vperm $IN0,$IN0,$IN0,$lemask
+ le?vperm $IN1,$IN1,$IN1,$lemask
+
+ vxor $Xh,$IN0,$Xl
+ vperm $t0,$zero,$IN1,$loperm
+ vperm $t1,$zero,$IN1,$hiperm
+
+ vsldoi $H4l,$zero,$H2,8
+ vmr $H4, $H2
+ vsldoi $H4h,$H2,$zero,8
+
+ vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
+ vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi
+ vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi
+
+ b Ltail_4x
+
+.align 4
+Lone:
+ le?vperm $IN0,$IN0,$IN0,$lemask
+
+ vsldoi $H4l,$zero,$H,8
+ vmr $H4, $H
+ vsldoi $H4h,$H,$zero,8
+
+ vxor $Xh,$IN0,$Xl
+ vxor $Xl3,$Xl3,$Xl3
+ vxor $Xm3,$Xm3,$Xm3
+ vxor $Xh3,$Xh3,$Xh3
+
+ b Ltail_4x
+
+Ldone_4x:
+ le?vperm $Xl,$Xl,$Xl,$lemask
+ stvx_u $Xl,0,$Xip # write out Xi
+
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,0,0x80,0,4,0
+ .long 0
+___
+}
+$code.=<<___;
+.size .gcm_ghash_p8,.-.gcm_ghash_p8
+
+.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ if ($flavour =~ /le$/o) { # little-endian
+ s/le\?//o or
+ s/be\?/#be#/o;
+ } else {
+ s/le\?/#le#/o or
+ s/be\?//o;
+ }
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/openssl-1.1.0h/crypto/modes/asm/ghashv8-armx.pl b/openssl-1.1.0h/crypto/modes/asm/ghashv8-armx.pl
new file mode 100644
index 0000000..dcd5f59
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/asm/ghashv8-armx.pl
@@ -0,0 +1,430 @@
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
+#
+# June 2014
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
+# other assembly modules. Just like aesv8-armx.pl this module
+# supports both AArch32 and AArch64 execution modes.
+#
+# July 2014
+#
+# Implement 2x aggregated reduction [see ghash-x86.pl for background
+# information].
+#
+# Current performance in cycles per processed byte:
+#
+# PMULL[2] 32-bit NEON(*)
+# Apple A7 0.92 5.62
+# Cortex-A53 1.01 8.39
+# Cortex-A57 1.17 7.61
+# Denver 0.71 6.02
+# Mongoose 1.10 8.06
+#
+# (*) presented for reference/comparison purposes;
+
+$flavour = shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$Xi="x0"; # argument block
+$Htbl="x1";
+$inp="x2";
+$len="x3";
+
+$inc="x12";
+
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+___
+$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=<<___ if ($flavour !~ /64/);
+.fpu neon
+.code 32
+#undef __thumb2__
+___
+
+################################################################################
+# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
+#
+# input: 128-bit H - secret parameter E(K,0^128)
+# output: precomputed table filled with degrees of twisted H;
+# H is twisted to handle reverse bitness of GHASH;
+#		only a few of the 16 slots of Htable[16] are used;
+#		the data is opaque to the outside world (which allows the
+#		code to be optimized independently);
+#
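+# A typical calling sequence from C, following the prototypes
+# documented in this file (sketch only), is:
+#
+#	gcm_init_v8(Htable,H);			// once per key
+#	gcm_ghash_v8(Xi,Htable,inp,len);	// bulk data, len % 16 == 0
+#	gcm_gmult_v8(Xi,Htable);		// single multiply of Xi by H
+#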
+$code.=<<___;
+.global gcm_init_v8
+.type gcm_init_v8,%function
+.align 4
+gcm_init_v8:
+ vld1.64 {$t1},[x1] @ load input H
+ vmov.i8 $xC2,#0xe1
+ vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
+ vext.8 $IN,$t1,$t1,#8
+ vshr.u64 $t2,$xC2,#63
+ vdup.32 $t1,${t1}[1]
+ vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
+ vshr.u64 $t2,$IN,#63
+ vshr.s32 $t1,$t1,#31 @ broadcast carry bit
+ vand $t2,$t2,$t0
+ vshl.i64 $IN,$IN,#1
+ vext.8 $t2,$t2,$t2,#8
+ vand $t0,$t0,$t1
+ vorr $IN,$IN,$t2 @ H<<<=1
+ veor $H,$IN,$t0 @ twisted H
+ vst1.64 {$H},[x0],#16 @ store Htable[0]
+
+ @ calculate H^2
+ vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
+ vpmull.p64 $Xl,$H,$H
+ veor $t0,$t0,$H
+ vpmull2.p64 $Xh,$H,$H
+ vpmull.p64 $Xm,$t0,$t0
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
+
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ veor $Xl,$Xm,$t2
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $t2,$t2,$Xh
+ veor $H2,$Xl,$t2
+
+ vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
+ veor $t1,$t1,$H2
+ vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
+ vst1.64 {$Hhl-$H2},[x0] @ store Htable[1..2]
+
+ ret
+.size gcm_init_v8,.-gcm_init_v8
+___
+################################################################################
+# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
+#
+# input: Xi - current hash value;
+# Htable - table precomputed in gcm_init_v8;
+# output: Xi - next hash value;
+#
+$code.=<<___;
+.global gcm_gmult_v8
+.type gcm_gmult_v8,%function
+.align 4
+gcm_gmult_v8:
+ vld1.64 {$t1},[$Xi] @ load Xi
+ vmov.i8 $xC2,#0xe1
+ vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
+ vshl.u64 $xC2,$xC2,#57
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ vext.8 $IN,$t1,$t1,#8
+
+ vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
+ veor $t1,$t1,$IN @ Karatsuba pre-processing
+ vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
+ vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
+
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ veor $Xl,$Xm,$t2
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $t2,$t2,$Xh
+ veor $Xl,$Xl,$t2
+
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+#endif
+ vext.8 $Xl,$Xl,$Xl,#8
+ vst1.64 {$Xl},[$Xi] @ write out Xi
+
+ ret
+.size gcm_gmult_v8,.-gcm_gmult_v8
+___
+################################################################################
+# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#
+# input: table precomputed in gcm_init_v8;
+# current hash value Xi;
+# pointer to input data;
+#	length of input data in bytes, which must be divisible by
+#	the block size;
+# output: next hash value Xi;
+#
+$code.=<<___;
+.global gcm_ghash_v8
+.type gcm_ghash_v8,%function
+.align 4
+gcm_ghash_v8:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vstmdb sp!,{d8-d15} @ 32-bit ABI says so
+___
+$code.=<<___;
+ vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
+						@ "[rotated]" means that the
+						@ loaded value would have to
+						@ be rotated in order to make
+						@ it appear as in the
+						@ algorithm specification
+ subs $len,$len,#32 @ see if $len is 32 or larger
+ mov $inc,#16 @ $inc is used as post-
+ @ increment for input pointer;
+ @ as loop is modulo-scheduled
+ @ $inc is zeroed just in time
+						@ to preclude overstepping
+						@ inp[len], which means that
+						@ the last block[s] are
+						@ actually loaded twice, but
+						@ the last copy is not processed
+ vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
+ vmov.i8 $xC2,#0xe1
+ vld1.64 {$H2},[$Htbl]
+ cclr $inc,eq @ is it time to zero $inc?
+ vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
+ vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
+ vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
+#ifndef __ARMEB__
+ vrev64.8 $t0,$t0
+ vrev64.8 $Xl,$Xl
+#endif
+ vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
+ b.lo .Lodd_tail_v8 @ $len was less than 32
+___
+{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
+ #######
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+ #
+$code.=<<___;
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ vext.8 $In,$t1,$t1,#8
+ veor $IN,$IN,$Xl @ I[i]^=Xi
+ vpmull.p64 $Xln,$H,$In @ H·Ii+1
+ veor $t1,$t1,$In @ Karatsuba pre-processing
+ vpmull2.p64 $Xhn,$H,$In
+ b .Loop_mod2x_v8
+
+.align 4
+.Loop_mod2x_v8:
+ vext.8 $t2,$IN,$IN,#8
+ subs $len,$len,#32 @ is there more data?
+ vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
+ cclr $inc,lo @ is it time to zero $inc?
+
+ vpmull.p64 $Xmn,$Hhl,$t1
+ veor $t2,$t2,$IN @ Karatsuba pre-processing
+ vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
+ veor $Xl,$Xl,$Xln @ accumulate
+ vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
+
+ veor $Xh,$Xh,$Xhn
+ cclr $inc,eq @ is it time to zero $inc?
+ veor $Xm,$Xm,$Xmn
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
+#ifndef __ARMEB__
+ vrev64.8 $t0,$t0
+#endif
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
+
+#ifndef __ARMEB__
+ vrev64.8 $t1,$t1
+#endif
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ vext.8 $In,$t1,$t1,#8
+ vext.8 $IN,$t0,$t0,#8
+ veor $Xl,$Xm,$t2
+ vpmull.p64 $Xln,$H,$In @ H·Ii+1
+ veor $IN,$IN,$Xh @ accumulate $IN early
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $IN,$IN,$t2
+ veor $t1,$t1,$In @ Karatsuba pre-processing
+ veor $IN,$IN,$Xl
+ vpmull2.p64 $Xhn,$H,$In
+	b.hs	.Loop_mod2x_v8		@ there were at least 32 more bytes
+
+ veor $Xh,$Xh,$t2
+ vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
+ adds $len,$len,#32 @ re-construct $len
+ veor $Xl,$Xl,$Xh @ re-construct $Xl
+ b.eq .Ldone_v8 @ is $len zero?
+___
+}
+$code.=<<___;
+.Lodd_tail_v8:
+ vext.8 $t2,$Xl,$Xl,#8
+ veor $IN,$IN,$Xl @ inp^=Xi
+ veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
+
+ vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
+ veor $t1,$t1,$IN @ Karatsuba pre-processing
+ vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
+ vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
+ veor $t2,$Xl,$Xh
+ veor $Xm,$Xm,$t1
+ veor $Xm,$Xm,$t2
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
+
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
+ veor $Xl,$Xm,$t2
+
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
+ vpmull.p64 $Xl,$Xl,$xC2
+ veor $t2,$t2,$Xh
+ veor $Xl,$Xl,$t2
+
+.Ldone_v8:
+#ifndef __ARMEB__
+ vrev64.8 $Xl,$Xl
+#endif
+ vext.8 $Xl,$Xl,$Xl,#8
+ vst1.64 {$Xl},[$Xi] @ write out Xi
+
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15} @ 32-bit ABI says so
+___
+$code.=<<___;
+ ret
+.size gcm_ghash_v8,.-gcm_ghash_v8
+___
+}
+$code.=<<___;
+.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+if ($flavour =~ /64/) { ######## 64-bit code
+ sub unvmov {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+ sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+ }
+ foreach(split("\n",$code)) {
+ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
+ s/vmov\s+(.*)/unvmov($1)/geo or
+ s/vext\.8/ext/o or
+ s/vshr\.s/sshr\.s/o or
+ s/vshr/ushr/o or
+ s/^(\s+)v/$1/o or # strip off v prefix
+ s/\bbx\s+lr\b/ret/o;
+
+ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
+ s/@\s/\/\//o; # old->new style commentary
+
+	# fix up remaining legacy suffixes
+ s/\.[ui]?8(\s)/$1/o;
+ s/\.[uis]?32//o and s/\.16b/\.4s/go;
+ m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
+ m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
+ s/\.[uisp]?64//o and s/\.16b/\.2d/go;
+ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+ print $_,"\n";
+ }
+} else { ######## 32-bit code
+ sub unvdup32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
+ sub unvpmullp64 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
+ my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ $word |= 0x00010001 if ($mnemonic =~ "2");
+	    # ARMv7 instructions are always encoded little-endian, so the
+	    # instruction word is emitted as raw bytes; the correct solution
+	    # would be the .inst directive, but older assemblers don't
+	    # implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+
+ foreach(split("\n",$code)) {
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+ s/\/\/\s?/@ /o; # new->old style commentary
+
+	# fix up remaining new-style suffixes
+ s/\],#[0-9]+/]!/o;
+
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/^(\s+)b\./$1b/o or
+ s/^(\s+)ret/$1bx\tlr/o;
+
+ print $_,"\n";
+ }
+}
+
+close STDOUT; # enforce flush
diff --git a/openssl-1.1.0h/crypto/modes/build.info b/openssl-1.1.0h/crypto/modes/build.info
new file mode 100644
index 0000000..38195c4
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/build.info
@@ -0,0 +1,27 @@
+LIBS=../../libcrypto
+SOURCE[../../libcrypto]=\
+ cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
+ ccm128.c xts128.c wrap128.c ocb128.c \
+ {- $target{modes_asm_src} -}
+
+INCLUDE[gcm128.o]=..
+
+GENERATE[ghash-ia64.s]=asm/ghash-ia64.pl $(CFLAGS) $(LIB_CFLAGS)
+GENERATE[ghash-x86.s]=asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+GENERATE[ghash-x86_64.s]=asm/ghash-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-gcm-x86_64.s]=asm/aesni-gcm-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[ghash-sparcv9.S]=asm/ghash-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[ghash-sparcv9.o]=..
+GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl $(PERLASM_SCHEME)
+GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl $(PERLASM_SCHEME)
+GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl $(PERLASM_SCHEME)
+GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl $(PERLASM_SCHEME)
+INCLUDE[ghash-armv4.o]=..
+GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl $(PERLASM_SCHEME)
+INCLUDE[ghashv8-armx.o]=..
+
+BEGINRAW[Makefile]
+# GNU make "catch all"
+{- $builddir -}/ghash-%.S: {- $sourcedir -}/asm/ghash-%.pl
+ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+ENDRAW[Makefile]
diff --git a/openssl-1.1.0h/crypto/modes/cbc128.c b/openssl-1.1.0h/crypto/modes/cbc128.c
new file mode 100644
index 0000000..4ce5eb2
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/cbc128.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#if !defined(STRICT_ALIGNMENT) && !defined(PEDANTIC)
+# define STRICT_ALIGNMENT 0
+#endif
+
+void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16], block128_f block)
+{
+ size_t n;
+ const unsigned char *iv = ivec;
+
+ if (len == 0)
+ return;
+
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (STRICT_ALIGNMENT &&
+ ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+ while (len >= 16) {
+ for (n = 0; n < 16; ++n)
+ out[n] = in[n] ^ iv[n];
+ (*block) (out, out, key);
+ iv = out;
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ } else {
+ while (len >= 16) {
+ for (n = 0; n < 16; n += sizeof(size_t))
+ *(size_t *)(out + n) =
+ *(size_t *)(in + n) ^ *(size_t *)(iv + n);
+ (*block) (out, out, key);
+ iv = out;
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ }
+#endif
+ while (len) {
+ for (n = 0; n < 16 && n < len; ++n)
+ out[n] = in[n] ^ iv[n];
+ for (; n < 16; ++n)
+ out[n] = iv[n];
+ (*block) (out, out, key);
+ iv = out;
+ if (len <= 16)
+ break;
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ memcpy(ivec, iv, 16);
+}
+
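+/*
+ * A minimal usage sketch (illustrative, not part of the library): an AES
+ * key schedule is wired in through the block128_f cast, the way callers
+ * such as the selftest code in cts128.c do. Guarded behind a hypothetical
+ * macro so it never compiles by default.
+ */
+#ifdef CBC128_USAGE_EXAMPLE
+# include <openssl/aes.h>
+static void cbc_encrypt_example(const unsigned char *in, unsigned char *out,
+                                size_t len, const AES_KEY *ks,
+                                unsigned char iv[16])
+{
+    /* iv is updated in place, so back-to-back calls chain correctly */
+    CRYPTO_cbc128_encrypt(in, out, len, ks, iv, (block128_f)AES_encrypt);
+}
+#endif
+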
+void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16], block128_f block)
+{
+ size_t n;
+ union {
+ size_t t[16 / sizeof(size_t)];
+ unsigned char c[16];
+ } tmp;
+
+ if (len == 0)
+ return;
+
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (in != out) {
+ const unsigned char *iv = ivec;
+
+ if (STRICT_ALIGNMENT &&
+ ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+ while (len >= 16) {
+ (*block) (in, out, key);
+ for (n = 0; n < 16; ++n)
+ out[n] ^= iv[n];
+ iv = in;
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ } else if (16 % sizeof(size_t) == 0) { /* always true */
+ while (len >= 16) {
+ size_t *out_t = (size_t *)out, *iv_t = (size_t *)iv;
+
+ (*block) (in, out, key);
+ for (n = 0; n < 16 / sizeof(size_t); n++)
+ out_t[n] ^= iv_t[n];
+ iv = in;
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ }
+ memcpy(ivec, iv, 16);
+ } else {
+ if (STRICT_ALIGNMENT &&
+ ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+ unsigned char c;
+ while (len >= 16) {
+ (*block) (in, tmp.c, key);
+ for (n = 0; n < 16; ++n) {
+ c = in[n];
+ out[n] = tmp.c[n] ^ ivec[n];
+ ivec[n] = c;
+ }
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ } else if (16 % sizeof(size_t) == 0) { /* always true */
+ while (len >= 16) {
+ size_t c, *out_t = (size_t *)out, *ivec_t = (size_t *)ivec;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (in, tmp.c, key);
+ for (n = 0; n < 16 / sizeof(size_t); n++) {
+ c = in_t[n];
+ out_t[n] = tmp.t[n] ^ ivec_t[n];
+ ivec_t[n] = c;
+ }
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+ }
+ }
+#endif
+ while (len) {
+ unsigned char c;
+ (*block) (in, tmp.c, key);
+ for (n = 0; n < 16 && n < len; ++n) {
+ c = in[n];
+ out[n] = tmp.c[n] ^ ivec[n];
+ ivec[n] = c;
+ }
+ if (len <= 16) {
+ for (; n < 16; ++n)
+ ivec[n] = in[n];
+ break;
+ }
+ len -= 16;
+ in += 16;
+ out += 16;
+ }
+}
diff --git a/openssl-1.1.0h/crypto/modes/ccm128.c b/openssl-1.1.0h/crypto/modes/ccm128.c
new file mode 100644
index 0000000..85ce84f
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/ccm128.c
@@ -0,0 +1,432 @@
+/*
+ * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+/*
+ * First you set up the M and L parameters and pass the key schedule. This
+ * is called once per session setup...
+ */
+void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
+ unsigned int M, unsigned int L, void *key,
+ block128_f block)
+{
+ memset(ctx->nonce.c, 0, sizeof(ctx->nonce.c));
+ ctx->nonce.c[0] = ((u8)(L - 1) & 7) | (u8)(((M - 2) / 2) & 7) << 3;
+ ctx->blocks = 0;
+ ctx->block = block;
+ ctx->key = key;
+}
+
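+/*
+ * Worked example (illustrative): for M = 16 and L = 8 the flags octet
+ * above becomes ((8-1) & 7) | (((16-2)/2 & 7) << 3) = 7 | (7 << 3) = 0x3f,
+ * i.e. the RFC 3610 encoding with L-1 in the low three bits and (M-2)/2
+ * in the next three.
+ */
+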
+/* !!! Following interfaces are to be called *once* per packet !!! */
+
+/* Then you set up the per-message nonce and pass the length of the message */
+int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
+ const unsigned char *nonce, size_t nlen, size_t mlen)
+{
+ unsigned int L = ctx->nonce.c[0] & 7; /* the L parameter */
+
+ if (nlen < (14 - L))
+ return -1; /* nonce is too short */
+
+ if (sizeof(mlen) == 8 && L >= 3) {
+ ctx->nonce.c[8] = (u8)(mlen >> (56 % (sizeof(mlen) * 8)));
+ ctx->nonce.c[9] = (u8)(mlen >> (48 % (sizeof(mlen) * 8)));
+ ctx->nonce.c[10] = (u8)(mlen >> (40 % (sizeof(mlen) * 8)));
+ ctx->nonce.c[11] = (u8)(mlen >> (32 % (sizeof(mlen) * 8)));
+ } else
+ ctx->nonce.u[1] = 0;
+
+ ctx->nonce.c[12] = (u8)(mlen >> 24);
+ ctx->nonce.c[13] = (u8)(mlen >> 16);
+ ctx->nonce.c[14] = (u8)(mlen >> 8);
+ ctx->nonce.c[15] = (u8)mlen;
+
+ ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
+ memcpy(&ctx->nonce.c[1], nonce, 14 - L);
+
+ return 0;
+}
+
+/* Then you pass the additional authentication data; this is optional */
+void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
+ const unsigned char *aad, size_t alen)
+{
+ unsigned int i;
+ block128_f block = ctx->block;
+
+ if (alen == 0)
+ return;
+
+ ctx->nonce.c[0] |= 0x40; /* set Adata flag */
+ (*block) (ctx->nonce.c, ctx->cmac.c, ctx->key), ctx->blocks++;
+
+ if (alen < (0x10000 - 0x100)) {
+ ctx->cmac.c[0] ^= (u8)(alen >> 8);
+ ctx->cmac.c[1] ^= (u8)alen;
+ i = 2;
+ } else if (sizeof(alen) == 8
+ && alen >= (size_t)1 << (32 % (sizeof(alen) * 8))) {
+ ctx->cmac.c[0] ^= 0xFF;
+ ctx->cmac.c[1] ^= 0xFF;
+ ctx->cmac.c[2] ^= (u8)(alen >> (56 % (sizeof(alen) * 8)));
+ ctx->cmac.c[3] ^= (u8)(alen >> (48 % (sizeof(alen) * 8)));
+ ctx->cmac.c[4] ^= (u8)(alen >> (40 % (sizeof(alen) * 8)));
+ ctx->cmac.c[5] ^= (u8)(alen >> (32 % (sizeof(alen) * 8)));
+ ctx->cmac.c[6] ^= (u8)(alen >> 24);
+ ctx->cmac.c[7] ^= (u8)(alen >> 16);
+ ctx->cmac.c[8] ^= (u8)(alen >> 8);
+ ctx->cmac.c[9] ^= (u8)alen;
+ i = 10;
+ } else {
+ ctx->cmac.c[0] ^= 0xFF;
+ ctx->cmac.c[1] ^= 0xFE;
+ ctx->cmac.c[2] ^= (u8)(alen >> 24);
+ ctx->cmac.c[3] ^= (u8)(alen >> 16);
+ ctx->cmac.c[4] ^= (u8)(alen >> 8);
+ ctx->cmac.c[5] ^= (u8)alen;
+ i = 6;
+ }
+
+ do {
+ for (; i < 16 && alen; ++i, ++aad, --alen)
+ ctx->cmac.c[i] ^= *aad;
+ (*block) (ctx->cmac.c, ctx->cmac.c, ctx->key), ctx->blocks++;
+ i = 0;
+ } while (alen);
+}
+
+/* Finally you encrypt or decrypt the message */
+
+/*
+ * The counter part of the nonce may not be larger than L*8 bits, and L is
+ * not larger than 8, therefore a 64-bit counter suffices...
+ */
+static void ctr64_inc(unsigned char *counter)
+{
+ unsigned int n = 8;
+ u8 c;
+
+ counter += 8;
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c)
+ return;
+ } while (n);
+}
+
+int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len)
+{
+ size_t n;
+ unsigned int i, L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+ union {
+ u64 u[2];
+ u8 c[16];
+ } scratch;
+
+ if (!(flags0 & 0x40))
+ (*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
+
+ ctx->nonce.c[0] = L = flags0 & 7;
+ for (n = 0, i = 15 - L; i < 15; ++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i] = 0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15] = 1;
+
+ if (n != len)
+ return -1; /* length mismatch */
+
+ ctx->blocks += ((len + 15) >> 3) | 1;
+ if (ctx->blocks > (U64(1) << 61))
+ return -2; /* too much data */
+
+ while (len >= 16) {
+#if defined(STRICT_ALIGNMENT)
+ union {
+ u64 u[2];
+ u8 c[16];
+ } temp;
+
+ memcpy(temp.c, inp, 16);
+ ctx->cmac.u[0] ^= temp.u[0];
+ ctx->cmac.u[1] ^= temp.u[1];
+#else
+ ctx->cmac.u[0] ^= ((u64 *)inp)[0];
+ ctx->cmac.u[1] ^= ((u64 *)inp)[1];
+#endif
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+ temp.u[0] ^= scratch.u[0];
+ temp.u[1] ^= scratch.u[1];
+ memcpy(out, temp.c, 16);
+#else
+ ((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0];
+ ((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1];
+#endif
+ inp += 16;
+ out += 16;
+ len -= 16;
+ }
+
+ if (len) {
+ for (i = 0; i < len; ++i)
+ ctx->cmac.c[i] ^= inp[i];
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
+ (*block) (ctx->nonce.c, scratch.c, key);
+ for (i = 0; i < len; ++i)
+ out[i] = scratch.c[i] ^ inp[i];
+ }
+
+ for (i = 15 - L; i < 16; ++i)
+ ctx->nonce.c[i] = 0;
+
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len)
+{
+ size_t n;
+ unsigned int i, L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+ union {
+ u64 u[2];
+ u8 c[16];
+ } scratch;
+
+ if (!(flags0 & 0x40))
+ (*block) (ctx->nonce.c, ctx->cmac.c, key);
+
+ ctx->nonce.c[0] = L = flags0 & 7;
+ for (n = 0, i = 15 - L; i < 15; ++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i] = 0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15] = 1;
+
+ if (n != len)
+ return -1;
+
+ while (len >= 16) {
+#if defined(STRICT_ALIGNMENT)
+ union {
+ u64 u[2];
+ u8 c[16];
+ } temp;
+#endif
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+ memcpy(temp.c, inp, 16);
+ ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
+ ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
+ memcpy(out, scratch.c, 16);
+#else
+ ctx->cmac.u[0] ^= (((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0]);
+ ctx->cmac.u[1] ^= (((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1]);
+#endif
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
+
+ inp += 16;
+ out += 16;
+ len -= 16;
+ }
+
+ if (len) {
+ (*block) (ctx->nonce.c, scratch.c, key);
+ for (i = 0; i < len; ++i)
+ ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
+ }
+
+ for (i = 15 - L; i < 16; ++i)
+ ctx->nonce.c[i] = 0;
+
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+static void ctr64_add(unsigned char *counter, size_t inc)
+{
+ size_t n = 8, val = 0;
+
+ counter += 8;
+ do {
+ --n;
+ val += counter[n] + (inc & 0xff);
+ counter[n] = (unsigned char)val;
+ val >>= 8; /* carry bit */
+ inc >>= 8;
+ } while (n && (inc || val));
+}
+
+int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len, ccm128_f stream)
+{
+ size_t n;
+ unsigned int i, L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+ union {
+ u64 u[2];
+ u8 c[16];
+ } scratch;
+
+ if (!(flags0 & 0x40))
+ (*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
+
+ ctx->nonce.c[0] = L = flags0 & 7;
+ for (n = 0, i = 15 - L; i < 15; ++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i] = 0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15] = 1;
+
+ if (n != len)
+ return -1; /* length mismatch */
+
+ ctx->blocks += ((len + 15) >> 3) | 1;
+ if (ctx->blocks > (U64(1) << 61))
+ return -2; /* too much data */
+
+ if ((n = len / 16)) {
+ (*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
+ n *= 16;
+ inp += n;
+ out += n;
+ len -= n;
+ if (len)
+ ctr64_add(ctx->nonce.c, n / 16);
+ }
+
+ if (len) {
+ for (i = 0; i < len; ++i)
+ ctx->cmac.c[i] ^= inp[i];
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
+ (*block) (ctx->nonce.c, scratch.c, key);
+ for (i = 0; i < len; ++i)
+ out[i] = scratch.c[i] ^ inp[i];
+ }
+
+ for (i = 15 - L; i < 16; ++i)
+ ctx->nonce.c[i] = 0;
+
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len, ccm128_f stream)
+{
+ size_t n;
+ unsigned int i, L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+ union {
+ u64 u[2];
+ u8 c[16];
+ } scratch;
+
+ if (!(flags0 & 0x40))
+ (*block) (ctx->nonce.c, ctx->cmac.c, key);
+
+ ctx->nonce.c[0] = L = flags0 & 7;
+ for (n = 0, i = 15 - L; i < 15; ++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i] = 0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15] = 1;
+
+ if (n != len)
+ return -1;
+
+ if ((n = len / 16)) {
+ (*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
+ n *= 16;
+ inp += n;
+ out += n;
+ len -= n;
+ if (len)
+ ctr64_add(ctx->nonce.c, n / 16);
+ }
+
+ if (len) {
+ (*block) (ctx->nonce.c, scratch.c, key);
+ for (i = 0; i < len; ++i)
+ ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
+ (*block) (ctx->cmac.c, ctx->cmac.c, key);
+ }
+
+ for (i = 15 - L; i < 16; ++i)
+ ctx->nonce.c[i] = 0;
+
+ (*block) (ctx->nonce.c, scratch.c, key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
+{
+ unsigned int M = (ctx->nonce.c[0] >> 3) & 7; /* the M parameter */
+
+ M *= 2;
+ M += 2;
+ if (len < M)
+ return 0;
+ memcpy(tag, ctx->cmac.c, M);
+ return M;
+}
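+
+/*
+ * Hypothetical end-to-end sketch (illustrative, not part of the library):
+ * the per-packet sequence the comments above describe -- setiv, optional
+ * aad, encrypt, then tag. The name and AES wiring are assumptions; nlen
+ * must lie in the 7..13 range so that L = 15 - nlen is valid. Guarded
+ * behind a hypothetical macro so it never compiles by default.
+ */
+#ifdef CCM128_USAGE_EXAMPLE
+# include <openssl/aes.h>
+static int ccm_seal_example(CCM128_CONTEXT *ctx, AES_KEY *ks,
+                            const unsigned char *nonce, size_t nlen,
+                            const unsigned char *aad, size_t alen,
+                            const unsigned char *msg, unsigned char *out,
+                            size_t mlen, unsigned char tag[16])
+{
+    CRYPTO_ccm128_init(ctx, 16, 15 - (unsigned int)nlen, ks,
+                       (block128_f)AES_encrypt);
+    if (CRYPTO_ccm128_setiv(ctx, nonce, nlen, mlen))
+        return -1;                      /* nonce too short */
+    if (alen)
+        CRYPTO_ccm128_aad(ctx, aad, alen);
+    if (CRYPTO_ccm128_encrypt(ctx, msg, out, mlen))
+        return -1;                      /* length mismatch or too much data */
+    return CRYPTO_ccm128_tag(ctx, tag, 16) == 16 ? 0 : -1;
+}
+#endif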
diff --git a/openssl-1.1.0h/crypto/modes/cfb128.c b/openssl-1.1.0h/crypto/modes/cfb128.c
new file mode 100644
index 0000000..e439567
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/cfb128.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+/*
+ * The input and output are encrypted as though 128 bit CFB mode were being
+ * used. The extra state information to record how much of the 128 bit
+ * block we have used is contained in *num.
+ */
+void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16], int *num,
+ int enc, block128_f block)
+{
+ unsigned int n;
+ size_t l = 0;
+
+ n = *num;
+
+ if (enc) {
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ while (n && len) {
+ *(out++) = ivec[n] ^= *(in++);
+ --len;
+ n = (n + 1) % 16;
+ }
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out | (size_t)ivec) %
+ sizeof(size_t) != 0)
+ break;
+# endif
+ while (len >= 16) {
+ (*block) (ivec, ivec, key);
+ for (; n < 16; n += sizeof(size_t)) {
+ *(size_t *)(out + n) =
+ *(size_t *)(ivec + n) ^= *(size_t *)(in + n);
+ }
+ len -= 16;
+ out += 16;
+ in += 16;
+ n = 0;
+ }
+ if (len) {
+ (*block) (ivec, ivec, key);
+ while (len--) {
+ out[n] = ivec[n] ^= in[n];
+ ++n;
+ }
+ }
+ *num = n;
+ return;
+ } while (0);
+ }
+        /* the rest would commonly be eliminated by an x86* compiler */
+#endif
+ while (l < len) {
+ if (n == 0) {
+ (*block) (ivec, ivec, key);
+ }
+ out[l] = ivec[n] ^= in[l];
+ ++l;
+ n = (n + 1) % 16;
+ }
+ *num = n;
+ } else {
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ while (n && len) {
+ unsigned char c;
+ *(out++) = ivec[n] ^ (c = *(in++));
+ ivec[n] = c;
+ --len;
+ n = (n + 1) % 16;
+ }
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out | (size_t)ivec) %
+ sizeof(size_t) != 0)
+ break;
+# endif
+ while (len >= 16) {
+ (*block) (ivec, ivec, key);
+ for (; n < 16; n += sizeof(size_t)) {
+ size_t t = *(size_t *)(in + n);
+ *(size_t *)(out + n) = *(size_t *)(ivec + n) ^ t;
+ *(size_t *)(ivec + n) = t;
+ }
+ len -= 16;
+ out += 16;
+ in += 16;
+ n = 0;
+ }
+ if (len) {
+ (*block) (ivec, ivec, key);
+ while (len--) {
+ unsigned char c;
+ out[n] = ivec[n] ^ (c = in[n]);
+ ivec[n] = c;
+ ++n;
+ }
+ }
+ *num = n;
+ return;
+ } while (0);
+ }
+        /* the rest would commonly be eliminated by an x86* compiler */
+#endif
+ while (l < len) {
+ unsigned char c;
+ if (n == 0) {
+ (*block) (ivec, ivec, key);
+ }
+ out[l] = ivec[n] ^ (c = in[l]);
+ ivec[n] = c;
+ ++l;
+ n = (n + 1) % 16;
+ }
+ *num = n;
+ }
+}
+
+/*
+ * This expects a single block of size nbits for both in and out. Note that
+ * it corrupts any extra bits in the last byte of out
+ */
+static void cfbr_encrypt_block(const unsigned char *in, unsigned char *out,
+ int nbits, const void *key,
+ unsigned char ivec[16], int enc,
+ block128_f block)
+{
+ int n, rem, num;
+ unsigned char ovec[16 * 2 + 1]; /* +1 because we dereference (but don't
+ * use) one byte off the end */
+
+ if (nbits <= 0 || nbits > 128)
+ return;
+
+ /* fill in the first half of the new IV with the current IV */
+ memcpy(ovec, ivec, 16);
+ /* construct the new IV */
+ (*block) (ivec, ivec, key);
+ num = (nbits + 7) / 8;
+ if (enc) /* encrypt the input */
+ for (n = 0; n < num; ++n)
+ out[n] = (ovec[16 + n] = in[n] ^ ivec[n]);
+ else /* decrypt the input */
+ for (n = 0; n < num; ++n)
+ out[n] = (ovec[16 + n] = in[n]) ^ ivec[n];
+ /* shift ovec left... */
+ rem = nbits % 8;
+ num = nbits / 8;
+ if (rem == 0)
+ memcpy(ivec, ovec + num, 16);
+ else
+ for (n = 0; n < 16; ++n)
+ ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem);
+
+ /* it is not necessary to cleanse ovec, since the IV is not secret */
+}
+
+/* N.B. This expects the input to be packed, MS bit first */
+void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
+ size_t bits, const void *key,
+ unsigned char ivec[16], int *num,
+ int enc, block128_f block)
+{
+ size_t n;
+ unsigned char c[1], d[1];
+
+ for (n = 0; n < bits; ++n) {
+ c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0;
+ cfbr_encrypt_block(c, d, 1, key, ivec, enc, block);
+ out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) |
+ ((d[0] & 0x80) >> (unsigned int)(n % 8));
+ }
+}
+
+void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const void *key,
+ unsigned char ivec[16], int *num,
+ int enc, block128_f block)
+{
+ size_t n;
+
+ for (n = 0; n < length; ++n)
+ cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);
+}
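+
+/*
+ * A minimal usage sketch (illustrative, not part of the library): the
+ * carried-over *num state lets arbitrary-length chunks be fed across
+ * calls, so the two calls below behave exactly like one 32-byte call.
+ * The name and AES wiring are assumptions; guarded behind a hypothetical
+ * macro so it never compiles by default.
+ */
+#ifdef CFB128_USAGE_EXAMPLE
+# include <openssl/aes.h>
+static void cfb_stream_example(const AES_KEY *ks, unsigned char iv[16])
+{
+    unsigned char chunk1[10] = { 0 }, chunk2[22] = { 0 }, out[32];
+    int num = 0;                /* must start at 0; offset within block */
+
+    CRYPTO_cfb128_encrypt(chunk1, out, sizeof(chunk1), ks, iv, &num, 1,
+                          (block128_f)AES_encrypt);
+    CRYPTO_cfb128_encrypt(chunk2, out + 10, sizeof(chunk2), ks, iv, &num, 1,
+                          (block128_f)AES_encrypt);
+}
+#endif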
diff --git a/openssl-1.1.0h/crypto/modes/ctr128.c b/openssl-1.1.0h/crypto/modes/ctr128.c
new file mode 100644
index 0000000..03920b4
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/ctr128.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+/*
+ * NOTE: the IV/counter in CTR mode is big-endian. The code itself is
+ * endian-neutral.
+ */
+
+/* increment counter (128-bit int) by 1 */
+static void ctr128_inc(unsigned char *counter)
+{
+ u32 n = 16, c = 1;
+
+ do {
+ --n;
+ c += counter[n];
+ counter[n] = (u8)c;
+ c >>= 8;
+ } while (n);
+}
+
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+static void ctr128_inc_aligned(unsigned char *counter)
+{
+ size_t *data, c, d, n;
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+
+ if (is_endian.little || ((size_t)counter % sizeof(size_t)) != 0) {
+ ctr128_inc(counter);
+ return;
+ }
+
+ data = (size_t *)counter;
+ c = 1;
+ n = 16 / sizeof(size_t);
+ do {
+ --n;
+ d = data[n] += c;
+ /* did addition carry? */
+ c = ((d - c) & ~d) >> (sizeof(size_t) * 8 - 1);
+ } while (n);
+}
+#endif
+
+/*
+ * The input is encrypted as though 128 bit counter mode were being used.
+ * The extra state information to record how much of the 128 bit block we
+ * have used is contained in *num, and the encrypted counter is kept in
+ * ecount_buf. Both *num and ecount_buf must be initialised with zeros
+ * before the first call to CRYPTO_ctr128_encrypt(). This algorithm assumes
+ * that the counter occupies the low-order x bits of the IV (ivec), and
+ * that the application has full control over overflow and the rest of the
+ * IV. This implementation takes NO responsibility for checking that the
+ * counter doesn't overflow into the rest of the IV when incremented.
+ */
+void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16],
+ unsigned char ecount_buf[16], unsigned int *num,
+ block128_f block)
+{
+ unsigned int n;
+ size_t l = 0;
+
+ n = *num;
+
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ while (n && len) {
+ *(out++) = *(in++) ^ ecount_buf[n];
+ --len;
+ n = (n + 1) % 16;
+ }
+
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out | (size_t)ecount_buf)
+ % sizeof(size_t) != 0)
+ break;
+# endif
+ while (len >= 16) {
+ (*block) (ivec, ecount_buf, key);
+ ctr128_inc_aligned(ivec);
+ for (n = 0; n < 16; n += sizeof(size_t))
+ *(size_t *)(out + n) =
+ *(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n);
+ len -= 16;
+ out += 16;
+ in += 16;
+ n = 0;
+ }
+ if (len) {
+ (*block) (ivec, ecount_buf, key);
+ ctr128_inc_aligned(ivec);
+ while (len--) {
+ out[n] = in[n] ^ ecount_buf[n];
+ ++n;
+ }
+ }
+ *num = n;
+ return;
+ } while (0);
+ }
+    /* the rest would commonly be eliminated by an x86* compiler */
+#endif
+ while (l < len) {
+ if (n == 0) {
+ (*block) (ivec, ecount_buf, key);
+ ctr128_inc(ivec);
+ }
+ out[l] = in[l] ^ ecount_buf[n];
+ ++l;
+ n = (n + 1) % 16;
+ }
+
+ *num = n;
+}
+
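+/*
+ * A minimal usage sketch (illustrative, not part of the library),
+ * restating the contract above: both *num and ecount_buf start zeroed,
+ * and the name and AES wiring are assumptions. Guarded behind a
+ * hypothetical macro so it never compiles by default.
+ */
+#ifdef CTR128_USAGE_EXAMPLE
+# include <openssl/aes.h>
+static void ctr_example(const AES_KEY *ks, unsigned char ivec[16],
+                        const unsigned char *in, unsigned char *out,
+                        size_t len)
+{
+    unsigned char ecount[16] = { 0 };   /* encrypted-counter cache */
+    unsigned int num = 0;               /* offset within current block */
+
+    CRYPTO_ctr128_encrypt(in, out, len, ks, ivec, ecount, &num,
+                          (block128_f)AES_encrypt);
+}
+#endif
+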
+/* increment upper 96 bits of 128-bit counter by 1 */
+static void ctr96_inc(unsigned char *counter)
+{
+ u32 n = 12, c = 1;
+
+ do {
+ --n;
+ c += counter[n];
+ counter[n] = (u8)c;
+ c >>= 8;
+ } while (n);
+}
+
+void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16],
+ unsigned char ecount_buf[16],
+ unsigned int *num, ctr128_f func)
+{
+ unsigned int n, ctr32;
+
+ n = *num;
+
+ while (n && len) {
+ *(out++) = *(in++) ^ ecount_buf[n];
+ --len;
+ n = (n + 1) % 16;
+ }
+
+ ctr32 = GETU32(ivec + 12);
+ while (len >= 16) {
+ size_t blocks = len / 16;
+        /*
+         * 1<<28 is just a not-so-small yet not-so-large number...
+         * The condition below is practically never met, but it has to
+         * be checked for code correctness.
+         */
+ if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28))
+ blocks = (1U << 28);
+        /*
+         * As (*func) operates on a 32-bit counter, the caller has to
+         * handle overflow. The 'if' below detects the overflow and
+         * handles it by limiting the number of blocks to the exact
+         * overflow point...
+         */
+ ctr32 += (u32)blocks;
+ if (ctr32 < blocks) {
+ blocks -= ctr32;
+ ctr32 = 0;
+ }
+ (*func) (in, out, blocks, key, ivec);
+ /* (*ctr) does not update ivec, caller does: */
+ PUTU32(ivec + 12, ctr32);
+ /* ... overflow was detected, propagate carry. */
+ if (ctr32 == 0)
+ ctr96_inc(ivec);
+ blocks *= 16;
+ len -= blocks;
+ out += blocks;
+ in += blocks;
+ }
+ if (len) {
+ memset(ecount_buf, 0, 16);
+ (*func) (ecount_buf, ecount_buf, 1, key, ivec);
+ ++ctr32;
+ PUTU32(ivec + 12, ctr32);
+ if (ctr32 == 0)
+ ctr96_inc(ivec);
+ while (len--) {
+ out[n] = in[n] ^ ecount_buf[n];
+ ++n;
+ }
+ }
+
+ *num = n;
+}
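+
+/*
+ * Worked example of the overflow split above (illustrative): with
+ * ctr32 = 0xffffff00 and blocks = 0x200, ctr32 += blocks wraps to
+ * 0x100 (< blocks), so blocks is trimmed to 0x200 - 0x100 = 0x100 --
+ * exactly the distance to the wrap point -- and the 96-bit carry is
+ * propagated before the remaining 0x100 blocks are processed.
+ */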
diff --git a/openssl-1.1.0h/crypto/modes/cts128.c b/openssl-1.1.0h/crypto/modes/cts128.c
new file mode 100644
index 0000000..77ec994
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/cts128.c
@@ -0,0 +1,523 @@
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+/*
+ * The trouble with Ciphertext Stealing (CTS) mode is that there is no
+ * common official specification, just a couple of cipher/application
+ * specific ones: RFC 2040 and RFC 3962. Then there is the 'Proposal to
+ * Extend CBC Mode By "Ciphertext Stealing"' at the NIST site, which
+ * deviates from the mentioned RFCs. Most notably it allows the input to
+ * be as short as one block and it doesn't flip the order of the last
+ * two blocks. CTS has been discussed even in an ECB context, but it has
+ * not been adopted for any known application. This implementation
+ * provides two interfaces: one compliant with the above mentioned RFCs
+ * and one compliant with the NIST proposal, both extending CBC mode.
+ */
+
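+/*
+ * Worked example (illustrative): with len = 20 the RFC flavour below
+ * CBC-encrypts the first 16 bytes, xors the 4 residual input bytes into
+ * the chaining value and encrypts once more, then emits the two final
+ * pieces swapped: the last full ciphertext block first, followed by the
+ * 4 stolen bytes of the previous block. The NIST flavour keeps natural
+ * order and also accepts exactly block-aligned input.
+ */
+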
+size_t CRYPTO_cts128_encrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key, unsigned char ivec[16],
+ block128_f block)
+{
+ size_t residue, n;
+
+ if (len <= 16)
+ return 0;
+
+ if ((residue = len % 16) == 0)
+ residue = 16;
+
+ len -= residue;
+
+ CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
+
+ in += len;
+ out += len;
+
+ for (n = 0; n < residue; ++n)
+ ivec[n] ^= in[n];
+ (*block) (ivec, ivec, key);
+ memcpy(out, out - 16, residue);
+ memcpy(out - 16, ivec, 16);
+
+ return len + residue;
+}
+
+size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key,
+ unsigned char ivec[16],
+ block128_f block)
+{
+ size_t residue, n;
+
+ if (len < 16)
+ return 0;
+
+ residue = len % 16;
+
+ len -= residue;
+
+ CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
+
+ if (residue == 0)
+ return len;
+
+ in += len;
+ out += len;
+
+ for (n = 0; n < residue; ++n)
+ ivec[n] ^= in[n];
+ (*block) (ivec, ivec, key);
+ memcpy(out - 16 + residue, ivec, 16);
+
+ return len + residue;
+}
+
+size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc)
+{
+ size_t residue;
+ union {
+ size_t align;
+ unsigned char c[16];
+ } tmp;
+
+ if (len <= 16)
+ return 0;
+
+ if ((residue = len % 16) == 0)
+ residue = 16;
+
+ len -= residue;
+
+ (*cbc) (in, out, len, key, ivec, 1);
+
+ in += len;
+ out += len;
+
+#if defined(CBC_HANDLES_TRUNCATED_IO)
+ memcpy(tmp.c, out - 16, 16);
+ (*cbc) (in, out - 16, residue, key, ivec, 1);
+ memcpy(out, tmp.c, residue);
+#else
+ memset(tmp.c, 0, sizeof(tmp));
+ memcpy(tmp.c, in, residue);
+ memcpy(out, out - 16, residue);
+ (*cbc) (tmp.c, out - 16, 16, key, ivec, 1);
+#endif
+ return len + residue;
+}
+
+size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc)
+{
+ size_t residue;
+ union {
+ size_t align;
+ unsigned char c[16];
+ } tmp;
+
+ if (len < 16)
+ return 0;
+
+ residue = len % 16;
+
+ len -= residue;
+
+ (*cbc) (in, out, len, key, ivec, 1);
+
+ if (residue == 0)
+ return len;
+
+ in += len;
+ out += len;
+
+#if defined(CBC_HANDLES_TRUNCATED_IO)
+ (*cbc) (in, out - 16 + residue, residue, key, ivec, 1);
+#else
+ memset(tmp.c, 0, sizeof(tmp));
+ memcpy(tmp.c, in, residue);
+ (*cbc) (tmp.c, out - 16 + residue, 16, key, ivec, 1);
+#endif
+ return len + residue;
+}
+
+size_t CRYPTO_cts128_decrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key, unsigned char ivec[16],
+ block128_f block)
+{
+ size_t residue, n;
+ union {
+ size_t align;
+ unsigned char c[32];
+ } tmp;
+
+ if (len <= 16)
+ return 0;
+
+ if ((residue = len % 16) == 0)
+ residue = 16;
+
+ len -= 16 + residue;
+
+ if (len) {
+ CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
+ in += len;
+ out += len;
+ }
+
+ (*block) (in, tmp.c + 16, key);
+
+ memcpy(tmp.c, tmp.c + 16, 16);
+ memcpy(tmp.c, in + 16, residue);
+ (*block) (tmp.c, tmp.c, key);
+
+ for (n = 0; n < 16; ++n) {
+ unsigned char c = in[n];
+ out[n] = tmp.c[n] ^ ivec[n];
+ ivec[n] = c;
+ }
+ for (residue += 16; n < residue; ++n)
+ out[n] = tmp.c[n] ^ in[n];
+
+ return 16 + len + residue;
+}
+
+size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in,
+ unsigned char *out, size_t len,
+ const void *key,
+ unsigned char ivec[16],
+ block128_f block)
+{
+ size_t residue, n;
+ union {
+ size_t align;
+ unsigned char c[32];
+ } tmp;
+
+ if (len < 16)
+ return 0;
+
+ residue = len % 16;
+
+ if (residue == 0) {
+ CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
+ return len;
+ }
+
+ len -= 16 + residue;
+
+ if (len) {
+ CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
+ in += len;
+ out += len;
+ }
+
+ (*block) (in + residue, tmp.c + 16, key);
+
+ memcpy(tmp.c, tmp.c + 16, 16);
+ memcpy(tmp.c, in, residue);
+ (*block) (tmp.c, tmp.c, key);
+
+ for (n = 0; n < 16; ++n) {
+ unsigned char c = in[n];
+ out[n] = tmp.c[n] ^ ivec[n];
+ ivec[n] = in[n + residue];
+ tmp.c[n] = c;
+ }
+ for (residue += 16; n < residue; ++n)
+ out[n] = tmp.c[n] ^ tmp.c[n - 16];
+
+ return 16 + len + residue;
+}
+
+size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc)
+{
+ size_t residue;
+ union {
+ size_t align;
+ unsigned char c[32];
+ } tmp;
+
+ if (len <= 16)
+ return 0;
+
+ if ((residue = len % 16) == 0)
+ residue = 16;
+
+ len -= 16 + residue;
+
+ if (len) {
+ (*cbc) (in, out, len, key, ivec, 0);
+ in += len;
+ out += len;
+ }
+
+ memset(tmp.c, 0, sizeof(tmp));
+ /*
+ * this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
+ */
+ (*cbc) (in, tmp.c, 16, key, tmp.c + 16, 0);
+
+ memcpy(tmp.c, in + 16, residue);
+#if defined(CBC_HANDLES_TRUNCATED_IO)
+ (*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
+#else
+ (*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
+ memcpy(out, tmp.c, 16 + residue);
+#endif
+ return 16 + len + residue;
+}
+
+size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16], cbc128_f cbc)
+{
+ size_t residue;
+ union {
+ size_t align;
+ unsigned char c[32];
+ } tmp;
+
+ if (len < 16)
+ return 0;
+
+ residue = len % 16;
+
+ if (residue == 0) {
+ (*cbc) (in, out, len, key, ivec, 0);
+ return len;
+ }
+
+ len -= 16 + residue;
+
+ if (len) {
+ (*cbc) (in, out, len, key, ivec, 0);
+ in += len;
+ out += len;
+ }
+
+ memset(tmp.c, 0, sizeof(tmp));
+ /*
+ * this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
+ */
+ (*cbc) (in + residue, tmp.c, 16, key, tmp.c + 16, 0);
+
+ memcpy(tmp.c, in, residue);
+#if defined(CBC_HANDLES_TRUNCATED_IO)
+ (*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
+#else
+ (*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
+ memcpy(out, tmp.c, 16 + residue);
+#endif
+ return 16 + len + residue;
+}
+
+#if defined(SELFTEST)
+# include <stdio.h>
+# include <openssl/aes.h>
+
+/* test vectors from RFC 3962 */
+static const unsigned char test_key[16] = "chicken teriyaki";
+static const unsigned char test_input[64] =
+ "I would like the" " General Gau's C"
+ "hicken, please, " "and wonton soup.";
+static const unsigned char test_iv[16] =
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+static const unsigned char vector_17[17] = {
+ 0xc6, 0x35, 0x35, 0x68, 0xf2, 0xbf, 0x8c, 0xb4,
+ 0xd8, 0xa5, 0x80, 0x36, 0x2d, 0xa7, 0xff, 0x7f,
+ 0x97
+};
+
+static const unsigned char vector_31[31] = {
+ 0xfc, 0x00, 0x78, 0x3e, 0x0e, 0xfd, 0xb2, 0xc1,
+ 0xd4, 0x45, 0xd4, 0xc8, 0xef, 0xf7, 0xed, 0x22,
+ 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+ 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5
+};
+
+static const unsigned char vector_32[32] = {
+ 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+ 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8,
+ 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+ 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84
+};
+
+static const unsigned char vector_47[47] = {
+ 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+ 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84,
+ 0xb3, 0xff, 0xfd, 0x94, 0x0c, 0x16, 0xa1, 0x8c,
+ 0x1b, 0x55, 0x49, 0xd2, 0xf8, 0x38, 0x02, 0x9e,
+ 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+ 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5
+};
+
+static const unsigned char vector_48[48] = {
+ 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+ 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84,
+ 0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0,
+ 0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8,
+ 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+ 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8
+};
+
+static const unsigned char vector_64[64] = {
+ 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0,
+ 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84,
+ 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5,
+ 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8,
+ 0x48, 0x07, 0xef, 0xe8, 0x36, 0xee, 0x89, 0xa5,
+ 0x26, 0x73, 0x0d, 0xbc, 0x2f, 0x7b, 0xc8, 0x40,
+ 0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0,
+ 0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8
+};
+
+static AES_KEY encks, decks;
+
+void test_vector(const unsigned char *vector, size_t len)
+{
+ unsigned char iv[sizeof(test_iv)];
+ unsigned char cleartext[64], ciphertext[64];
+ size_t tail;
+
+    printf("vector_%d\n", (int)len);
+ fflush(stdout);
+
+ if ((tail = len % 16) == 0)
+ tail = 16;
+ tail += 16;
+
+ /* test block-based encryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_cts128_encrypt_block(test_input, ciphertext, len, &encks, iv,
+ (block128_f) AES_encrypt);
+    if (memcmp(ciphertext, vector, len))
+        fprintf(stderr, "output_%d mismatch\n", (int)len), exit(1);
+    if (memcmp(iv, vector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", (int)len), exit(1);
+
+ /* test block-based decryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_cts128_decrypt_block(ciphertext, cleartext, len, &decks, iv,
+ (block128_f) AES_decrypt);
+    if (memcmp(cleartext, test_input, len))
+        fprintf(stderr, "input_%d mismatch\n", (int)len), exit(2);
+    if (memcmp(iv, vector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", (int)len), exit(2);
+
+ /* test streamed encryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_cts128_encrypt(test_input, ciphertext, len, &encks, iv,
+ (cbc128_f) AES_cbc_encrypt);
+    if (memcmp(ciphertext, vector, len))
+        fprintf(stderr, "output_%d mismatch\n", (int)len), exit(3);
+    if (memcmp(iv, vector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", (int)len), exit(3);
+
+ /* test streamed decryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_cts128_decrypt(ciphertext, cleartext, len, &decks, iv,
+ (cbc128_f) AES_cbc_encrypt);
+    if (memcmp(cleartext, test_input, len))
+        fprintf(stderr, "input_%d mismatch\n", (int)len), exit(4);
+    if (memcmp(iv, vector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", (int)len), exit(4);
+}
+
+void test_nistvector(const unsigned char *vector, size_t len)
+{
+ unsigned char iv[sizeof(test_iv)];
+ unsigned char cleartext[64], ciphertext[64], nistvector[64];
+ size_t tail;
+
+    printf("nistvector_%d\n", (int)len);
+ fflush(stdout);
+
+ if ((tail = len % 16) == 0)
+ tail = 16;
+
+ len -= 16 + tail;
+ memcpy(nistvector, vector, len);
+ /* flip two last blocks */
+ memcpy(nistvector + len, vector + len + 16, tail);
+ memcpy(nistvector + len + tail, vector + len, 16);
+ len += 16 + tail;
+ tail = 16;
+
+ /* test block-based encryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_nistcts128_encrypt_block(test_input, ciphertext, len, &encks, iv,
+ (block128_f) AES_encrypt);
+    if (memcmp(ciphertext, nistvector, len))
+        fprintf(stderr, "output_%d mismatch\n", (int)len), exit(1);
+    if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", (int)len), exit(1);
+
+ /* test block-based decryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_nistcts128_decrypt_block(ciphertext, cleartext, len, &decks, iv,
+ (block128_f) AES_decrypt);
+    if (memcmp(cleartext, test_input, len))
+        fprintf(stderr, "input_%d mismatch\n", (int)len), exit(2);
+    if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", (int)len), exit(2);
+
+ /* test streamed encryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_nistcts128_encrypt(test_input, ciphertext, len, &encks, iv,
+ (cbc128_f) AES_cbc_encrypt);
+    if (memcmp(ciphertext, nistvector, len))
+        fprintf(stderr, "output_%d mismatch\n", (int)len), exit(3);
+    if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", (int)len), exit(3);
+
+ /* test streamed decryption */
+ memcpy(iv, test_iv, sizeof(test_iv));
+ CRYPTO_nistcts128_decrypt(ciphertext, cleartext, len, &decks, iv,
+ (cbc128_f) AES_cbc_encrypt);
+    if (memcmp(cleartext, test_input, len))
+        fprintf(stderr, "input_%d mismatch\n", (int)len), exit(4);
+    if (memcmp(iv, nistvector + len - tail, sizeof(iv)))
+        fprintf(stderr, "iv_%d mismatch\n", (int)len), exit(4);
+}
+
+int main()
+{
+ AES_set_encrypt_key(test_key, 128, &encks);
+ AES_set_decrypt_key(test_key, 128, &decks);
+
+ test_vector(vector_17, sizeof(vector_17));
+ test_vector(vector_31, sizeof(vector_31));
+ test_vector(vector_32, sizeof(vector_32));
+ test_vector(vector_47, sizeof(vector_47));
+ test_vector(vector_48, sizeof(vector_48));
+ test_vector(vector_64, sizeof(vector_64));
+
+ test_nistvector(vector_17, sizeof(vector_17));
+ test_nistvector(vector_31, sizeof(vector_31));
+ test_nistvector(vector_32, sizeof(vector_32));
+ test_nistvector(vector_47, sizeof(vector_47));
+ test_nistvector(vector_48, sizeof(vector_48));
+ test_nistvector(vector_64, sizeof(vector_64));
+
+ return 0;
+}
+#endif
diff --git a/openssl-1.1.0h/crypto/modes/gcm128.c b/openssl-1.1.0h/crypto/modes/gcm128.c
new file mode 100644
index 0000000..a2b05c4
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/gcm128.c
@@ -0,0 +1,2301 @@
+/*
+ * Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
+/* redefine, because alignment is ensured */
+# undef GETU32
+# define GETU32(p) BSWAP4(*(const u32 *)(p))
+# undef PUTU32
+# define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
+#endif
+
+#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
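+
+/*
+ * REDUCE1BIT is a multiply-by-x step in GHASH's bit-reflected GF(2^128):
+ * shift the 128-bit value right by one bit and, if a bit fell off the
+ * low end, xor in the reflected reduction polynomial, whose non-trivial
+ * bits sit in the top byte as 0xe1 (from x^128 + x^7 + x^2 + x + 1).
+ */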
+#define REDUCE1BIT(V) do { \
+ if (sizeof(size_t)==8) { \
+ u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
+ V.lo = (V.hi<<63)|(V.lo>>1); \
+ V.hi = (V.hi>>1 )^T; \
+ } \
+ else { \
+ u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
+ V.lo = (V.hi<<63)|(V.lo>>1); \
+ V.hi = (V.hi>>1 )^((u64)T<<32); \
+ } \
+} while(0)
+
+/*-
+ * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
+ * should never be set to 8. 8 is effectively reserved for testing
+ * purposes. TABLE_BITS>1 selects the lookup-table-driven implementations
+ * referred to as "Shoup's" in the GCM specification. In other words
+ * OpenSSL does not cover the whole spectrum of possible table-driven
+ * implementations. Why? In the non-"Shoup's" case the memory access
+ * pattern is segmented in such a manner that it is trivial to see that
+ * cache timing information can reveal a fair portion of the intermediate
+ * hash value. Given that the ciphertext is always available to an
+ * attacker, this makes it possible to attempt to deduce the secret
+ * parameter H, and if successful, to tamper with messages [which is
+ * utterly trivial in CTR mode]. In the "Shoup's" case it is not as
+ * trivial, but there is no reason to believe that it is resistant to
+ * cache-timing attacks either. The thing about the "8-bit"
+ * implementation is that it consumes 16 (sixteen) times more memory,
+ * 4KB per individual key + 1KB shared. On the pro side, it should be
+ * twice as fast as the "4-bit" version. And for gcc-generated x86[_64]
+ * code, the "8-bit" version was observed to run ~75% faster, closer to
+ * 100% for commercial compilers... Yet the "4-bit" procedure is
+ * preferred, because it is believed to provide a better
+ * security-performance balance and adequate all-round performance.
+ * "All-round" refers to things like:
+ *
+ * - shorter setup time effectively improves overall timing for
+ *   handling short messages;
+ * - larger table allocation can become unbearable because of VM
+ *   subsystem penalties (for example on Windows a large enough free()
+ *   results in VM working-set trimming, meaning that a subsequent
+ *   malloc() would immediately incur working-set expansion);
+ * - a larger table has a larger cache footprint, which can affect the
+ *   performance of other code paths (not necessarily even from the same
+ *   thread in a Hyper-Threading world);
+ *
+ * A value of 1 is not appropriate for performance reasons.
+ */
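+/*
+ * Back-of-envelope footprint check (illustrative): the 8-bit table is
+ * 256 u128 entries, 256 * 16 = 4096 bytes per key, against 16 * 16 = 256
+ * bytes for the 4-bit table -- the sixteen-fold difference cited above --
+ * plus the shared rem_8bit reduction table.
+ */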
+#if TABLE_BITS==8
+
+static void gcm_init_8bit(u128 Htable[256], u64 H[2])
+{
+ int i, j;
+ u128 V;
+
+ Htable[0].hi = 0;
+ Htable[0].lo = 0;
+ V.hi = H[0];
+ V.lo = H[1];
+
+ for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
+ REDUCE1BIT(V);
+ Htable[i] = V;
+ }
+
+ for (i = 2; i < 256; i <<= 1) {
+ u128 *Hi = Htable + i, H0 = *Hi;
+ for (j = 1; j < i; ++j) {
+ Hi[j].hi = H0.hi ^ Htable[j].hi;
+ Hi[j].lo = H0.lo ^ Htable[j].lo;
+ }
+ }
+}
+
+static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
+{
+ u128 Z = { 0, 0 };
+ const u8 *xi = (const u8 *)Xi + 15;
+ size_t rem, n = *xi;
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+ static const size_t rem_8bit[256] = {
+ PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
+ PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
+ PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
+ PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
+ PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
+ PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
+ PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
+ PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
+ PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
+ PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
+ PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
+ PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
+ PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
+ PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
+ PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
+ PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
+ PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
+ PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
+ PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
+ PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
+ PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
+ PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
+ PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
+ PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
+ PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
+ PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
+ PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
+ PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
+ PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
+ PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
+ PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
+ PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
+ PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
+ PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
+ PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
+ PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
+ PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
+ PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
+ PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
+ PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
+ PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
+ PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
+ PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
+ PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
+ PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
+ PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
+ PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
+ PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
+ PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
+ PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
+ PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
+ PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
+ PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
+ PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
+ PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
+ PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
+ PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
+ PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
+ PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
+ PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
+ PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
+ PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
+ PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
+ PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
+ };
+
+ while (1) {
+ Z.hi ^= Htable[n].hi;
+ Z.lo ^= Htable[n].lo;
+
+ if ((u8 *)Xi == xi)
+ break;
+
+ n = *(--xi);
+
+ rem = (size_t)Z.lo & 0xff;
+ Z.lo = (Z.hi << 56) | (Z.lo >> 8);
+ Z.hi = (Z.hi >> 8);
+ if (sizeof(size_t) == 8)
+ Z.hi ^= rem_8bit[rem];
+ else
+ Z.hi ^= (u64)rem_8bit[rem] << 32;
+ }
+
+ if (is_endian.little) {
+# ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+# else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi >> 32);
+ PUTU32(p, v);
+ v = (u32)(Z.hi);
+ PUTU32(p + 4, v);
+ v = (u32)(Z.lo >> 32);
+ PUTU32(p + 8, v);
+ v = (u32)(Z.lo);
+ PUTU32(p + 12, v);
+# endif
+ } else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+}
+
+# define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
+
+#elif TABLE_BITS==4
+
+static void gcm_init_4bit(u128 Htable[16], u64 H[2])
+{
+ u128 V;
+# if defined(OPENSSL_SMALL_FOOTPRINT)
+ int i;
+# endif
+
+ Htable[0].hi = 0;
+ Htable[0].lo = 0;
+ V.hi = H[0];
+ V.lo = H[1];
+
+# if defined(OPENSSL_SMALL_FOOTPRINT)
+ for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
+ REDUCE1BIT(V);
+ Htable[i] = V;
+ }
+
+ for (i = 2; i < 16; i <<= 1) {
+ u128 *Hi = Htable + i;
+ int j;
+ for (V = *Hi, j = 1; j < i; ++j) {
+ Hi[j].hi = V.hi ^ Htable[j].hi;
+ Hi[j].lo = V.lo ^ Htable[j].lo;
+ }
+ }
+# else
+ Htable[8] = V;
+ REDUCE1BIT(V);
+ Htable[4] = V;
+ REDUCE1BIT(V);
+ Htable[2] = V;
+ REDUCE1BIT(V);
+ Htable[1] = V;
+ Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
+ V = Htable[4];
+ Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
+ Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
+ Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
+ V = Htable[8];
+ Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
+ Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
+ Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
+ Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
+ Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
+ Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
+ Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
+# endif
+# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
+ /*
+ * ARM assembler expects specific dword order in Htable.
+ */
+ {
+ int j;
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+
+ if (is_endian.little)
+ for (j = 0; j < 16; ++j) {
+ V = Htable[j];
+ Htable[j].hi = V.lo;
+ Htable[j].lo = V.hi;
+ } else
+ for (j = 0; j < 16; ++j) {
+ V = Htable[j];
+ Htable[j].hi = V.lo << 32 | V.lo >> 32;
+ Htable[j].lo = V.hi << 32 | V.hi >> 32;
+ }
+ }
+# endif
+}
+
+# ifndef GHASH_ASM
+static const size_t rem_4bit[16] = {
+ PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
+ PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
+ PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
+ PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
+};
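+
+/*
+ * Illustrative sanity check: rem_4bit[i] is the carry-less product of
+ * the 4-bit remainder i with 0xE1, shifted left by 5, e.g.
+ * rem_4bit[1] = 0xE1 << 5 = 0x1C20 and rem_4bit[3] = 0x1C20 ^ 0x3840 =
+ * 0x2460 -- matching the entries above (before PACK lifts them into the
+ * top 16 bits of a size_t).
+ */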
+
+static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
+{
+ u128 Z;
+ int cnt = 15;
+ size_t rem, nlo, nhi;
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+
+ nlo = ((const u8 *)Xi)[15];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
+
+ Z.hi = Htable[nlo].hi;
+ Z.lo = Htable[nlo].lo;
+
+ while (1) {
+ rem = (size_t)Z.lo & 0xf;
+ Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+ Z.hi = (Z.hi >> 4);
+ if (sizeof(size_t) == 8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+
+ if (--cnt < 0)
+ break;
+
+ nlo = ((const u8 *)Xi)[cnt];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
+
+ rem = (size_t)Z.lo & 0xf;
+ Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+ Z.hi = (Z.hi >> 4);
+ if (sizeof(size_t) == 8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+ }
+
+ if (is_endian.little) {
+# ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+# else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi >> 32);
+ PUTU32(p, v);
+ v = (u32)(Z.hi);
+ PUTU32(p + 4, v);
+ v = (u32)(Z.lo >> 32);
+ PUTU32(p + 8, v);
+ v = (u32)(Z.lo);
+ PUTU32(p + 12, v);
+# endif
+ } else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+}
+
+# if !defined(OPENSSL_SMALL_FOOTPRINT)
+/*
+ * Streamed gcm_ghash_4bit, see CRYPTO_gcm128_[en|de]crypt for
+ * details... Compiler-generated code doesn't seem to give any
+ * performance improvement, at least not on x86[_64]. It's here
+ * mostly as a reference and a placeholder for possible future
+ * non-trivial optimization[s]...
+ */
+static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len)
+{
+ u128 Z;
+ int cnt;
+ size_t rem, nlo, nhi;
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+
+# if 1
+ do {
+ cnt = 15;
+ nlo = ((const u8 *)Xi)[15];
+ nlo ^= inp[15];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
+
+ Z.hi = Htable[nlo].hi;
+ Z.lo = Htable[nlo].lo;
+
+ while (1) {
+ rem = (size_t)Z.lo & 0xf;
+ Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+ Z.hi = (Z.hi >> 4);
+ if (sizeof(size_t) == 8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+
+ if (--cnt < 0)
+ break;
+
+ nlo = ((const u8 *)Xi)[cnt];
+ nlo ^= inp[cnt];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
+
+ rem = (size_t)Z.lo & 0xf;
+ Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+ Z.hi = (Z.hi >> 4);
+ if (sizeof(size_t) == 8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem] << 32;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+ }
+# else
+ /*
+         * An extra 256+16 bytes per key plus 512 bytes of shared tables
+         * [should] give ~50% improvement... One could have PACK()-ed
+         * the rem_8bit table even here, but the priority is to minimize
+ * cache footprint...
+ */
+ u128 Hshr4[16]; /* Htable shifted right by 4 bits */
+ u8 Hshl4[16]; /* Htable shifted left by 4 bits */
+ static const unsigned short rem_8bit[256] = {
+ 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
+ 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
+ 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
+ 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
+ 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
+ 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
+ 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
+ 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
+ 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
+ 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
+ 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
+ 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
+ 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
+ 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
+ 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
+ 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
+ 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
+ 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
+ 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
+ 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
+ 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
+ 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
+ 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
+ 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
+ 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
+ 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
+ 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
+ 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
+ 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
+ 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
+ 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
+ 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
+ };
+ /*
+         * This pre-processing phase slows the procedure down by roughly
+         * as much time as it saves per loop iteration. In other words,
+         * single-block performance is approximately the same as for the
+         * straightforward "4-bit" implementation, and every block after
+         * the first is processed faster...
+ */
+ for (cnt = 0; cnt < 16; ++cnt) {
+ Z.hi = Htable[cnt].hi;
+ Z.lo = Htable[cnt].lo;
+ Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
+ Hshr4[cnt].hi = (Z.hi >> 4);
+ Hshl4[cnt] = (u8)(Z.lo << 4);
+ }
+
+ do {
+ for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
+ nlo = ((const u8 *)Xi)[cnt];
+ nlo ^= inp[cnt];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+
+ rem = (size_t)Z.lo & 0xff;
+
+ Z.lo = (Z.hi << 56) | (Z.lo >> 8);
+ Z.hi = (Z.hi >> 8);
+
+ Z.hi ^= Hshr4[nhi].hi;
+ Z.lo ^= Hshr4[nhi].lo;
+ Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
+ }
+
+ nlo = ((const u8 *)Xi)[0];
+ nlo ^= inp[0];
+ nhi = nlo >> 4;
+ nlo &= 0xf;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+
+ rem = (size_t)Z.lo & 0xf;
+
+ Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+ Z.hi = (Z.hi >> 4);
+
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+ Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
+# endif
+
+ if (is_endian.little) {
+# ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+# else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi >> 32);
+ PUTU32(p, v);
+ v = (u32)(Z.hi);
+ PUTU32(p + 4, v);
+ v = (u32)(Z.lo >> 32);
+ PUTU32(p + 8, v);
+ v = (u32)(Z.lo);
+ PUTU32(p + 12, v);
+# endif
+ } else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+ } while (inp += 16, len -= 16);
+}
+# endif
+# else
+void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+# endif
+
+# define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
+# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
+# define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
+/*
+ * GHASH_CHUNK is "stride parameter" missioned to mitigate cache trashing
+ * effect. In other words idea is to hash data while it's still in L1 cache
+ * after encryption pass...
+ */
+# define GHASH_CHUNK (3*1024)
+# endif
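+
+/*
+ * A minimal sketch (not compiled) of the pattern the bulk loops below
+ * follow, assuming a hypothetical ctr_encrypt_chunk() helper: encrypt one
+ * GHASH_CHUNK worth of data, then hash that same chunk while it is still
+ * cache-hot.
+ */
+# if 0
+while (len >= GHASH_CHUNK) {
+    ctr_encrypt_chunk(ctx, in, out, GHASH_CHUNK); /* CTR pass            */
+    GHASH(ctx, out, GHASH_CHUNK);                 /* hash hot ciphertext */
+    in += GHASH_CHUNK, out += GHASH_CHUNK, len -= GHASH_CHUNK;
+}
+# endif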
+
+#else /* TABLE_BITS */
+
+static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
+{
+ u128 V, Z = { 0, 0 };
+ long X;
+ int i, j;
+ const long *xi = (const long *)Xi;
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+
+ V.hi = H[0]; /* H is in host byte order, no byte swapping */
+ V.lo = H[1];
+
+ for (j = 0; j < 16 / sizeof(long); ++j) {
+ if (is_endian.little) {
+ if (sizeof(long) == 8) {
+# ifdef BSWAP8
+ X = (long)(BSWAP8(xi[j]));
+# else
+ const u8 *p = (const u8 *)(xi + j);
+ X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
+# endif
+ } else {
+ const u8 *p = (const u8 *)(xi + j);
+ X = (long)GETU32(p);
+ }
+ } else
+ X = xi[j];
+
+ for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
+ u64 M = (u64)(X >> (8 * sizeof(long) - 1));
+ Z.hi ^= V.hi & M;
+ Z.lo ^= V.lo & M;
+
+ REDUCE1BIT(V);
+ }
+ }
+
+ if (is_endian.little) {
+# ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+# else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi >> 32);
+ PUTU32(p, v);
+ v = (u32)(Z.hi);
+ PUTU32(p + 4, v);
+ v = (u32)(Z.lo >> 32);
+ PUTU32(p + 8, v);
+ v = (u32)(Z.lo);
+ PUTU32(p + 12, v);
+# endif
+ } else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+}
+
+# define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
+
+#endif
+
+#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
+# if !defined(I386_ONLY) && \
+ (defined(__i386) || defined(__i386__) || \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
+# define GHASH_ASM_X86_OR_64
+# define GCM_FUNCREF_4BIT
+extern unsigned int OPENSSL_ia32cap_P[];
+
+void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+
+# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+# define gcm_init_avx gcm_init_clmul
+# define gcm_gmult_avx gcm_gmult_clmul
+# define gcm_ghash_avx gcm_ghash_clmul
+# else
+void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+# endif
+
+# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+# define GHASH_ASM_X86
+void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+
+void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+# endif
+# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
+# include "arm_arch.h"
+# if __ARM_MAX_ARCH__>=7
+# define GHASH_ASM_ARM
+# define GCM_FUNCREF_4BIT
+# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
+# if defined(__arm__) || defined(__arm)
+# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
+# endif
+void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+# endif
+# elif defined(__sparc__) || defined(__sparc)
+# include "sparc_arch.h"
+# define GHASH_ASM_SPARC
+# define GCM_FUNCREF_4BIT
+extern unsigned int OPENSSL_sparcv9cap_P[];
+void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+# include "ppc_arch.h"
+# define GHASH_ASM_PPC
+# define GCM_FUNCREF_4BIT
+void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+# endif
+#endif
+
+#ifdef GCM_FUNCREF_4BIT
+# undef GCM_MUL
+# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
+# ifdef GHASH
+# undef GHASH
+# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
+# endif
+#endif
+
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
+{
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+
+ memset(ctx, 0, sizeof(*ctx));
+ ctx->block = block;
+ ctx->key = key;
+
+ (*block) (ctx->H.c, ctx->H.c, key);
+
+ if (is_endian.little) {
+ /* H is stored in host byte order */
+#ifdef BSWAP8
+ ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
+ ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
+#else
+ u8 *p = ctx->H.c;
+ u64 hi, lo;
+ hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
+ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
+ ctx->H.u[0] = hi;
+ ctx->H.u[1] = lo;
+#endif
+ }
+#if TABLE_BITS==8
+ gcm_init_8bit(ctx->Htable, ctx->H.u);
+#elif TABLE_BITS==4
+# if defined(GHASH)
+# define CTX__GHASH(f) (ctx->ghash = (f))
+# else
+# define CTX__GHASH(f) (ctx->ghash = NULL)
+# endif
+# if defined(GHASH_ASM_X86_OR_64)
+# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
+ if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
+ if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
+ gcm_init_avx(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_avx;
+ CTX__GHASH(gcm_ghash_avx);
+ } else {
+ gcm_init_clmul(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_clmul;
+ CTX__GHASH(gcm_ghash_clmul);
+ }
+ return;
+ }
+# endif
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
+# if defined(GHASH_ASM_X86) /* x86 only */
+# if defined(OPENSSL_IA32_SSE2)
+ if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
+# else
+ if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
+# endif
+ ctx->gmult = gcm_gmult_4bit_mmx;
+ CTX__GHASH(gcm_ghash_4bit_mmx);
+ } else {
+ ctx->gmult = gcm_gmult_4bit_x86;
+ CTX__GHASH(gcm_ghash_4bit_x86);
+ }
+# else
+ ctx->gmult = gcm_gmult_4bit;
+ CTX__GHASH(gcm_ghash_4bit);
+# endif
+# elif defined(GHASH_ASM_ARM)
+# ifdef PMULL_CAPABLE
+ if (PMULL_CAPABLE) {
+ gcm_init_v8(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_v8;
+ CTX__GHASH(gcm_ghash_v8);
+ } else
+# endif
+# ifdef NEON_CAPABLE
+ if (NEON_CAPABLE) {
+ gcm_init_neon(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_neon;
+ CTX__GHASH(gcm_ghash_neon);
+ } else
+# endif
+ {
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_4bit;
+ CTX__GHASH(gcm_ghash_4bit);
+ }
+# elif defined(GHASH_ASM_SPARC)
+ if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
+ gcm_init_vis3(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_vis3;
+ CTX__GHASH(gcm_ghash_vis3);
+ } else {
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_4bit;
+ CTX__GHASH(gcm_ghash_4bit);
+ }
+# elif defined(GHASH_ASM_PPC)
+ if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
+ gcm_init_p8(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_p8;
+ CTX__GHASH(gcm_ghash_p8);
+ } else {
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
+ ctx->gmult = gcm_gmult_4bit;
+ CTX__GHASH(gcm_ghash_4bit);
+ }
+# else
+ gcm_init_4bit(ctx->Htable, ctx->H.u);
+# endif
+# undef CTX__GHASH
+#endif
+}
+
+void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
+ size_t len)
+{
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+ unsigned int ctr;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
+#endif
+
+ ctx->Yi.u[0] = 0;
+ ctx->Yi.u[1] = 0;
+ ctx->Xi.u[0] = 0;
+ ctx->Xi.u[1] = 0;
+ ctx->len.u[0] = 0; /* AAD length */
+ ctx->len.u[1] = 0; /* message length */
+ ctx->ares = 0;
+ ctx->mres = 0;
+
+ if (len == 12) {
+ memcpy(ctx->Yi.c, iv, 12);
+ ctx->Yi.c[15] = 1;
+ ctr = 1;
+ } else {
+ size_t i;
+ u64 len0 = len;
+
+ while (len >= 16) {
+ for (i = 0; i < 16; ++i)
+ ctx->Yi.c[i] ^= iv[i];
+ GCM_MUL(ctx, Yi);
+ iv += 16;
+ len -= 16;
+ }
+ if (len) {
+ for (i = 0; i < len; ++i)
+ ctx->Yi.c[i] ^= iv[i];
+ GCM_MUL(ctx, Yi);
+ }
+ len0 <<= 3;
+ if (is_endian.little) {
+#ifdef BSWAP8
+ ctx->Yi.u[1] ^= BSWAP8(len0);
+#else
+ ctx->Yi.c[8] ^= (u8)(len0 >> 56);
+ ctx->Yi.c[9] ^= (u8)(len0 >> 48);
+ ctx->Yi.c[10] ^= (u8)(len0 >> 40);
+ ctx->Yi.c[11] ^= (u8)(len0 >> 32);
+ ctx->Yi.c[12] ^= (u8)(len0 >> 24);
+ ctx->Yi.c[13] ^= (u8)(len0 >> 16);
+ ctx->Yi.c[14] ^= (u8)(len0 >> 8);
+ ctx->Yi.c[15] ^= (u8)(len0);
+#endif
+ } else
+ ctx->Yi.u[1] ^= len0;
+
+ GCM_MUL(ctx, Yi);
+
+ if (is_endian.little)
+#ifdef BSWAP4
+ ctr = BSWAP4(ctx->Yi.d[3]);
+#else
+ ctr = GETU32(ctx->Yi.c + 12);
+#endif
+ else
+ ctr = ctx->Yi.d[3];
+ }
+
+ (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
+ ++ctr;
+ if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
+ PUTU32(ctx->Yi.c + 12, ctr);
+#endif
+ else
+ ctx->Yi.d[3] = ctr;
+}
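+
+/*
+ * For reference (NIST SP 800-38D): with a 96-bit IV the pre-counter block
+ * is simply J0 = IV || 0^31 || 1, which is the fast path above. For any
+ * other length, J0 = GHASH_H(IV || 0^s || 0^64 || [len(IV)]_64), where s
+ * pads the IV to a full block; the loop above computes exactly that,
+ * XOR-ing the bit length (len0 << 3) as a 64-bit value into the last
+ * block before the final GCM_MUL.
+ */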
+
+int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
+ size_t len)
+{
+ size_t i;
+ unsigned int n;
+ u64 alen = ctx->len.u[0];
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx->ghash;
+# endif
+#endif
+
+ if (ctx->len.u[1])
+ return -2;
+
+ alen += len;
+ if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
+ return -1;
+ ctx->len.u[0] = alen;
+
+ n = ctx->ares;
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(aad++);
+ --len;
+ n = (n + 1) % 16;
+ }
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ else {
+ ctx->ares = n;
+ return 0;
+ }
+ }
+#ifdef GHASH
+ if ((i = (len & (size_t)-16))) {
+ GHASH(ctx, aad, i);
+ aad += i;
+ len -= i;
+ }
+#else
+ while (len >= 16) {
+ for (i = 0; i < 16; ++i)
+ ctx->Xi.c[i] ^= aad[i];
+ GCM_MUL(ctx, Xi);
+ aad += 16;
+ len -= 16;
+ }
+#endif
+ if (len) {
+ n = (unsigned int)len;
+ for (i = 0; i < len; ++i)
+ ctx->Xi.c[i] ^= aad[i];
+ }
+
+ ctx->ares = n;
+ return 0;
+}
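+
+/*
+ * Illustrative (not compiled): AAD may be supplied incrementally, since
+ * ctx->ares buffers a partial block between calls. With hypothetical
+ * lengths, the following are equivalent:
+ */
+#if 0
+CRYPTO_gcm128_aad(&ctx, aad, 20);        /* one call ...              */
+
+CRYPTO_gcm128_aad(&ctx, aad, 7);         /* ... or two: partial block */
+CRYPTO_gcm128_aad(&ctx, aad + 7, 13);    /* buffered, then completed  */
+#endif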
+
+int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len)
+{
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
+# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx->ghash;
+# endif
+#endif
+
+ mlen += len;
+ if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to encrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx, Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+#ifdef BSWAP4
+ ctr = BSWAP4(ctx->Yi.d[3]);
+#else
+ ctr = GETU32(ctx->Yi.c + 12);
+#endif
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
+ --len;
+ n = (n + 1) % 16;
+ }
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
+ break;
+# endif
+# if defined(GHASH)
+# if defined(GHASH_CHUNK)
+ while (len >= GHASH_CHUNK) {
+ size_t j = GHASH_CHUNK;
+
+ while (j) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ j -= 16;
+ }
+ GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
+ len -= GHASH_CHUNK;
+ }
+# endif
+ if ((i = (len & (size_t)-16))) {
+ size_t j = i;
+
+ while (len >= 16) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+ GHASH(ctx, out - j, j);
+ }
+# else
+ while (len >= 16) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i)
+ ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ GCM_MUL(ctx, Xi);
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+# endif
+ if (len) {
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+ } while (0);
+ }
+#endif
+ for (i = 0; i < len; ++i) {
+ if (n == 0) {
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
+ PUTU32(ctx->Yi.c + 12, ctr);
+#endif
+ else
+ ctx->Yi.d[3] = ctr;
+ }
+ ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
+ n = (n + 1) % 16;
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ }
+
+ ctx->mres = n;
+ return 0;
+}
+
+int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len)
+{
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
+# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx->ghash;
+# endif
+#endif
+
+ mlen += len;
+ if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to decrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx, Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+#ifdef BSWAP4
+ ctr = BSWAP4(ctx->Yi.d[3]);
+#else
+ ctr = GETU32(ctx->Yi.c + 12);
+#endif
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ if (n) {
+ while (n && len) {
+ u8 c = *(in++);
+ *(out++) = c ^ ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ --len;
+ n = (n + 1) % 16;
+ }
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
+ break;
+# endif
+# if defined(GHASH)
+# if defined(GHASH_CHUNK)
+ while (len >= GHASH_CHUNK) {
+ size_t j = GHASH_CHUNK;
+
+ GHASH(ctx, in, GHASH_CHUNK);
+ while (j) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ j -= 16;
+ }
+ len -= GHASH_CHUNK;
+ }
+# endif
+ if ((i = (len & (size_t)-16))) {
+ GHASH(ctx, in, i);
+ while (len >= 16) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+ }
+# else
+ while (len >= 16) {
+ size_t *out_t = (size_t *)out;
+ const size_t *in_t = (const size_t *)in;
+
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i = 0; i < 16 / sizeof(size_t); ++i) {
+ size_t c = in[i];
+ out[i] = c ^ ctx->EKi.t[i];
+ ctx->Xi.t[i] ^= c;
+ }
+ GCM_MUL(ctx, Xi);
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+# endif
+ if (len) {
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ u8 c = in[n];
+ ctx->Xi.c[n] ^= c;
+ out[n] = c ^ ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+ } while (0);
+ }
+#endif
+ for (i = 0; i < len; ++i) {
+ u8 c;
+ if (n == 0) {
+ (*block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+#ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+#else
+ PUTU32(ctx->Yi.c + 12, ctr);
+#endif
+ else
+ ctx->Yi.d[3] = ctr;
+ }
+ c = in[i];
+ out[i] = c ^ ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ n = (n + 1) % 16;
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ }
+
+ ctx->mres = n;
+ return 0;
+}
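+
+/*
+ * Note the symmetry with encryption: here GHASH is applied to the input
+ * (ciphertext) before it is decrypted, whereas the encrypt path hashes
+ * its output, so both directions authenticate the same ciphertext bytes.
+ */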
+
+int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len, ctr128_f stream)
+{
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+ return CRYPTO_gcm128_encrypt(ctx, in, out, len);
+#else
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ void *key = ctx->key;
+# ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx->ghash;
+# endif
+# endif
+
+ mlen += len;
+ if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to encrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx, Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctr = BSWAP4(ctx->Yi.d[3]);
+# else
+ ctr = GETU32(ctx->Yi.c + 12);
+# endif
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
+ --len;
+ n = (n + 1) % 16;
+ }
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+# if defined(GHASH) && defined(GHASH_CHUNK)
+ while (len >= GHASH_CHUNK) {
+ (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
+ ctr += GHASH_CHUNK / 16;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ GHASH(ctx, out, GHASH_CHUNK);
+ out += GHASH_CHUNK;
+ in += GHASH_CHUNK;
+ len -= GHASH_CHUNK;
+ }
+# endif
+ if ((i = (len & (size_t)-16))) {
+ size_t j = i / 16;
+
+ (*stream) (in, out, j, key, ctx->Yi.c);
+ ctr += (unsigned int)j;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ in += i;
+ len -= i;
+# if defined(GHASH)
+ GHASH(ctx, out, i);
+ out += i;
+# else
+ while (j--) {
+ for (i = 0; i < 16; ++i)
+ ctx->Xi.c[i] ^= out[i];
+ GCM_MUL(ctx, Xi);
+ out += 16;
+ }
+# endif
+ }
+ if (len) {
+ (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+#endif
+}
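+
+/*
+ * The ctr128_f "stream" argument lets a platform-specific routine encrypt
+ * many counter blocks per call; only the low 32 bits of the counter are
+ * incremented (hence "ctr32"), and the caller resynchronizes Yi after
+ * each call, as above. Chunking and tail handling otherwise mirror
+ * CRYPTO_gcm128_encrypt.
+ */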
+
+int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len, ctr128_f stream)
+{
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+ return CRYPTO_gcm128_decrypt(ctx, in, out, len);
+#else
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ void *key = ctx->key;
+# ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx->ghash;
+# endif
+# endif
+
+ mlen += len;
+ if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to decrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx, Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctr = BSWAP4(ctx->Yi.d[3]);
+# else
+ ctr = GETU32(ctx->Yi.c + 12);
+# endif
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+ if (n) {
+ while (n && len) {
+ u8 c = *(in++);
+ *(out++) = c ^ ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ --len;
+ n = (n + 1) % 16;
+ }
+ if (n == 0)
+ GCM_MUL(ctx, Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+# if defined(GHASH) && defined(GHASH_CHUNK)
+ while (len >= GHASH_CHUNK) {
+ GHASH(ctx, in, GHASH_CHUNK);
+ (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
+ ctr += GHASH_CHUNK / 16;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ out += GHASH_CHUNK;
+ in += GHASH_CHUNK;
+ len -= GHASH_CHUNK;
+ }
+# endif
+ if ((i = (len & (size_t)-16))) {
+ size_t j = i / 16;
+
+# if defined(GHASH)
+ GHASH(ctx, in, i);
+# else
+ while (j--) {
+ size_t k;
+ for (k = 0; k < 16; ++k)
+ ctx->Xi.c[k] ^= in[k];
+ GCM_MUL(ctx, Xi);
+ in += 16;
+ }
+ j = i / 16;
+ in -= i;
+# endif
+ (*stream) (in, out, j, key, ctx->Yi.c);
+ ctr += (unsigned int)j;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ out += i;
+ in += i;
+ len -= i;
+ }
+ if (len) {
+ (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
+ ++ctr;
+ if (is_endian.little)
+# ifdef BSWAP4
+ ctx->Yi.d[3] = BSWAP4(ctr);
+# else
+ PUTU32(ctx->Yi.c + 12, ctr);
+# endif
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ u8 c = in[n];
+ ctx->Xi.c[n] ^= c;
+ out[n] = c ^ ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+#endif
+}
+
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
+ size_t len)
+{
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+ u64 alen = ctx->len.u[0] << 3;
+ u64 clen = ctx->len.u[1] << 3;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
+#endif
+
+ if (ctx->mres || ctx->ares)
+ GCM_MUL(ctx, Xi);
+
+ if (is_endian.little) {
+#ifdef BSWAP8
+ alen = BSWAP8(alen);
+ clen = BSWAP8(clen);
+#else
+ u8 *p = ctx->len.c;
+
+ ctx->len.u[0] = alen;
+ ctx->len.u[1] = clen;
+
+ alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
+ clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
+#endif
+ }
+
+ ctx->Xi.u[0] ^= alen;
+ ctx->Xi.u[1] ^= clen;
+ GCM_MUL(ctx, Xi);
+
+ ctx->Xi.u[0] ^= ctx->EK0.u[0];
+ ctx->Xi.u[1] ^= ctx->EK0.u[1];
+
+ if (tag && len <= sizeof(ctx->Xi))
+ return CRYPTO_memcmp(ctx->Xi.c, tag, len);
+ else
+ return -1;
+}
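+
+/*
+ * The tag computed above is T = E_K(J0) XOR GHASH_H(A, C), with the bit
+ * lengths of the AAD and ciphertext folded into the final GHASH block.
+ * The comparison uses CRYPTO_memcmp, which runs in constant time. A
+ * hypothetical decrypt-side check (sketch, not compiled):
+ */
+#if 0
+if (CRYPTO_gcm128_finish(&ctx, received_tag, 16) != 0) {
+    /* authentication failed: the plaintext must be discarded */
+}
+#endif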
+
+void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
+{
+ CRYPTO_gcm128_finish(ctx, NULL, 0);
+ memcpy(tag, ctx->Xi.c,
+ len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
+}
+
+GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
+{
+ GCM128_CONTEXT *ret;
+
+ if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
+ CRYPTO_gcm128_init(ret, key, block);
+
+ return ret;
+}
+
+void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
+{
+ OPENSSL_clear_free(ctx, sizeof(*ctx));
+}
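+
+/*
+ * A minimal end-to-end sketch of this API (not compiled; assumes AES via
+ * <openssl/aes.h> and caller-supplied key/iv/aad/plaintext buffers,
+ * mirroring the self-test below):
+ */
+#if 0
+AES_KEY aes;
+GCM128_CONTEXT gcm;
+unsigned char tag[16];
+
+AES_set_encrypt_key(key, 128, &aes);
+CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
+CRYPTO_gcm128_setiv(&gcm, iv, 12);
+CRYPTO_gcm128_aad(&gcm, aad, aad_len);
+CRYPTO_gcm128_encrypt(&gcm, plaintext, ciphertext, pt_len);
+CRYPTO_gcm128_tag(&gcm, tag, sizeof(tag));
+#endif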
+
+#if defined(SELFTEST)
+# include <stdio.h>
+# include <openssl/aes.h>
+
+/* Test Case 1 */
+static const u8 K1[16], *P1 = NULL, *A1 = NULL, IV1[12], *C1 = NULL;
+static const u8 T1[] = {
+ 0x58, 0xe2, 0xfc, 0xce, 0xfa, 0x7e, 0x30, 0x61,
+ 0x36, 0x7f, 0x1d, 0x57, 0xa4, 0xe7, 0x45, 0x5a
+};
+
+/* Test Case 2 */
+# define K2 K1
+# define A2 A1
+# define IV2 IV1
+static const u8 P2[16];
+static const u8 C2[] = {
+ 0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92,
+ 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78
+};
+
+static const u8 T2[] = {
+ 0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd,
+ 0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf
+};
+
+/* Test Case 3 */
+# define A3 A2
+static const u8 K3[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+ 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08
+};
+
+static const u8 P3[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+
+static const u8 IV3[] = {
+ 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
+ 0xde, 0xca, 0xf8, 0x88
+};
+
+static const u8 C3[] = {
+ 0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24,
+ 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c,
+ 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0,
+ 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e,
+ 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c,
+ 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05,
+ 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97,
+ 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85
+};
+
+static const u8 T3[] = {
+ 0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6,
+ 0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4
+};
+
+/* Test Case 4 */
+# define K4 K3
+# define IV4 IV3
+static const u8 P4[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39
+};
+
+static const u8 A4[] = {
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xab, 0xad, 0xda, 0xd2
+};
+
+static const u8 C4[] = {
+ 0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24,
+ 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c,
+ 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0,
+ 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e,
+ 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c,
+ 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05,
+ 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97,
+ 0x3d, 0x58, 0xe0, 0x91
+};
+
+static const u8 T4[] = {
+ 0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb,
+ 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47
+};
+
+/* Test Case 5 */
+# define K5 K4
+# define P5 P4
+# define A5 A4
+static const u8 IV5[] = {
+ 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad
+};
+
+static const u8 C5[] = {
+ 0x61, 0x35, 0x3b, 0x4c, 0x28, 0x06, 0x93, 0x4a,
+ 0x77, 0x7f, 0xf5, 0x1f, 0xa2, 0x2a, 0x47, 0x55,
+ 0x69, 0x9b, 0x2a, 0x71, 0x4f, 0xcd, 0xc6, 0xf8,
+ 0x37, 0x66, 0xe5, 0xf9, 0x7b, 0x6c, 0x74, 0x23,
+ 0x73, 0x80, 0x69, 0x00, 0xe4, 0x9f, 0x24, 0xb2,
+ 0x2b, 0x09, 0x75, 0x44, 0xd4, 0x89, 0x6b, 0x42,
+ 0x49, 0x89, 0xb5, 0xe1, 0xeb, 0xac, 0x0f, 0x07,
+ 0xc2, 0x3f, 0x45, 0x98
+};
+
+static const u8 T5[] = {
+ 0x36, 0x12, 0xd2, 0xe7, 0x9e, 0x3b, 0x07, 0x85,
+ 0x56, 0x1b, 0xe1, 0x4a, 0xac, 0xa2, 0xfc, 0xcb
+};
+
+/* Test Case 6 */
+# define K6 K5
+# define P6 P5
+# define A6 A5
+static const u8 IV6[] = {
+ 0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
+ 0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
+ 0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
+ 0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
+ 0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
+ 0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
+ 0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
+ 0xa6, 0x37, 0xb3, 0x9b
+};
+
+static const u8 C6[] = {
+ 0x8c, 0xe2, 0x49, 0x98, 0x62, 0x56, 0x15, 0xb6,
+ 0x03, 0xa0, 0x33, 0xac, 0xa1, 0x3f, 0xb8, 0x94,
+ 0xbe, 0x91, 0x12, 0xa5, 0xc3, 0xa2, 0x11, 0xa8,
+ 0xba, 0x26, 0x2a, 0x3c, 0xca, 0x7e, 0x2c, 0xa7,
+ 0x01, 0xe4, 0xa9, 0xa4, 0xfb, 0xa4, 0x3c, 0x90,
+ 0xcc, 0xdc, 0xb2, 0x81, 0xd4, 0x8c, 0x7c, 0x6f,
+ 0xd6, 0x28, 0x75, 0xd2, 0xac, 0xa4, 0x17, 0x03,
+ 0x4c, 0x34, 0xae, 0xe5
+};
+
+static const u8 T6[] = {
+ 0x61, 0x9c, 0xc5, 0xae, 0xff, 0xfe, 0x0b, 0xfa,
+ 0x46, 0x2a, 0xf4, 0x3c, 0x16, 0x99, 0xd0, 0x50
+};
+
+/* Test Case 7 */
+static const u8 K7[24], *P7 = NULL, *A7 = NULL, IV7[12], *C7 = NULL;
+static const u8 T7[] = {
+ 0xcd, 0x33, 0xb2, 0x8a, 0xc7, 0x73, 0xf7, 0x4b,
+ 0xa0, 0x0e, 0xd1, 0xf3, 0x12, 0x57, 0x24, 0x35
+};
+
+/* Test Case 8 */
+# define K8 K7
+# define IV8 IV7
+# define A8 A7
+static const u8 P8[16];
+static const u8 C8[] = {
+ 0x98, 0xe7, 0x24, 0x7c, 0x07, 0xf0, 0xfe, 0x41,
+ 0x1c, 0x26, 0x7e, 0x43, 0x84, 0xb0, 0xf6, 0x00
+};
+
+static const u8 T8[] = {
+ 0x2f, 0xf5, 0x8d, 0x80, 0x03, 0x39, 0x27, 0xab,
+ 0x8e, 0xf4, 0xd4, 0x58, 0x75, 0x14, 0xf0, 0xfb
+};
+
+/* Test Case 9 */
+# define A9 A8
+static const u8 K9[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+ 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c
+};
+
+static const u8 P9[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+
+static const u8 IV9[] = {
+ 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
+ 0xde, 0xca, 0xf8, 0x88
+};
+
+static const u8 C9[] = {
+ 0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41,
+ 0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57,
+ 0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84,
+ 0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c,
+ 0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25,
+ 0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47,
+ 0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9,
+ 0xcc, 0xda, 0x27, 0x10, 0xac, 0xad, 0xe2, 0x56
+};
+
+static const u8 T9[] = {
+ 0x99, 0x24, 0xa7, 0xc8, 0x58, 0x73, 0x36, 0xbf,
+ 0xb1, 0x18, 0x02, 0x4d, 0xb8, 0x67, 0x4a, 0x14
+};
+
+/* Test Case 10 */
+# define K10 K9
+# define IV10 IV9
+static const u8 P10[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39
+};
+
+static const u8 A10[] = {
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xab, 0xad, 0xda, 0xd2
+};
+
+static const u8 C10[] = {
+ 0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41,
+ 0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57,
+ 0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84,
+ 0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c,
+ 0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25,
+ 0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47,
+ 0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9,
+ 0xcc, 0xda, 0x27, 0x10
+};
+
+static const u8 T10[] = {
+ 0x25, 0x19, 0x49, 0x8e, 0x80, 0xf1, 0x47, 0x8f,
+ 0x37, 0xba, 0x55, 0xbd, 0x6d, 0x27, 0x61, 0x8c
+};
+
+/* Test Case 11 */
+# define K11 K10
+# define P11 P10
+# define A11 A10
+static const u8 IV11[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad };
+
+static const u8 C11[] = {
+ 0x0f, 0x10, 0xf5, 0x99, 0xae, 0x14, 0xa1, 0x54,
+ 0xed, 0x24, 0xb3, 0x6e, 0x25, 0x32, 0x4d, 0xb8,
+ 0xc5, 0x66, 0x63, 0x2e, 0xf2, 0xbb, 0xb3, 0x4f,
+ 0x83, 0x47, 0x28, 0x0f, 0xc4, 0x50, 0x70, 0x57,
+ 0xfd, 0xdc, 0x29, 0xdf, 0x9a, 0x47, 0x1f, 0x75,
+ 0xc6, 0x65, 0x41, 0xd4, 0xd4, 0xda, 0xd1, 0xc9,
+ 0xe9, 0x3a, 0x19, 0xa5, 0x8e, 0x8b, 0x47, 0x3f,
+ 0xa0, 0xf0, 0x62, 0xf7
+};
+
+static const u8 T11[] = {
+ 0x65, 0xdc, 0xc5, 0x7f, 0xcf, 0x62, 0x3a, 0x24,
+ 0x09, 0x4f, 0xcc, 0xa4, 0x0d, 0x35, 0x33, 0xf8
+};
+
+/* Test Case 12 */
+# define K12 K11
+# define P12 P11
+# define A12 A11
+static const u8 IV12[] = {
+ 0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
+ 0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
+ 0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
+ 0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
+ 0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
+ 0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
+ 0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
+ 0xa6, 0x37, 0xb3, 0x9b
+};
+
+static const u8 C12[] = {
+ 0xd2, 0x7e, 0x88, 0x68, 0x1c, 0xe3, 0x24, 0x3c,
+ 0x48, 0x30, 0x16, 0x5a, 0x8f, 0xdc, 0xf9, 0xff,
+ 0x1d, 0xe9, 0xa1, 0xd8, 0xe6, 0xb4, 0x47, 0xef,
+ 0x6e, 0xf7, 0xb7, 0x98, 0x28, 0x66, 0x6e, 0x45,
+ 0x81, 0xe7, 0x90, 0x12, 0xaf, 0x34, 0xdd, 0xd9,
+ 0xe2, 0xf0, 0x37, 0x58, 0x9b, 0x29, 0x2d, 0xb3,
+ 0xe6, 0x7c, 0x03, 0x67, 0x45, 0xfa, 0x22, 0xe7,
+ 0xe9, 0xb7, 0x37, 0x3b
+};
+
+static const u8 T12[] = {
+ 0xdc, 0xf5, 0x66, 0xff, 0x29, 0x1c, 0x25, 0xbb,
+ 0xb8, 0x56, 0x8f, 0xc3, 0xd3, 0x76, 0xa6, 0xd9
+};
+
+/* Test Case 13 */
+static const u8 K13[32], *P13 = NULL, *A13 = NULL, IV13[12], *C13 = NULL;
+static const u8 T13[] = {
+ 0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9,
+ 0xa9, 0x63, 0xb4, 0xf1, 0xc4, 0xcb, 0x73, 0x8b
+};
+
+/* Test Case 14 */
+# define K14 K13
+# define A14 A13
+static const u8 P14[16], IV14[12];
+static const u8 C14[] = {
+ 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e,
+ 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18
+};
+
+static const u8 T14[] = {
+ 0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0,
+ 0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19
+};
+
+/* Test Case 15 */
+# define A15 A14
+static const u8 K15[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+ 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
+ 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08
+};
+
+static const u8 P15[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+
+static const u8 IV15[] = {
+ 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
+ 0xde, 0xca, 0xf8, 0x88
+};
+
+static const u8 C15[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
+ 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
+ 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
+ 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
+ 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
+};
+
+static const u8 T15[] = {
+ 0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd,
+ 0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c
+};
+
+/* Test Case 16 */
+# define K16 K15
+# define IV16 IV15
+static const u8 P16[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39
+};
+
+static const u8 A16[] = {
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xab, 0xad, 0xda, 0xd2
+};
+
+static const u8 C16[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
+ 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
+ 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
+ 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
+ 0xbc, 0xc9, 0xf6, 0x62
+};
+
+static const u8 T16[] = {
+ 0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68,
+ 0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b
+};
+
+/* Test Case 17 */
+# define K17 K16
+# define P17 P16
+# define A17 A16
+static const u8 IV17[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad };
+
+static const u8 C17[] = {
+ 0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32,
+ 0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb,
+ 0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa,
+ 0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0,
+ 0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0,
+ 0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78,
+ 0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99,
+ 0xf4, 0x7c, 0x9b, 0x1f
+};
+
+static const u8 T17[] = {
+ 0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4,
+ 0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2
+};
+
+/* Test Case 18 */
+# define K18 K17
+# define P18 P17
+# define A18 A17
+static const u8 IV18[] = {
+ 0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
+ 0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
+ 0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
+ 0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
+ 0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
+ 0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
+ 0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
+ 0xa6, 0x37, 0xb3, 0x9b
+};
+
+static const u8 C18[] = {
+ 0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1,
+ 0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20,
+ 0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19,
+ 0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4,
+ 0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45,
+ 0x2d, 0xa3, 0xeb, 0xf1, 0xc5, 0xd8, 0x2c, 0xde,
+ 0xa2, 0x41, 0x89, 0x97, 0x20, 0x0e, 0xf8, 0x2e,
+ 0x44, 0xae, 0x7e, 0x3f
+};
+
+static const u8 T18[] = {
+ 0xa4, 0x4a, 0x82, 0x66, 0xee, 0x1c, 0x8e, 0xb0,
+ 0xc8, 0xb5, 0xd4, 0xcf, 0x5a, 0xe9, 0xf1, 0x9a
+};
+
+/* Test Case 19 */
+# define K19 K1
+# define P19 P1
+# define IV19 IV1
+# define C19 C1
+static const u8 A19[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
+ 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
+ 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
+ 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
+ 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55,
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
+ 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
+ 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
+ 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
+ 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
+};
+
+static const u8 T19[] = {
+ 0x5f, 0xea, 0x79, 0x3a, 0x2d, 0x6f, 0x97, 0x4d,
+ 0x37, 0xe6, 0x8e, 0x0c, 0xb8, 0xff, 0x94, 0x92
+};
+
+/* Test Case 20 */
+# define K20 K1
+# define A20 A1
+/* this results in 0xff in counter LSB */
+static const u8 IV20[64] = { 0xff, 0xff, 0xff, 0xff };
+
+static const u8 P20[288];
+static const u8 C20[] = {
+ 0x56, 0xb3, 0x37, 0x3c, 0xa9, 0xef, 0x6e, 0x4a,
+ 0x2b, 0x64, 0xfe, 0x1e, 0x9a, 0x17, 0xb6, 0x14,
+ 0x25, 0xf1, 0x0d, 0x47, 0xa7, 0x5a, 0x5f, 0xce,
+ 0x13, 0xef, 0xc6, 0xbc, 0x78, 0x4a, 0xf2, 0x4f,
+ 0x41, 0x41, 0xbd, 0xd4, 0x8c, 0xf7, 0xc7, 0x70,
+ 0x88, 0x7a, 0xfd, 0x57, 0x3c, 0xca, 0x54, 0x18,
+ 0xa9, 0xae, 0xff, 0xcd, 0x7c, 0x5c, 0xed, 0xdf,
+ 0xc6, 0xa7, 0x83, 0x97, 0xb9, 0xa8, 0x5b, 0x49,
+ 0x9d, 0xa5, 0x58, 0x25, 0x72, 0x67, 0xca, 0xab,
+ 0x2a, 0xd0, 0xb2, 0x3c, 0xa4, 0x76, 0xa5, 0x3c,
+ 0xb1, 0x7f, 0xb4, 0x1c, 0x4b, 0x8b, 0x47, 0x5c,
+ 0xb4, 0xf3, 0xf7, 0x16, 0x50, 0x94, 0xc2, 0x29,
+ 0xc9, 0xe8, 0xc4, 0xdc, 0x0a, 0x2a, 0x5f, 0xf1,
+ 0x90, 0x3e, 0x50, 0x15, 0x11, 0x22, 0x13, 0x76,
+ 0xa1, 0xcd, 0xb8, 0x36, 0x4c, 0x50, 0x61, 0xa2,
+ 0x0c, 0xae, 0x74, 0xbc, 0x4a, 0xcd, 0x76, 0xce,
+ 0xb0, 0xab, 0xc9, 0xfd, 0x32, 0x17, 0xef, 0x9f,
+ 0x8c, 0x90, 0xbe, 0x40, 0x2d, 0xdf, 0x6d, 0x86,
+ 0x97, 0xf4, 0xf8, 0x80, 0xdf, 0xf1, 0x5b, 0xfb,
+ 0x7a, 0x6b, 0x28, 0x24, 0x1e, 0xc8, 0xfe, 0x18,
+ 0x3c, 0x2d, 0x59, 0xe3, 0xf9, 0xdf, 0xff, 0x65,
+ 0x3c, 0x71, 0x26, 0xf0, 0xac, 0xb9, 0xe6, 0x42,
+ 0x11, 0xf4, 0x2b, 0xae, 0x12, 0xaf, 0x46, 0x2b,
+ 0x10, 0x70, 0xbe, 0xf1, 0xab, 0x5e, 0x36, 0x06,
+ 0x87, 0x2c, 0xa1, 0x0d, 0xee, 0x15, 0xb3, 0x24,
+ 0x9b, 0x1a, 0x1b, 0x95, 0x8f, 0x23, 0x13, 0x4c,
+ 0x4b, 0xcc, 0xb7, 0xd0, 0x32, 0x00, 0xbc, 0xe4,
+ 0x20, 0xa2, 0xf8, 0xeb, 0x66, 0xdc, 0xf3, 0x64,
+ 0x4d, 0x14, 0x23, 0xc1, 0xb5, 0x69, 0x90, 0x03,
+ 0xc1, 0x3e, 0xce, 0xf4, 0xbf, 0x38, 0xa3, 0xb6,
+ 0x0e, 0xed, 0xc3, 0x40, 0x33, 0xba, 0xc1, 0x90,
+ 0x27, 0x83, 0xdc, 0x6d, 0x89, 0xe2, 0xe7, 0x74,
+ 0x18, 0x8a, 0x43, 0x9c, 0x7e, 0xbc, 0xc0, 0x67,
+ 0x2d, 0xbd, 0xa4, 0xdd, 0xcf, 0xb2, 0x79, 0x46,
+ 0x13, 0xb0, 0xbe, 0x41, 0x31, 0x5e, 0xf7, 0x78,
+ 0x70, 0x8a, 0x70, 0xee, 0x7d, 0x75, 0x16, 0x5c
+};
+
+static const u8 T20[] = {
+ 0x8b, 0x30, 0x7f, 0x6b, 0x33, 0x28, 0x6d, 0x0a,
+ 0xb0, 0x26, 0xa9, 0xed, 0x3f, 0xe1, 0xe8, 0x5f
+};
+
+# define TEST_CASE(n) do { \
+ u8 out[sizeof(P##n)]; \
+ AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
+ CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
+ CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
+ memset(out,0,sizeof(out)); \
+ if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
+ if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
+ if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
+ (C##n && memcmp(out,C##n,sizeof(out)))) \
+ ret++, printf ("encrypt test#%d failed.\n",n); \
+ CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
+ memset(out,0,sizeof(out)); \
+ if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
+ if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
+ if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
+ (P##n && memcmp(out,P##n,sizeof(out)))) \
+ ret++, printf ("decrypt test#%d failed.\n",n); \
+ } while(0)
+
+int main()
+{
+ GCM128_CONTEXT ctx;
+ AES_KEY key;
+ int ret = 0;
+
+ TEST_CASE(1);
+ TEST_CASE(2);
+ TEST_CASE(3);
+ TEST_CASE(4);
+ TEST_CASE(5);
+ TEST_CASE(6);
+ TEST_CASE(7);
+ TEST_CASE(8);
+ TEST_CASE(9);
+ TEST_CASE(10);
+ TEST_CASE(11);
+ TEST_CASE(12);
+ TEST_CASE(13);
+ TEST_CASE(14);
+ TEST_CASE(15);
+ TEST_CASE(16);
+ TEST_CASE(17);
+ TEST_CASE(18);
+ TEST_CASE(19);
+ TEST_CASE(20);
+
+# ifdef OPENSSL_CPUID_OBJ
+ {
+ size_t start, stop, gcm_t, ctr_t, OPENSSL_rdtsc();
+ union {
+ u64 u;
+ u8 c[1024];
+ } buf;
+ int i;
+
+ AES_set_encrypt_key(K1, sizeof(K1) * 8, &key);
+ CRYPTO_gcm128_init(&ctx, &key, (block128_f) AES_encrypt);
+ CRYPTO_gcm128_setiv(&ctx, IV1, sizeof(IV1));
+
+ CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf));
+ start = OPENSSL_rdtsc();
+ CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf));
+ gcm_t = OPENSSL_rdtsc() - start;
+
+ CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf),
+ &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres,
+ (block128_f) AES_encrypt);
+ start = OPENSSL_rdtsc();
+ CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf),
+ &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres,
+ (block128_f) AES_encrypt);
+ ctr_t = OPENSSL_rdtsc() - start;
+
+ printf("%.2f-%.2f=%.2f\n",
+ gcm_t / (double)sizeof(buf),
+ ctr_t / (double)sizeof(buf),
+ (gcm_t - ctr_t) / (double)sizeof(buf));
+# ifdef GHASH
+ {
+ void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
+ const u8 *inp, size_t len) = ctx.ghash;
+
+ GHASH((&ctx), buf.c, sizeof(buf));
+ start = OPENSSL_rdtsc();
+ for (i = 0; i < 100; ++i)
+ GHASH((&ctx), buf.c, sizeof(buf));
+ gcm_t = OPENSSL_rdtsc() - start;
+ printf("%.2f\n", gcm_t / (double)sizeof(buf) / (double)i);
+ }
+# endif
+ }
+# endif
+
+ return ret;
+}
+#endif
diff --git a/openssl-1.1.0h/crypto/modes/modes_lcl.h b/openssl-1.1.0h/crypto/modes/modes_lcl.h
new file mode 100644
index 0000000..7a1603b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/modes_lcl.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/modes.h>
+
+#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
+typedef __int64 i64;
+typedef unsigned __int64 u64;
+# define U64(C) C##UI64
+#elif defined(__arch64__)
+typedef long i64;
+typedef unsigned long u64;
+# define U64(C) C##UL
+#else
+typedef long long i64;
+typedef unsigned long long u64;
+# define U64(C) C##ULL
+#endif
+
+typedef unsigned int u32;
+typedef unsigned char u8;
+
+#define STRICT_ALIGNMENT 1
+#ifndef PEDANTIC
+# if defined(__i386) || defined(__i386__) || \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__aarch64__) || \
+ defined(__s390__) || defined(__s390x__)
+# undef STRICT_ALIGNMENT
+# endif
+#endif
+
+#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+# if defined(__GNUC__) && __GNUC__>=2
+# if defined(__x86_64) || defined(__x86_64__)
+# define BSWAP8(x) ({ u64 ret_=(x); \
+ asm ("bswapq %0" \
+ : "+r"(ret_)); ret_; })
+# define BSWAP4(x) ({ u32 ret_=(x); \
+ asm ("bswapl %0" \
+ : "+r"(ret_)); ret_; })
+# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
+# define BSWAP8(x) ({ u32 lo_=(u64)(x)>>32,hi_=(x); \
+ asm ("bswapl %0; bswapl %1" \
+ : "+r"(hi_),"+r"(lo_)); \
+ (u64)hi_<<32|lo_; })
+# define BSWAP4(x) ({ u32 ret_=(x); \
+ asm ("bswapl %0" \
+ : "+r"(ret_)); ret_; })
+# elif defined(__aarch64__)
+# define BSWAP8(x) ({ u64 ret_; \
+ asm ("rev %0,%1" \
+ : "=r"(ret_) : "r"(x)); ret_; })
+# define BSWAP4(x) ({ u32 ret_; \
+ asm ("rev %w0,%w1" \
+ : "=r"(ret_) : "r"(x)); ret_; })
+# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
+# define BSWAP8(x) ({ u32 lo_=(u64)(x)>>32,hi_=(x); \
+ asm ("rev %0,%0; rev %1,%1" \
+ : "+r"(hi_),"+r"(lo_)); \
+ (u64)hi_<<32|lo_; })
+# define BSWAP4(x) ({ u32 ret_; \
+ asm ("rev %0,%1" \
+ : "=r"(ret_) : "r"((u32)(x))); \
+ ret_; })
+# endif
+# elif defined(_MSC_VER)
+# if _MSC_VER>=1300
+# pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
+# define BSWAP8(x) _byteswap_uint64((u64)(x))
+# define BSWAP4(x) _byteswap_ulong((u32)(x))
+# elif defined(_M_IX86)
+__inline u32 _bswap4(u32 val)
+{
+    _asm mov eax, val
+    _asm bswap eax
+}
+# define BSWAP4(x) _bswap4(x)
+# endif
+# endif
+#endif
+#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
+# define GETU32(p) BSWAP4(*(const u32 *)(p))
+# define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
+#else
+# define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
+# define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
+#endif
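+/*
+ * GETU32/PUTU32 are big-endian 32-bit loads and stores: for example, with
+ * p[0..3] = {0x01,0x02,0x03,0x04}, GETU32(p) yields 0x01020304 on either
+ * byte order (via BSWAP4 on little-endian targets when unaligned access
+ * is safe, byte by byte otherwise).
+ */
+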
+/*- GCM definitions */
+typedef struct {
+ u64 hi, lo;
+} u128;
+
+#ifdef TABLE_BITS
+# undef TABLE_BITS
+#endif
+/*
+ * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
+ * never be set to 8 [or 1]. For further information see gcm128.c.
+ */
+#define TABLE_BITS 4
+
+struct gcm128_context {
+ /* Following 6 names follow names in GCM specification */
+ union {
+ u64 u[2];
+ u32 d[4];
+ u8 c[16];
+ size_t t[16 / sizeof(size_t)];
+ } Yi, EKi, EK0, len, Xi, H;
+ /*
+ * Relative position of Xi, H and pre-computed Htable is used in some
+ * assembler modules, i.e. don't change the order!
+ */
+#if TABLE_BITS==8
+ u128 Htable[256];
+#else
+ u128 Htable[16];
+ void (*gmult) (u64 Xi[2], const u128 Htable[16]);
+ void (*ghash) (u64 Xi[2], const u128 Htable[16], const u8 *inp,
+ size_t len);
+#endif
+ unsigned int mres, ares;
+ block128_f block;
+ void *key;
+};
+
+struct xts128_context {
+ void *key1, *key2;
+ block128_f block1, block2;
+};
+
+struct ccm128_context {
+ union {
+ u64 u[2];
+ u8 c[16];
+ } nonce, cmac;
+ u64 blocks;
+ block128_f block;
+ void *key;
+};
+
+#ifndef OPENSSL_NO_OCB
+
+typedef union {
+ u64 a[2];
+ unsigned char c[16];
+} OCB_BLOCK;
+# define ocb_block16_xor(in1,in2,out) \
+ ( (out)->a[0]=(in1)->a[0]^(in2)->a[0], \
+ (out)->a[1]=(in1)->a[1]^(in2)->a[1] )
+# if STRICT_ALIGNMENT
+# define ocb_block16_xor_misaligned(in1,in2,out) \
+ ocb_block_xor((in1)->c,(in2)->c,16,(out)->c)
+# else
+# define ocb_block16_xor_misaligned ocb_block16_xor
+# endif
+
+struct ocb128_context {
+ /* Need both encrypt and decrypt key schedules for decryption */
+ block128_f encrypt;
+ block128_f decrypt;
+ void *keyenc;
+ void *keydec;
+ ocb128_f stream; /* direction dependent */
+ /* Key dependent variables. Can be reused if key remains the same */
+ size_t l_index;
+ size_t max_l_index;
+ OCB_BLOCK l_star;
+ OCB_BLOCK l_dollar;
+ OCB_BLOCK *l;
+ /* Must be reset for each session */
+ u64 blocks_hashed;
+ u64 blocks_processed;
+ OCB_BLOCK tag;
+ OCB_BLOCK offset_aad;
+ OCB_BLOCK sum;
+ OCB_BLOCK offset;
+ OCB_BLOCK checksum;
+};
+#endif /* OPENSSL_NO_OCB */
diff --git a/openssl-1.1.0h/crypto/modes/ocb128.c b/openssl-1.1.0h/crypto/modes/ocb128.c
new file mode 100644
index 0000000..db794d0
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/ocb128.c
@@ -0,0 +1,568 @@
+/*
+ * Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <string.h>
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+
+#ifndef OPENSSL_NO_OCB
+
+/*
+ * Calculate the number of binary trailing zero's in any given number
+ */
+static u32 ocb_ntz(u64 n)
+{
+ u32 cnt = 0;
+
+ /*
+     * We do a simple right-to-left sequential search. This is surprisingly
+     * efficient because the distribution of trailing zeros is not uniform:
+     * the number of possible inputs with no trailing zeros equals the
+     * number with 1 or more, the number with exactly 1 equals the number
+     * with 2 or more, and so on. Checking the last two bits covers 75% of
+     * all numbers; checking the last three covers 87.5%.
+ */
+ while (!(n & 1)) {
+ n >>= 1;
+ cnt++;
+ }
+ return cnt;
+}
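
Odd block indices dominate, so the loop usually exits after one or two iterations; note the function is never called with 0, for which the loop would not terminate. Hypothetical spot checks, assuming <assert.h>:

    assert(ocb_ntz(1) == 0);   /* odd: no trailing zeros */
    assert(ocb_ntz(6) == 1);   /* 6 == 0b110 */
    assert(ocb_ntz(8) == 3);   /* 8 == 0b1000 */
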
+
+/*
+ * Shift a block of 16 bytes left by shift bits
+ */
+static void ocb_block_lshift(const unsigned char *in, size_t shift,
+ unsigned char *out)
+{
+ unsigned char shift_mask;
+ int i;
+ unsigned char mask[15];
+
+ shift_mask = 0xff;
+ shift_mask <<= (8 - shift);
+ for (i = 15; i >= 0; i--) {
+ if (i > 0) {
+ mask[i - 1] = in[i] & shift_mask;
+ mask[i - 1] >>= 8 - shift;
+ }
+ out[i] = in[i] << shift;
+
+ if (i != 15) {
+ out[i] ^= mask[i];
+ }
+ }
+}
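
Each byte's top shift bits carry into the byte to its left. A hypothetical spot check with shift == 1, assuming <assert.h>:

    unsigned char in[16] = { 0x01, 0x80 }, out[16];

    ocb_block_lshift(in, 1, out);
    assert(out[0] == 0x03);    /* in[0] << 1, plus the MSB carried out of in[1] */
    assert(out[1] == 0x00);
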
+
+/*
+ * Perform a "double" operation as per OCB spec
+ */
+static void ocb_double(OCB_BLOCK *in, OCB_BLOCK *out)
+{
+ unsigned char mask;
+
+ /*
+ * Calculate the mask based on the most significant bit. There are more
+ * efficient ways to do this - but this way is constant time
+ */
+ mask = in->c[0] & 0x80;
+ mask >>= 7;
+ mask *= 135;
+
+ ocb_block_lshift(in->c, 1, out->c);
+
+ out->c[15] ^= mask;
+}
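
The constant 135 (0x87) is the low byte of the OCB reduction polynomial x^128 + x^7 + x^2 + x + 1: whenever the shifted-out MSB is 1, it folds back in as x^7 + x^2 + x + 1. A hypothetical check, assuming <string.h>:

    OCB_BLOCK a, b;

    memset(a.c, 0, 16);
    a.c[0] = 0x80;             /* the polynomial x^127 */
    ocb_double(&a, &b);        /* b is all zero except b.c[15] == 0x87 */
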
+
+/*
+ * Perform an xor on in1 and in2 - each of len bytes. Store result in out
+ */
+static void ocb_block_xor(const unsigned char *in1,
+ const unsigned char *in2, size_t len,
+ unsigned char *out)
+{
+ size_t i;
+ for (i = 0; i < len; i++) {
+ out[i] = in1[i] ^ in2[i];
+ }
+}
+
+/*
+ * Look up L_index in our lookup table. If we haven't already got it, we need
+ * to calculate it
+ */
+static OCB_BLOCK *ocb_lookup_l(OCB128_CONTEXT *ctx, size_t idx)
+{
+ size_t l_index = ctx->l_index;
+
+ if (idx <= l_index) {
+ return ctx->l + idx;
+ }
+
+ /* We don't have it - so calculate it */
+ if (idx >= ctx->max_l_index) {
+ void *tmp_ptr;
+ /*
+             * Each additional entry allows us to process almost twice as
+             * much data, so the table needs to be expanded in smaller and
+             * smaller increments. Originally it doubled in size, which was
+             * wasteful. Growing it linearly is not formally optimal, but is
+             * simpler to implement. We grow the table by the minimal
+             * multiple of 4 entries that accommodates the index.
+ */
+ ctx->max_l_index += (idx - ctx->max_l_index + 4) & ~3;
+ tmp_ptr =
+ OPENSSL_realloc(ctx->l, ctx->max_l_index * sizeof(OCB_BLOCK));
+ if (tmp_ptr == NULL) /* prevent ctx->l from being clobbered */
+ return NULL;
+ ctx->l = tmp_ptr;
+ }
+ while (l_index < idx) {
+ ocb_double(ctx->l + l_index, ctx->l + l_index + 1);
+ l_index++;
+ }
+ ctx->l_index = l_index;
+
+ return ctx->l + idx;
+}
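
The growth expression rounds the shortfall up to a multiple of four entries. A worked example with hypothetical values:

    size_t max_l_index = 5, idx = 9;

    max_l_index += (idx - max_l_index + 4) & ~3;   /* (4 + 4) & ~3 == 8, so 13 */
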
+
+/*
+ * Create a new OCB128_CONTEXT
+ */
+OCB128_CONTEXT *CRYPTO_ocb128_new(void *keyenc, void *keydec,
+ block128_f encrypt, block128_f decrypt,
+ ocb128_f stream)
+{
+ OCB128_CONTEXT *octx;
+ int ret;
+
+ if ((octx = OPENSSL_malloc(sizeof(*octx))) != NULL) {
+ ret = CRYPTO_ocb128_init(octx, keyenc, keydec, encrypt, decrypt,
+ stream);
+ if (ret)
+ return octx;
+ OPENSSL_free(octx);
+ }
+
+ return NULL;
+}
+
+/*
+ * Initialise an existing OCB128_CONTEXT
+ */
+int CRYPTO_ocb128_init(OCB128_CONTEXT *ctx, void *keyenc, void *keydec,
+ block128_f encrypt, block128_f decrypt,
+ ocb128_f stream)
+{
+ memset(ctx, 0, sizeof(*ctx));
+ ctx->l_index = 0;
+ ctx->max_l_index = 5;
+ ctx->l = OPENSSL_malloc(ctx->max_l_index * 16);
+ if (ctx->l == NULL)
+ return 0;
+
+ /*
+ * We set both the encryption and decryption key schedules - decryption
+     * needs both. We don't really need the decryption schedule if we're only
+     * encrypting, but taking it anyway simplifies things
+ */
+ ctx->encrypt = encrypt;
+ ctx->decrypt = decrypt;
+ ctx->stream = stream;
+ ctx->keyenc = keyenc;
+ ctx->keydec = keydec;
+
+ /* L_* = ENCIPHER(K, zeros(128)) */
+ ctx->encrypt(ctx->l_star.c, ctx->l_star.c, ctx->keyenc);
+
+ /* L_$ = double(L_*) */
+ ocb_double(&ctx->l_star, &ctx->l_dollar);
+
+ /* L_0 = double(L_$) */
+ ocb_double(&ctx->l_dollar, ctx->l);
+
+ /* L_{i} = double(L_{i-1}) */
+ ocb_double(ctx->l, ctx->l+1);
+ ocb_double(ctx->l+1, ctx->l+2);
+ ocb_double(ctx->l+2, ctx->l+3);
+ ocb_double(ctx->l+3, ctx->l+4);
+ ctx->l_index = 4; /* enough to process up to 496 bytes */
+
+ return 1;
+}
+
+/*
+ * Copy an OCB128_CONTEXT object
+ */
+int CRYPTO_ocb128_copy_ctx(OCB128_CONTEXT *dest, OCB128_CONTEXT *src,
+ void *keyenc, void *keydec)
+{
+ memcpy(dest, src, sizeof(OCB128_CONTEXT));
+ if (keyenc)
+ dest->keyenc = keyenc;
+ if (keydec)
+ dest->keydec = keydec;
+ if (src->l) {
+ dest->l = OPENSSL_malloc(src->max_l_index * 16);
+ if (dest->l == NULL)
+ return 0;
+ memcpy(dest->l, src->l, (src->l_index + 1) * 16);
+ }
+ return 1;
+}
+
+/*
+ * Set the IV to be used for this operation. Must be 1-15 bytes.
+ */
+int CRYPTO_ocb128_setiv(OCB128_CONTEXT *ctx, const unsigned char *iv,
+ size_t len, size_t taglen)
+{
+ unsigned char ktop[16], tmp[16], mask;
+ unsigned char stretch[24], nonce[16];
+ size_t bottom, shift;
+
+ /*
+     * The spec says the IV is 120 bits or fewer and allows non-byte-aligned
+     * lengths. We don't support those at this stage
+ */
+ if ((len > 15) || (len < 1) || (taglen > 16) || (taglen < 1)) {
+ return -1;
+ }
+
+ /* Nonce = num2str(TAGLEN mod 128,7) || zeros(120-bitlen(N)) || 1 || N */
+ nonce[0] = ((taglen * 8) % 128) << 1;
+ memset(nonce + 1, 0, 15);
+ memcpy(nonce + 16 - len, iv, len);
+ nonce[15 - len] |= 1;
+
+ /* Ktop = ENCIPHER(K, Nonce[1..122] || zeros(6)) */
+ memcpy(tmp, nonce, 16);
+ tmp[15] &= 0xc0;
+ ctx->encrypt(tmp, ktop, ctx->keyenc);
+
+ /* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */
+ memcpy(stretch, ktop, 16);
+ ocb_block_xor(ktop, ktop + 1, 8, stretch + 16);
+
+ /* bottom = str2num(Nonce[123..128]) */
+ bottom = nonce[15] & 0x3f;
+
+ /* Offset_0 = Stretch[1+bottom..128+bottom] */
+ shift = bottom % 8;
+ ocb_block_lshift(stretch + (bottom / 8), shift, ctx->offset.c);
+ mask = 0xff;
+ mask <<= 8 - shift;
+ ctx->offset.c[15] |=
+ (*(stretch + (bottom / 8) + 16) & mask) >> (8 - shift);
+
+ return 1;
+}
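
For the common taglen == 12 with a 12-byte IV N, the assembled nonce therefore looks like this (hypothetical values, traced from the code above):

    /* nonce[0]     == ((12 * 8) % 128) << 1 == 0xC0  (tag bits in the top 7 bits)
     * nonce[1..2]  == 0, nonce[3] == 0x01             (the '1' marker before N)
     * nonce[4..15] == N[0..11] */
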
+
+/*
+ * Provide any AAD. This can be called multiple times. Only the final time can
+ * have a partial block
+ */
+int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad,
+ size_t len)
+{
+ u64 i, all_num_blocks;
+ size_t num_blocks, last_len;
+ OCB_BLOCK tmp;
+
+ /* Calculate the number of blocks of AAD provided now, and so far */
+ num_blocks = len / 16;
+ all_num_blocks = num_blocks + ctx->blocks_hashed;
+
+ /* Loop through all full blocks of AAD */
+ for (i = ctx->blocks_hashed + 1; i <= all_num_blocks; i++) {
+ OCB_BLOCK *lookup;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ lookup = ocb_lookup_l(ctx, ocb_ntz(i));
+ if (lookup == NULL)
+ return 0;
+ ocb_block16_xor(&ctx->offset_aad, lookup, &ctx->offset_aad);
+
+ memcpy(tmp.c, aad, 16);
+ aad += 16;
+
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ ocb_block16_xor(&ctx->offset_aad, &tmp, &tmp);
+ ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
+ ocb_block16_xor(&tmp, &ctx->sum, &ctx->sum);
+ }
+
+ /*
+ * Check if we have any partial blocks left over. This is only valid in the
+ * last call to this function
+ */
+ last_len = len % 16;
+
+ if (last_len > 0) {
+ /* Offset_* = Offset_m xor L_* */
+ ocb_block16_xor(&ctx->offset_aad, &ctx->l_star, &ctx->offset_aad);
+
+ /* CipherInput = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */
+ memset(tmp.c, 0, 16);
+ memcpy(tmp.c, aad, last_len);
+ tmp.c[last_len] = 0x80;
+ ocb_block16_xor(&ctx->offset_aad, &tmp, &tmp);
+
+ /* Sum = Sum_m xor ENCIPHER(K, CipherInput) */
+ ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
+ ocb_block16_xor(&tmp, &ctx->sum, &ctx->sum);
+ }
+
+ ctx->blocks_hashed = all_num_blocks;
+
+ return 1;
+}
+
+/*
+ * Provide any data to be encrypted. This can be called multiple times. Only
+ * the final time can have a partial block
+ */
+int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len)
+{
+ u64 i, all_num_blocks;
+ size_t num_blocks, last_len;
+
+ /*
+ * Calculate the number of blocks of data to be encrypted provided now, and
+ * so far
+ */
+ num_blocks = len / 16;
+ all_num_blocks = num_blocks + ctx->blocks_processed;
+
+ if (num_blocks && all_num_blocks == (size_t)all_num_blocks
+ && ctx->stream != NULL) {
+ size_t max_idx = 0, top = (size_t)all_num_blocks;
+
+ /*
+ * See how many L_{i} entries we need to process data at hand
+ * and pre-compute missing entries in the table [if any]...
+ */
+ while (top >>= 1)
+ max_idx++;
+ if (ocb_lookup_l(ctx, max_idx) == NULL)
+ return 0;
+
+ ctx->stream(in, out, num_blocks, ctx->keyenc,
+ (size_t)ctx->blocks_processed + 1, ctx->offset.c,
+ (const unsigned char (*)[16])ctx->l, ctx->checksum.c);
+ } else {
+ /* Loop through all full blocks to be encrypted */
+ for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
+ OCB_BLOCK *lookup;
+ OCB_BLOCK tmp;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ lookup = ocb_lookup_l(ctx, ocb_ntz(i));
+ if (lookup == NULL)
+ return 0;
+ ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
+
+ memcpy(tmp.c, in, 16);
+ in += 16;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ocb_block16_xor(&tmp, &ctx->checksum, &ctx->checksum);
+
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ ocb_block16_xor(&ctx->offset, &tmp, &tmp);
+ ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
+ ocb_block16_xor(&ctx->offset, &tmp, &tmp);
+
+ memcpy(out, tmp.c, 16);
+ out += 16;
+ }
+ }
+
+ /*
+ * Check if we have any partial blocks left over. This is only valid in the
+ * last call to this function
+ */
+ last_len = len % 16;
+
+ if (last_len > 0) {
+ OCB_BLOCK pad;
+
+ /* Offset_* = Offset_m xor L_* */
+ ocb_block16_xor(&ctx->offset, &ctx->l_star, &ctx->offset);
+
+ /* Pad = ENCIPHER(K, Offset_*) */
+ ctx->encrypt(ctx->offset.c, pad.c, ctx->keyenc);
+
+ /* C_* = P_* xor Pad[1..bitlen(P_*)] */
+ ocb_block_xor(in, pad.c, last_len, out);
+
+ /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
+ memset(pad.c, 0, 16); /* borrow pad */
+ memcpy(pad.c, in, last_len);
+ pad.c[last_len] = 0x80;
+ ocb_block16_xor(&pad, &ctx->checksum, &ctx->checksum);
+ }
+
+ ctx->blocks_processed = all_num_blocks;
+
+ return 1;
+}
+
+/*
+ * Provide any data to be decrypted. This can be called multiple times. Only
+ * the final time can have a partial block
+ */
+int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len)
+{
+ u64 i, all_num_blocks;
+ size_t num_blocks, last_len;
+
+ /*
+ * Calculate the number of blocks of data to be decrypted provided now, and
+ * so far
+ */
+ num_blocks = len / 16;
+ all_num_blocks = num_blocks + ctx->blocks_processed;
+
+ if (num_blocks && all_num_blocks == (size_t)all_num_blocks
+ && ctx->stream != NULL) {
+ size_t max_idx = 0, top = (size_t)all_num_blocks;
+
+ /*
+ * See how many L_{i} entries we need to process data at hand
+ * and pre-compute missing entries in the table [if any]...
+ */
+ while (top >>= 1)
+ max_idx++;
+ if (ocb_lookup_l(ctx, max_idx) == NULL)
+ return 0;
+
+ ctx->stream(in, out, num_blocks, ctx->keydec,
+ (size_t)ctx->blocks_processed + 1, ctx->offset.c,
+ (const unsigned char (*)[16])ctx->l, ctx->checksum.c);
+ } else {
+ OCB_BLOCK tmp;
+
+ /* Loop through all full blocks to be decrypted */
+ for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ OCB_BLOCK *lookup = ocb_lookup_l(ctx, ocb_ntz(i));
+ if (lookup == NULL)
+ return 0;
+ ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
+
+ memcpy(tmp.c, in, 16);
+ in += 16;
+
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ ocb_block16_xor(&ctx->offset, &tmp, &tmp);
+ ctx->decrypt(tmp.c, tmp.c, ctx->keydec);
+ ocb_block16_xor(&ctx->offset, &tmp, &tmp);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ocb_block16_xor(&tmp, &ctx->checksum, &ctx->checksum);
+
+ memcpy(out, tmp.c, 16);
+ out += 16;
+ }
+ }
+
+ /*
+ * Check if we have any partial blocks left over. This is only valid in the
+ * last call to this function
+ */
+ last_len = len % 16;
+
+ if (last_len > 0) {
+ OCB_BLOCK pad;
+
+ /* Offset_* = Offset_m xor L_* */
+ ocb_block16_xor(&ctx->offset, &ctx->l_star, &ctx->offset);
+
+ /* Pad = ENCIPHER(K, Offset_*) */
+ ctx->encrypt(ctx->offset.c, pad.c, ctx->keyenc);
+
+ /* P_* = C_* xor Pad[1..bitlen(C_*)] */
+ ocb_block_xor(in, pad.c, last_len, out);
+
+ /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
+ memset(pad.c, 0, 16); /* borrow pad */
+ memcpy(pad.c, out, last_len);
+ pad.c[last_len] = 0x80;
+ ocb_block16_xor(&pad, &ctx->checksum, &ctx->checksum);
+ }
+
+ ctx->blocks_processed = all_num_blocks;
+
+ return 1;
+}
+
+/*
+ * Calculate the tag and verify it against the supplied tag
+ */
+int CRYPTO_ocb128_finish(OCB128_CONTEXT *ctx, const unsigned char *tag,
+ size_t len)
+{
+ OCB_BLOCK tmp;
+
+ /*
+ * Tag = ENCIPHER(K, Checksum_* xor Offset_* xor L_$) xor HASH(K,A)
+ */
+ ocb_block16_xor(&ctx->checksum, &ctx->offset, &tmp);
+ ocb_block16_xor(&ctx->l_dollar, &tmp, &tmp);
+ ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
+ ocb_block16_xor(&tmp, &ctx->sum, &ctx->tag);
+
+ if (len > 16 || len < 1) {
+ return -1;
+ }
+
+ /* Compare the tag if we've been given one */
+ if (tag)
+ return CRYPTO_memcmp(&ctx->tag, tag, len);
+ else
+ return -1;
+}
+
+/*
+ * Retrieve the calculated tag
+ */
+int CRYPTO_ocb128_tag(OCB128_CONTEXT *ctx, unsigned char *tag, size_t len)
+{
+ if (len > 16 || len < 1) {
+ return -1;
+ }
+
+ /* Calculate the tag */
+ CRYPTO_ocb128_finish(ctx, NULL, 0);
+
+ /* Copy the tag into the supplied buffer */
+ memcpy(tag, ctx->tag.c, len);
+
+ return 1;
+}
+
+/*
+ * Release all resources
+ */
+void CRYPTO_ocb128_cleanup(OCB128_CONTEXT *ctx)
+{
+ if (ctx) {
+ OPENSSL_clear_free(ctx->l, ctx->max_l_index * 16);
+ OPENSSL_cleanse(ctx, sizeof(*ctx));
+ }
+}
+
+#endif /* OPENSSL_NO_OCB */
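
A minimal encrypt-and-tag sketch of the API above, assuming AES from <openssl/aes.h> with the usual block128_f casts (hypothetical helper, error handling trimmed):

    #include <openssl/aes.h>
    #include <openssl/crypto.h>
    #include <openssl/modes.h>

    int ocb_seal(const unsigned char key[16], const unsigned char iv[12],
                 const unsigned char *pt, size_t ptlen,
                 unsigned char *ct, unsigned char tag[16])
    {
        AES_KEY ke, kd;
        OCB128_CONTEXT *octx;
        int ok;

        AES_set_encrypt_key(key, 128, &ke);
        AES_set_decrypt_key(key, 128, &kd);
        octx = CRYPTO_ocb128_new(&ke, &kd, (block128_f)AES_encrypt,
                                 (block128_f)AES_decrypt, NULL);
        if (octx == NULL)
            return 0;
        ok = CRYPTO_ocb128_setiv(octx, iv, 12, 16) == 1
             && CRYPTO_ocb128_encrypt(octx, pt, ct, ptlen)
             && CRYPTO_ocb128_tag(octx, tag, 16) == 1;
        CRYPTO_ocb128_cleanup(octx);   /* cleanses, but does not free octx */
        OPENSSL_free(octx);
        return ok;
    }
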
diff --git a/openssl-1.1.0h/crypto/modes/ofb128.c b/openssl-1.1.0h/crypto/modes/ofb128.c
new file mode 100644
index 0000000..8309256
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/ofb128.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+/*
+ * The input and output are encrypted as though 128-bit OFB mode is being
+ * used. The extra state information recording how much of the 128-bit block
+ * we have used is contained in *num.
+ */
+void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16], int *num, block128_f block)
+{
+ unsigned int n;
+ size_t l = 0;
+
+ n = *num;
+
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (16 % sizeof(size_t) == 0) { /* always true actually */
+ do {
+ while (n && len) {
+ *(out++) = *(in++) ^ ivec[n];
+ --len;
+ n = (n + 1) % 16;
+ }
+# if defined(STRICT_ALIGNMENT)
+ if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
+ 0)
+ break;
+# endif
+ while (len >= 16) {
+ (*block) (ivec, ivec, key);
+ for (; n < 16; n += sizeof(size_t))
+ *(size_t *)(out + n) =
+ *(size_t *)(in + n) ^ *(size_t *)(ivec + n);
+ len -= 16;
+ out += 16;
+ in += 16;
+ n = 0;
+ }
+ if (len) {
+ (*block) (ivec, ivec, key);
+ while (len--) {
+ out[n] = in[n] ^ ivec[n];
+ ++n;
+ }
+ }
+ *num = n;
+ return;
+ } while (0);
+ }
+    /* the rest is commonly eliminated by an x86* compiler */
+#endif
+ while (l < len) {
+ if (n == 0) {
+ (*block) (ivec, ivec, key);
+ }
+ out[l] = in[l] ^ ivec[n];
+ ++l;
+ n = (n + 1) % 16;
+ }
+
+ *num = n;
+}
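
Since OFB only XORs a generated key stream into the data, the same call both encrypts and decrypts, and only the cipher's encrypt schedule is ever needed. A hypothetical AES invocation, assuming <openssl/aes.h> and <string.h>, with key, iv, pt, ct and len supplied by the caller (reset ivec and num before decrypting, as both are updated in place):

    AES_KEY ks;
    unsigned char ivec[16];    /* per-message IV; never reuse under one key */
    int num = 0;

    AES_set_encrypt_key(key, 128, &ks);
    memcpy(ivec, iv, 16);
    CRYPTO_ofb128_encrypt(pt, ct, len, &ks, ivec, &num, (block128_f)AES_encrypt);
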
diff --git a/openssl-1.1.0h/crypto/modes/wrap128.c b/openssl-1.1.0h/crypto/modes/wrap128.c
new file mode 100644
index 0000000..46809a0
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/wrap128.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/** Beware!
+ *
+ * The following wrapping modes were designed for AES, but this implementation
+ * allows you to use them with any 128-bit block cipher.
+ */
+
+#include "internal/cryptlib.h"
+#include <openssl/modes.h>
+
+/** RFC 3394 section 2.2.3.1 Default Initial Value */
+static const unsigned char default_iv[] = {
+ 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
+};
+
+/** RFC 5649 section 3 Alternative Initial Value 32-bit constant */
+static const unsigned char default_aiv[] = {
+ 0xA6, 0x59, 0x59, 0xA6
+};
+
+/** Input size limit: lower than the standards' maximum, but far larger than
+ * anything that will be used in practice.
+ */
+#define CRYPTO128_WRAP_MAX (1UL << 31)
+
+/** Wrapping according to RFC 3394 section 2.2.1.
+ *
+ * @param[in] key Key value.
+ * @param[in] iv IV value. Length = 8 bytes. NULL = use default_iv.
+ * @param[in] in Plaintext as n 64-bit blocks, n >= 2.
+ * @param[in] inlen Length of in.
+ * @param[out] out Ciphertext. Minimal buffer length = (inlen + 8) bytes.
+ * Input and output buffers can overlap if block function
+ * supports that.
+ * @param[in] block Block processing function.
+ * @return 0 if inlen does not consist of n 64-bit blocks, n >= 2,
+ * or if inlen > CRYPTO128_WRAP_MAX;
+ * output length if wrapping succeeded.
+ */
+size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block)
+{
+ unsigned char *A, B[16], *R;
+ size_t i, j, t;
+ if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
+ return 0;
+ A = B;
+ t = 1;
+ memmove(out + 8, in, inlen);
+ if (!iv)
+ iv = default_iv;
+
+ memcpy(A, iv, 8);
+
+ for (j = 0; j < 6; j++) {
+ R = out + 8;
+ for (i = 0; i < inlen; i += 8, t++, R += 8) {
+ memcpy(B + 8, R, 8);
+ block(B, B, key);
+ A[7] ^= (unsigned char)(t & 0xff);
+ if (t > 0xff) {
+ A[6] ^= (unsigned char)((t >> 8) & 0xff);
+ A[5] ^= (unsigned char)((t >> 16) & 0xff);
+ A[4] ^= (unsigned char)((t >> 24) & 0xff);
+ }
+ memcpy(R, B + 8, 8);
+ }
+ }
+ memcpy(out, A, 8);
+ return inlen + 8;
+}
+
+/** Unwrapping according to RFC 3394 section 2.2.2 steps 1-2.
+ * The IV check (step 3) is the responsibility of the caller.
+ *
+ * @param[in] key Key value.
+ * @param[out] iv Unchecked IV value. Minimal buffer length = 8 bytes.
+ * @param[out] out Plaintext without IV.
+ * Minimal buffer length = (inlen - 8) bytes.
+ * Input and output buffers can overlap if block function
+ * supports that.
+ * @param[in] in Ciphertext as n 64-bit blocks.
+ * @param[in] inlen Length of in.
+ * @param[in] block Block processing function.
+ * @return 0 if inlen is out of range [24, CRYPTO128_WRAP_MAX]
+ * or if inlen is not a multiple of 8;
+ * output length otherwise.
+ */
+static size_t crypto_128_unwrap_raw(void *key, unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block)
+{
+ unsigned char *A, B[16], *R;
+ size_t i, j, t;
+ inlen -= 8;
+ if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
+ return 0;
+ A = B;
+ t = 6 * (inlen >> 3);
+ memcpy(A, in, 8);
+ memmove(out, in + 8, inlen);
+ for (j = 0; j < 6; j++) {
+ R = out + inlen - 8;
+ for (i = 0; i < inlen; i += 8, t--, R -= 8) {
+ A[7] ^= (unsigned char)(t & 0xff);
+ if (t > 0xff) {
+ A[6] ^= (unsigned char)((t >> 8) & 0xff);
+ A[5] ^= (unsigned char)((t >> 16) & 0xff);
+ A[4] ^= (unsigned char)((t >> 24) & 0xff);
+ }
+ memcpy(B + 8, R, 8);
+ block(B, B, key);
+ memcpy(R, B + 8, 8);
+ }
+ }
+ memcpy(iv, A, 8);
+ return inlen;
+}
+
+/** Unwrapping according to RFC 3394 section 2.2.2, including the IV check.
+ * The first block of plaintext has to match the supplied IV, otherwise an
+ * error is returned.
+ *
+ * @param[in] key Key value.
+ * @param[out] iv IV value to match against. Length = 8 bytes.
+ * NULL = use default_iv.
+ * @param[out] out Plaintext without IV.
+ * Minimal buffer length = (inlen - 8) bytes.
+ * Input and output buffers can overlap if block function
+ * supports that.
+ * @param[in] in Ciphertext as n 64-bit blocks.
+ * @param[in] inlen Length of in.
+ * @param[in] block Block processing function.
+ * @return 0 if inlen is out of range [24, CRYPTO128_WRAP_MAX]
+ * or if inlen is not a multiple of 8
+ * or if the IV doesn't match the expected value;
+ * output length otherwise.
+ */
+size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
+ unsigned char *out, const unsigned char *in,
+ size_t inlen, block128_f block)
+{
+ size_t ret;
+ unsigned char got_iv[8];
+
+ ret = crypto_128_unwrap_raw(key, got_iv, out, in, inlen, block);
+ if (ret == 0)
+ return 0;
+
+ if (!iv)
+ iv = default_iv;
+ if (CRYPTO_memcmp(got_iv, iv, 8)) {
+ OPENSSL_cleanse(out, ret);
+ return 0;
+ }
+ return ret;
+}
+
+/** Wrapping according to RFC 5649 section 4.1.
+ *
+ * @param[in] key Key value.
+ * @param[in] icv (Non-standard) IV, 4 bytes. NULL = use default_aiv.
+ * @param[out] out Ciphertext. Minimal buffer length = (inlen + 15) bytes.
+ * Input and output buffers can overlap if block function
+ * supports that.
+ * @param[in] in Plaintext, any length in range [1, CRYPTO128_WRAP_MAX].
+ * @param[in] inlen Length of in.
+ * @param[in] block Block processing function.
+ * @return 0 if inlen is out of range [1, CRYPTO128_WRAP_MAX];
+ * output length if wrapping succeeded.
+ */
+size_t CRYPTO_128_wrap_pad(void *key, const unsigned char *icv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block)
+{
+ /* n: number of 64-bit blocks in the padded key data
+ *
+     * If the length of the plaintext is not a multiple of 8, pad it on the
+     * right with zero octets up to the smallest multiple of 8 greater than
+     * the plaintext length. If the length is already a multiple of 8, there
+     * is no padding. */
+ const size_t blocks_padded = (inlen + 7) / 8; /* CEILING(m/8) */
+ const size_t padded_len = blocks_padded * 8;
+ const size_t padding_len = padded_len - inlen;
+ /* RFC 5649 section 3: Alternative Initial Value */
+ unsigned char aiv[8];
+ int ret;
+
+ /* Section 1: use 32-bit fixed field for plaintext octet length */
+ if (inlen == 0 || inlen >= CRYPTO128_WRAP_MAX)
+ return 0;
+
+ /* Section 3: Alternative Initial Value */
+ if (!icv)
+ memcpy(aiv, default_aiv, 4);
+ else
+ memcpy(aiv, icv, 4); /* Standard doesn't mention this. */
+
+ aiv[4] = (inlen >> 24) & 0xFF;
+ aiv[5] = (inlen >> 16) & 0xFF;
+ aiv[6] = (inlen >> 8) & 0xFF;
+ aiv[7] = inlen & 0xFF;
+
+ if (padded_len == 8) {
+ /*
+ * Section 4.1 - special case in step 2: If the padded plaintext
+ * contains exactly eight octets, then prepend the AIV and encrypt
+ * the resulting 128-bit block using AES in ECB mode.
+ */
+ memmove(out + 8, in, inlen);
+ memcpy(out, aiv, 8);
+ memset(out + 8 + inlen, 0, padding_len);
+ block(out, out, key);
+ ret = 16; /* AIV + padded input */
+ } else {
+ memmove(out, in, inlen);
+ memset(out + inlen, 0, padding_len); /* Section 4.1 step 1 */
+ ret = CRYPTO_128_wrap(key, aiv, out, out, padded_len, block);
+ }
+
+ return ret;
+}
+
+/** Unwrapping according to RFC 5649 section 4.2.
+ *
+ * @param[in] key Key value.
+ * @param[in] icv (Non-standard) IV, 4 bytes. NULL = use default_aiv.
+ * @param[out] out Plaintext. Minimal buffer length = inlen bytes.
+ * Input and output buffers can overlap if block function
+ * supports that.
+ * @param[in] in Ciphertext as n 64-bit blocks.
+ * @param[in] inlen Length of in.
+ * @param[in] block Block processing function.
+ * @return 0 if inlen is out of range [16, CRYPTO128_WRAP_MAX],
+ * or if inlen is not a multiple of 8
+ * or if the IV and message length indicator don't match;
+ * output length if unwrapping succeeded and the IV matches.
+ */
+size_t CRYPTO_128_unwrap_pad(void *key, const unsigned char *icv,
+ unsigned char *out,
+ const unsigned char *in, size_t inlen,
+ block128_f block)
+{
+ /* n: number of 64-bit blocks in the padded key data */
+ size_t n = inlen / 8 - 1;
+ size_t padded_len;
+ size_t padding_len;
+ size_t ptext_len;
+ /* RFC 5649 section 3: Alternative Initial Value */
+ unsigned char aiv[8];
+ static unsigned char zeros[8] = { 0x0 };
+ size_t ret;
+
+ /* Section 4.2: Ciphertext length has to be (n+1) 64-bit blocks. */
+ if ((inlen & 0x7) != 0 || inlen < 16 || inlen >= CRYPTO128_WRAP_MAX)
+ return 0;
+
+ memmove(out, in, inlen);
+ if (inlen == 16) {
+ /*
+ * Section 4.2 - special case in step 1: When n=1, the ciphertext
+ * contains exactly two 64-bit blocks and they are decrypted as a
+ * single AES block using AES in ECB mode: AIV | P[1] = DEC(K, C[0] |
+ * C[1])
+ */
+ block(out, out, key);
+ memcpy(aiv, out, 8);
+ /* Remove AIV */
+ memmove(out, out + 8, 8);
+ padded_len = 8;
+ } else {
+ padded_len = inlen - 8;
+ ret = crypto_128_unwrap_raw(key, aiv, out, out, inlen, block);
+ if (padded_len != ret) {
+ OPENSSL_cleanse(out, inlen);
+ return 0;
+ }
+ }
+
+ /*
+ * Section 3: AIV checks: Check that MSB(32,A) = A65959A6. Optionally a
+ * user-supplied value can be used (even if standard doesn't mention
+ * this).
+ */
+ if ((!icv && CRYPTO_memcmp(aiv, default_aiv, 4))
+ || (icv && CRYPTO_memcmp(aiv, icv, 4))) {
+ OPENSSL_cleanse(out, inlen);
+ return 0;
+ }
+
+ /*
+ * Check that 8*(n-1) < LSB(32,AIV) <= 8*n. If so, let ptext_len =
+ * LSB(32,AIV).
+ */
+
+ ptext_len = ((unsigned int)aiv[4] << 24)
+ | ((unsigned int)aiv[5] << 16)
+ | ((unsigned int)aiv[6] << 8)
+ | (unsigned int)aiv[7];
+ if (8 * (n - 1) >= ptext_len || ptext_len > 8 * n) {
+ OPENSSL_cleanse(out, inlen);
+ return 0;
+ }
+
+ /*
+ * Check that the rightmost padding_len octets of the output data are
+ * zero.
+ */
+ padding_len = padded_len - ptext_len;
+ if (CRYPTO_memcmp(out + ptext_len, zeros, padding_len) != 0) {
+ OPENSSL_cleanse(out, inlen);
+ return 0;
+ }
+
+ /* Section 4.2 step 3: Remove padding */
+ return ptext_len;
+}
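
A hypothetical RFC 3394 round trip with a 128-bit AES KEK (kek and cek supplied by the caller); note the unwrap side runs the cipher in its decryption direction:

    AES_KEY wk, uk;
    unsigned char cek[16];             /* content key to be wrapped */
    unsigned char wrapped[16 + 8];     /* output is inlen + 8 bytes */
    unsigned char recovered[16];
    size_t n;

    AES_set_encrypt_key(kek, 128, &wk);
    n = CRYPTO_128_wrap(&wk, NULL, wrapped, cek, sizeof(cek),
                        (block128_f)AES_encrypt);      /* n == 24 */

    AES_set_decrypt_key(kek, 128, &uk);
    n = CRYPTO_128_unwrap(&uk, NULL, recovered, wrapped, 24,
                          (block128_f)AES_decrypt);    /* n == 16; 0 on bad IV */
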
diff --git a/openssl-1.1.0h/crypto/modes/xts128.c b/openssl-1.1.0h/crypto/modes/xts128.c
new file mode 100644
index 0000000..81b1eac
--- /dev/null
+++ b/openssl-1.1.0h/crypto/modes/xts128.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
+ const unsigned char iv[16],
+ const unsigned char *inp, unsigned char *out,
+ size_t len, int enc)
+{
+ const union {
+ long one;
+ char little;
+ } is_endian = {
+ 1
+ };
+ union {
+ u64 u[2];
+ u32 d[4];
+ u8 c[16];
+ } tweak, scratch;
+ unsigned int i;
+
+ if (len < 16)
+ return -1;
+
+ memcpy(tweak.c, iv, 16);
+
+ (*ctx->block2) (tweak.c, tweak.c, ctx->key2);
+
+ if (!enc && (len % 16))
+ len -= 16;
+
+ while (len >= 16) {
+#if defined(STRICT_ALIGNMENT)
+ memcpy(scratch.c, inp, 16);
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+#else
+ scratch.u[0] = ((u64 *)inp)[0] ^ tweak.u[0];
+ scratch.u[1] = ((u64 *)inp)[1] ^ tweak.u[1];
+#endif
+ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy(out, scratch.c, 16);
+#else
+ ((u64 *)out)[0] = scratch.u[0] ^= tweak.u[0];
+ ((u64 *)out)[1] = scratch.u[1] ^= tweak.u[1];
+#endif
+ inp += 16;
+ out += 16;
+ len -= 16;
+
+ if (len == 0)
+ return 0;
+
+ if (is_endian.little) {
+ unsigned int carry, res;
+
+ res = 0x87 & (((int)tweak.d[3]) >> 31);
+ carry = (unsigned int)(tweak.u[0] >> 63);
+ tweak.u[0] = (tweak.u[0] << 1) ^ res;
+ tweak.u[1] = (tweak.u[1] << 1) | carry;
+ } else {
+ size_t c;
+
+ for (c = 0, i = 0; i < 16; ++i) {
+ /*
+ * + substitutes for |, because c is 1 bit
+ */
+ c += ((size_t)tweak.c[i]) << 1;
+ tweak.c[i] = (u8)c;
+ c = c >> 8;
+ }
+ tweak.c[0] ^= (u8)(0x87 & (0 - c));
+ }
+ }
+ if (enc) {
+ for (i = 0; i < len; ++i) {
+ u8 c = inp[i];
+ out[i] = scratch.c[i];
+ scratch.c[i] = c;
+ }
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy(out - 16, scratch.c, 16);
+ } else {
+ union {
+ u64 u[2];
+ u8 c[16];
+ } tweak1;
+
+ if (is_endian.little) {
+ unsigned int carry, res;
+
+ res = 0x87 & (((int)tweak.d[3]) >> 31);
+ carry = (unsigned int)(tweak.u[0] >> 63);
+ tweak1.u[0] = (tweak.u[0] << 1) ^ res;
+ tweak1.u[1] = (tweak.u[1] << 1) | carry;
+ } else {
+ size_t c;
+
+ for (c = 0, i = 0; i < 16; ++i) {
+ /*
+ * + substitutes for |, because c is 1 bit
+ */
+ c += ((size_t)tweak.c[i]) << 1;
+ tweak1.c[i] = (u8)c;
+ c = c >> 8;
+ }
+ tweak1.c[0] ^= (u8)(0x87 & (0 - c));
+ }
+#if defined(STRICT_ALIGNMENT)
+ memcpy(scratch.c, inp, 16);
+ scratch.u[0] ^= tweak1.u[0];
+ scratch.u[1] ^= tweak1.u[1];
+#else
+ scratch.u[0] = ((u64 *)inp)[0] ^ tweak1.u[0];
+ scratch.u[1] = ((u64 *)inp)[1] ^ tweak1.u[1];
+#endif
+ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
+ scratch.u[0] ^= tweak1.u[0];
+ scratch.u[1] ^= tweak1.u[1];
+
+ for (i = 0; i < len; ++i) {
+ u8 c = inp[16 + i];
+ out[16 + i] = scratch.c[i];
+ scratch.c[i] = c;
+ }
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy(out, scratch.c, 16);
+#else
+ ((u64 *)out)[0] = scratch.u[0] ^ tweak.u[0];
+ ((u64 *)out)[1] = scratch.u[1] ^ tweak.u[1];
+#endif
+ }
+
+ return 0;
+}
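
A hypothetical AES-128-XTS setup, assuming <openssl/aes.h> and a caller-supplied 32-byte key, iv, in, out and len (len must be at least 16): key1 processes the data blocks, so its schedule must match enc, while key2 always encrypts the tweak regardless of direction:

    XTS128_CONTEXT xctx;
    AES_KEY k1, k2;

    AES_set_encrypt_key(key,      128, &k1);   /* data key, enc == 1 */
    AES_set_encrypt_key(key + 16, 128, &k2);   /* tweak key */
    xctx.key1 = &k1;
    xctx.key2 = &k2;
    xctx.block1 = (block128_f)AES_encrypt;
    xctx.block2 = (block128_f)AES_encrypt;     /* tweak is always encrypted */
    CRYPTO_xts128_encrypt(&xctx, iv, in, out, len, 1);
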