From aa4d426b4d3527d7e166df1a05058c9a4a0f6683 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Fri, 30 Apr 2021 00:33:56 +0200 Subject: initial/final commit --- .../crypto/modes/asm/aesni-gcm-x86_64.pl | 1106 ++++++++++ openssl-1.1.0h/crypto/modes/asm/ghash-alpha.pl | 467 ++++ openssl-1.1.0h/crypto/modes/asm/ghash-armv4.pl | 554 +++++ openssl-1.1.0h/crypto/modes/asm/ghash-c64xplus.pl | 247 +++ openssl-1.1.0h/crypto/modes/asm/ghash-ia64.pl | 470 ++++ openssl-1.1.0h/crypto/modes/asm/ghash-parisc.pl | 738 +++++++ openssl-1.1.0h/crypto/modes/asm/ghash-s390x.pl | 258 +++ openssl-1.1.0h/crypto/modes/asm/ghash-sparcv9.pl | 581 +++++ openssl-1.1.0h/crypto/modes/asm/ghash-x86.pl | 1405 ++++++++++++ openssl-1.1.0h/crypto/modes/asm/ghash-x86_64.pl | 1762 +++++++++++++++ openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl | 670 ++++++ openssl-1.1.0h/crypto/modes/asm/ghashv8-armx.pl | 430 ++++ openssl-1.1.0h/crypto/modes/build.info | 27 + openssl-1.1.0h/crypto/modes/cbc128.c | 161 ++ openssl-1.1.0h/crypto/modes/ccm128.c | 432 ++++ openssl-1.1.0h/crypto/modes/cfb128.c | 198 ++ openssl-1.1.0h/crypto/modes/ctr128.c | 209 ++ openssl-1.1.0h/crypto/modes/cts128.c | 523 +++++ openssl-1.1.0h/crypto/modes/gcm128.c | 2301 ++++++++++++++++++++ openssl-1.1.0h/crypto/modes/modes_lcl.h | 185 ++ openssl-1.1.0h/crypto/modes/ocb128.c | 568 +++++ openssl-1.1.0h/crypto/modes/ofb128.c | 74 + openssl-1.1.0h/crypto/modes/wrap128.c | 329 +++ openssl-1.1.0h/crypto/modes/xts128.c | 157 ++ 24 files changed, 13852 insertions(+) create mode 100644 openssl-1.1.0h/crypto/modes/asm/aesni-gcm-x86_64.pl create mode 100644 openssl-1.1.0h/crypto/modes/asm/ghash-alpha.pl create mode 100644 openssl-1.1.0h/crypto/modes/asm/ghash-armv4.pl create mode 100644 openssl-1.1.0h/crypto/modes/asm/ghash-c64xplus.pl create mode 100755 openssl-1.1.0h/crypto/modes/asm/ghash-ia64.pl create mode 100644 openssl-1.1.0h/crypto/modes/asm/ghash-parisc.pl create mode 100644 openssl-1.1.0h/crypto/modes/asm/ghash-s390x.pl create mode 100644 openssl-1.1.0h/crypto/modes/asm/ghash-sparcv9.pl create mode 100644 openssl-1.1.0h/crypto/modes/asm/ghash-x86.pl create mode 100644 openssl-1.1.0h/crypto/modes/asm/ghash-x86_64.pl create mode 100755 openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl create mode 100644 openssl-1.1.0h/crypto/modes/asm/ghashv8-armx.pl create mode 100644 openssl-1.1.0h/crypto/modes/build.info create mode 100644 openssl-1.1.0h/crypto/modes/cbc128.c create mode 100644 openssl-1.1.0h/crypto/modes/ccm128.c create mode 100644 openssl-1.1.0h/crypto/modes/cfb128.c create mode 100644 openssl-1.1.0h/crypto/modes/ctr128.c create mode 100644 openssl-1.1.0h/crypto/modes/cts128.c create mode 100644 openssl-1.1.0h/crypto/modes/gcm128.c create mode 100644 openssl-1.1.0h/crypto/modes/modes_lcl.h create mode 100644 openssl-1.1.0h/crypto/modes/ocb128.c create mode 100644 openssl-1.1.0h/crypto/modes/ofb128.c create mode 100644 openssl-1.1.0h/crypto/modes/wrap128.c create mode 100644 openssl-1.1.0h/crypto/modes/xts128.c (limited to 'openssl-1.1.0h/crypto/modes') diff --git a/openssl-1.1.0h/crypto/modes/asm/aesni-gcm-x86_64.pl b/openssl-1.1.0h/crypto/modes/asm/aesni-gcm-x86_64.pl new file mode 100644 index 0000000..5ad62b3 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/aesni-gcm-x86_64.pl @@ -0,0 +1,1106 @@ +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# +# AES-NI-CTR+GHASH stitch. +# +# February 2013 +# +# OpenSSL GCM implementation is organized in such way that its +# performance is rather close to the sum of its streamed components, +# in the context parallelized AES-NI CTR and modulo-scheduled +# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation +# was observed to perform significantly better than the sum of the +# components on contemporary CPUs, the effort was deemed impossible to +# justify. This module is based on combination of Intel submissions, +# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max +# Locktyukhin of Intel Corp. who verified that it reduces shuffles +# pressure with notable relative improvement, achieving 1.0 cycle per +# byte processed with 128-bit key on Haswell processor, 0.74 - on +# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled +# measurements for favourable packet size, one divisible by 96. +# Applications using the EVP interface will observe a few percent +# worse performance.] +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest +# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.20) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +if ($avx>1) {{{ + +($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); + +($Ii,$T1,$T2,$Hkey, + $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); + +($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); + +($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15"); + +$code=<<___; +.text + +.type _aesni_ctr32_ghash_6x,\@abi-omnipotent +.align 32 +_aesni_ctr32_ghash_6x: + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + sub \$6,$len + vpxor $Z0,$Z0,$Z0 # $Z0 = 0 + vmovdqu 0x00-0x80($key),$rndkey + vpaddb $T2,$T1,$inout1 + vpaddb $T2,$inout1,$inout2 + vpaddb $T2,$inout2,$inout3 + vpaddb $T2,$inout3,$inout4 + vpaddb $T2,$inout4,$inout5 + vpxor $rndkey,$T1,$inout0 + 
vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0 + jmp .Loop6x + +.align 32 +.Loop6x: + add \$`6<<24`,$counter + jc .Lhandle_ctr32 # discard $inout[1-5]? + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpaddb $T2,$inout5,$T1 # next counter value + vpxor $rndkey,$inout1,$inout1 + vpxor $rndkey,$inout2,$inout2 + +.Lresume_ctr32: + vmovdqu $T1,($ivp) # save next counter value + vpclmulqdq \$0x10,$Hkey,$Z3,$Z1 + vpxor $rndkey,$inout3,$inout3 + vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey + vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 + + # At this point, the current block of 96 (0x60) bytes has already been + # loaded into registers. Concurrently with processing it, we want to + # load the next 96 bytes of input for the next round. Obviously, we can + # only do this if there are at least 96 more bytes of input beyond the + # input we're currently processing, or else we'd read past the end of + # the input buffer. Here, we set |%r12| to 96 if there are at least 96 + # bytes of input beyond the 96 bytes we're already processing, and we + # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96, + # we'll read in the next block so that it is in registers for the next + # loop iteration. In the case where we set |%r12| to 0, we'll re-read + # the current block and then ignore what we re-read. + # + # At this point, |$in0| points to the current (already read into + # registers) block, and |$end0| points to 2*96 bytes before the end of + # the input. Thus, |$in0| > |$end0| means that we do not have the next + # 96-byte block to read in, and |$in0| <= |$end0| means we do. + xor %r12,%r12 + cmp $in0,$end0 + + vaesenc $T2,$inout0,$inout0 + vmovdqu 0x30+8(%rsp),$Ii # I[4] + vpxor $rndkey,$inout4,$inout4 + vpclmulqdq \$0x00,$Hkey,$Z3,$T1 + vaesenc $T2,$inout1,$inout1 + vpxor $rndkey,$inout5,$inout5 + setnc %r12b + vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 + vaesenc $T2,$inout2,$inout2 + vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2 + neg %r12 + vaesenc $T2,$inout3,$inout3 + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x00,$Hkey,$Ii,$Z1 + vpxor $Z0,$Xi,$Xi # modulo-scheduled + vaesenc $T2,$inout4,$inout4 + vpxor $Z1,$T1,$Z0 + and \$0x60,%r12 + vmovups 0x20-0x80($key),$rndkey + vpclmulqdq \$0x10,$Hkey,$Ii,$T1 + vaesenc $T2,$inout5,$inout5 + + vpclmulqdq \$0x01,$Hkey,$Ii,$T2 + lea ($in0,%r12),$in0 + vaesenc $rndkey,$inout0,$inout0 + vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi] + vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey + vmovdqu 0x40+8(%rsp),$Ii # I[3] + vaesenc $rndkey,$inout1,$inout1 + movbe 0x58($in0),%r13 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x50($in0),%r12 + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x20+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x28+8(%rsp) + vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x30-0x80($key),$rndkey + vpxor $T1,$Z2,$Z2 + vpclmulqdq \$0x00,$Z1,$Ii,$T1 + vaesenc $rndkey,$inout0,$inout0 + vpxor $T2,$Z2,$Z2 + vpclmulqdq \$0x10,$Z1,$Ii,$T2 + vaesenc $rndkey,$inout1,$inout1 + vpxor $Hkey,$Z3,$Z3 + vpclmulqdq \$0x01,$Z1,$Ii,$Hkey + vaesenc $rndkey,$inout2,$inout2 + vpclmulqdq \$0x11,$Z1,$Ii,$Z1 + vmovdqu 0x50+8(%rsp),$Ii # I[2] + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vpxor $T1,$Z0,$Z0 + vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x40-0x80($key),$rndkey + vpxor $T2,$Z2,$Z2 + vpclmulqdq \$0x00,$T1,$Ii,$T2 + vaesenc $rndkey,$inout0,$inout0 + vpxor $Hkey,$Z2,$Z2 + vpclmulqdq \$0x10,$T1,$Ii,$Hkey + vaesenc $rndkey,$inout1,$inout1 + movbe 0x48($in0),%r13 + 
vpxor $Z1,$Z3,$Z3 + vpclmulqdq \$0x01,$T1,$Ii,$Z1 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x40($in0),%r12 + vpclmulqdq \$0x11,$T1,$Ii,$T1 + vmovdqu 0x60+8(%rsp),$Ii # I[1] + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x30+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x38+8(%rsp) + vpxor $T2,$Z0,$Z0 + vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x50-0x80($key),$rndkey + vpxor $Hkey,$Z2,$Z2 + vpclmulqdq \$0x00,$T2,$Ii,$Hkey + vaesenc $rndkey,$inout0,$inout0 + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x10,$T2,$Ii,$Z1 + vaesenc $rndkey,$inout1,$inout1 + movbe 0x38($in0),%r13 + vpxor $T1,$Z3,$Z3 + vpclmulqdq \$0x01,$T2,$Ii,$T1 + vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0] + vaesenc $rndkey,$inout2,$inout2 + movbe 0x30($in0),%r12 + vpclmulqdq \$0x11,$T2,$Ii,$T2 + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x40+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x48+8(%rsp) + vpxor $Hkey,$Z0,$Z0 + vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x60-0x80($key),$rndkey + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x10,$Hkey,$Xi,$Z1 + vaesenc $rndkey,$inout0,$inout0 + vpxor $T1,$Z2,$Z2 + vpclmulqdq \$0x01,$Hkey,$Xi,$T1 + vaesenc $rndkey,$inout1,$inout1 + movbe 0x28($in0),%r13 + vpxor $T2,$Z3,$Z3 + vpclmulqdq \$0x00,$Hkey,$Xi,$T2 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x20($in0),%r12 + vpclmulqdq \$0x11,$Hkey,$Xi,$Xi + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x50+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x58+8(%rsp) + vpxor $Z1,$Z2,$Z2 + vaesenc $rndkey,$inout5,$inout5 + vpxor $T1,$Z2,$Z2 + + vmovups 0x70-0x80($key),$rndkey + vpslldq \$8,$Z2,$Z1 + vpxor $T2,$Z0,$Z0 + vmovdqu 0x10($const),$Hkey # .Lpoly + + vaesenc $rndkey,$inout0,$inout0 + vpxor $Xi,$Z3,$Z3 + vaesenc $rndkey,$inout1,$inout1 + vpxor $Z1,$Z0,$Z0 + movbe 0x18($in0),%r13 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x10($in0),%r12 + vpalignr \$8,$Z0,$Z0,$Ii # 1st phase + vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 + mov %r13,0x60+8(%rsp) + vaesenc $rndkey,$inout3,$inout3 + mov %r12,0x68+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vmovups 0x90-0x80($key),$rndkey + vaesenc $T1,$inout1,$inout1 + vpsrldq \$8,$Z2,$Z2 + vaesenc $T1,$inout2,$inout2 + vpxor $Z2,$Z3,$Z3 + vaesenc $T1,$inout3,$inout3 + vpxor $Ii,$Z0,$Z0 + movbe 0x08($in0),%r13 + vaesenc $T1,$inout4,$inout4 + movbe 0x00($in0),%r12 + vaesenc $T1,$inout5,$inout5 + vmovups 0xa0-0x80($key),$T1 + cmp \$11,$rounds + jb .Lenc_tail # 128-bit key + + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vaesenc $T1,$inout1,$inout1 + vaesenc $T1,$inout2,$inout2 + vaesenc $T1,$inout3,$inout3 + vaesenc $T1,$inout4,$inout4 + vmovups 0xb0-0x80($key),$rndkey + vaesenc $T1,$inout5,$inout5 + vmovups 0xc0-0x80($key),$T1 + je .Lenc_tail # 192-bit key + + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vaesenc $T1,$inout1,$inout1 + vaesenc $T1,$inout2,$inout2 + vaesenc $T1,$inout3,$inout3 + vaesenc $T1,$inout4,$inout4 + vmovups 0xd0-0x80($key),$rndkey + vaesenc $T1,$inout5,$inout5 + vmovups 0xe0-0x80($key),$T1 + jmp .Lenc_tail # 
256-bit key + +.align 32 +.Lhandle_ctr32: + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + vpshufb $Ii,$T1,$Z2 # byte-swap counter + vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb + vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb + vpaddd $Z1,$Z2,$inout2 + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpaddd $Z1,$inout1,$inout3 + vpshufb $Ii,$inout1,$inout1 + vpaddd $Z1,$inout2,$inout4 + vpshufb $Ii,$inout2,$inout2 + vpxor $rndkey,$inout1,$inout1 + vpaddd $Z1,$inout3,$inout5 + vpshufb $Ii,$inout3,$inout3 + vpxor $rndkey,$inout2,$inout2 + vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value + vpshufb $Ii,$inout4,$inout4 + vpshufb $Ii,$inout5,$inout5 + vpshufb $Ii,$T1,$T1 # next counter value + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc $rndkey,$inout0,$inout0 + vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi + vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase + vaesenc $rndkey,$inout1,$inout1 + vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 + vpxor 0x00($inp),$T1,$T2 + vaesenc $rndkey,$inout2,$inout2 + vpxor 0x10($inp),$T1,$Ii + vaesenc $rndkey,$inout3,$inout3 + vpxor 0x20($inp),$T1,$Z1 + vaesenc $rndkey,$inout4,$inout4 + vpxor 0x30($inp),$T1,$Z2 + vaesenc $rndkey,$inout5,$inout5 + vpxor 0x40($inp),$T1,$Z3 + vpxor 0x50($inp),$T1,$Hkey + vmovdqu ($ivp),$T1 # load next counter value + + vaesenclast $T2,$inout0,$inout0 + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + vaesenclast $Ii,$inout1,$inout1 + vpaddb $T2,$T1,$Ii + mov %r13,0x70+8(%rsp) + lea 0x60($inp),$inp + vaesenclast $Z1,$inout2,$inout2 + vpaddb $T2,$Ii,$Z1 + mov %r12,0x78+8(%rsp) + lea 0x60($out),$out + vmovdqu 0x00-0x80($key),$rndkey + vaesenclast $Z2,$inout3,$inout3 + vpaddb $T2,$Z1,$Z2 + vaesenclast $Z3, $inout4,$inout4 + vpaddb $T2,$Z2,$Z3 + vaesenclast $Hkey,$inout5,$inout5 + vpaddb $T2,$Z3,$Hkey + + add \$0x60,$ret + sub \$0x6,$len + jc .L6x_done + + vmovups $inout0,-0x60($out) # save output + vpxor $rndkey,$T1,$inout0 + vmovups $inout1,-0x50($out) + vmovdqa $Ii,$inout1 # 0 latency + vmovups $inout2,-0x40($out) + vmovdqa $Z1,$inout2 # 0 latency + vmovups $inout3,-0x30($out) + vmovdqa $Z2,$inout3 # 0 latency + vmovups $inout4,-0x20($out) + vmovdqa $Z3,$inout4 # 0 latency + vmovups $inout5,-0x10($out) + vmovdqa $Hkey,$inout5 # 0 latency + vmovdqu 0x20+8(%rsp),$Z3 # I[5] + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled + vpxor $Z0,$Xi,$Xi # modulo-scheduled + + ret +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +___ +###################################################################### +# +# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len, +# const AES_KEY *key, unsigned char iv[16], +# struct { u128 Xi,H,Htbl[9]; } *Xip); +$code.=<<___; +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,\@function,6 +.align 32 +aesni_gcm_decrypt: + xor $ret,$ret + + # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60) + # bytes of input. 
+ cmp \$0x60,$len # minimal accepted length + jb .Lgcm_dec_abort + + lea (%rsp),%rax # save stack pointer + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,-0xd8(%rax) + movaps %xmm7,-0xc8(%rax) + movaps %xmm8,-0xb8(%rax) + movaps %xmm9,-0xa8(%rax) + movaps %xmm10,-0x98(%rax) + movaps %xmm11,-0x88(%rax) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +.Lgcm_dec_body: +___ +$code.=<<___; + vzeroupper + + vmovdqu ($ivp),$T1 # input counter value + add \$-128,%rsp + mov 12($ivp),$counter + lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 + vmovdqu ($Xip),$Xi # load Xi + and \$-128,%rsp # ensure stack alignment + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + lea 0x80($key),$key # size optimization + lea 0x20+0x20($Xip),$Xip # size optimization + mov 0xf0-0x80($key),$rounds + vpshufb $Ii,$Xi,$Xi + + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Ldec_no_key_aliasing + cmp \$768,$end0 + jnc .Ldec_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Ldec_no_key_aliasing: + + vmovdqu 0x50($inp),$Z3 # I[5] + lea ($inp),$in0 + vmovdqu 0x40($inp),$Z0 + + # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) + # bytes before the end of the input. Note, in particular, that this is + # correct even if |$len| is not an even multiple of 96 or 16. XXX: This + # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must + # not be near the very beginning of the address space when |$len| < 2*96 + # (0xc0). + lea -0xc0($inp,$len),$end0 + + vmovdqu 0x30($inp),$Z1 + shr \$4,$len + xor $ret,$ret + vmovdqu 0x20($inp),$Z2 + vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x + vmovdqu 0x10($inp),$T2 + vpshufb $Ii,$Z0,$Z0 + vmovdqu ($inp),$Hkey + vpshufb $Ii,$Z1,$Z1 + vmovdqu $Z0,0x30(%rsp) + vpshufb $Ii,$Z2,$Z2 + vmovdqu $Z1,0x40(%rsp) + vpshufb $Ii,$T2,$T2 + vmovdqu $Z2,0x50(%rsp) + vpshufb $Ii,$Hkey,$Hkey + vmovdqu $T2,0x60(%rsp) + vmovdqu $Hkey,0x70(%rsp) + + call _aesni_ctr32_ghash_6x + + vmovups $inout0,-0x60($out) # save output + vmovups $inout1,-0x50($out) + vmovups $inout2,-0x40($out) + vmovups $inout3,-0x30($out) + vmovups $inout4,-0x20($out) + vmovups $inout5,-0x10($out) + + vpshufb ($const),$Xi,$Xi # .Lbswap_mask + vmovdqu $Xi,-0x40($Xip) # output Xi + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp # restore %rsp +.Lgcm_dec_abort: + mov $ret,%rax # return value + ret +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ + +$code.=<<___; +.type _aesni_ctr32_6x,\@abi-omnipotent +.align 32 +_aesni_ctr32_6x: + vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + lea -1($rounds),%r13 + vmovups 0x10-0x80($key),$rndkey + lea 0x20-0x80($key),%r12 + vpxor $Z0,$T1,$inout0 + add \$`6<<24`,$counter + jc .Lhandle_ctr32_2 + vpaddb $T2,$T1,$inout1 + vpaddb $T2,$inout1,$inout2 + vpxor $Z0,$inout1,$inout1 + vpaddb $T2,$inout2,$inout3 + vpxor $Z0,$inout2,$inout2 + vpaddb $T2,$inout3,$inout4 
+ vpxor $Z0,$inout3,$inout3 + vpaddb $T2,$inout4,$inout5 + vpxor $Z0,$inout4,$inout4 + vpaddb $T2,$inout5,$T1 + vpxor $Z0,$inout5,$inout5 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + vmovups (%r12),$rndkey + lea 0x10(%r12),%r12 + dec %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),$Hkey # last round key + vaesenc $rndkey,$inout0,$inout0 + vpxor 0x00($inp),$Hkey,$Z0 + vaesenc $rndkey,$inout1,$inout1 + vpxor 0x10($inp),$Hkey,$Z1 + vaesenc $rndkey,$inout2,$inout2 + vpxor 0x20($inp),$Hkey,$Z2 + vaesenc $rndkey,$inout3,$inout3 + vpxor 0x30($inp),$Hkey,$Xi + vaesenc $rndkey,$inout4,$inout4 + vpxor 0x40($inp),$Hkey,$T2 + vaesenc $rndkey,$inout5,$inout5 + vpxor 0x50($inp),$Hkey,$Hkey + lea 0x60($inp),$inp + + vaesenclast $Z0,$inout0,$inout0 + vaesenclast $Z1,$inout1,$inout1 + vaesenclast $Z2,$inout2,$inout2 + vaesenclast $Xi,$inout3,$inout3 + vaesenclast $T2,$inout4,$inout4 + vaesenclast $Hkey,$inout5,$inout5 + vmovups $inout0,0x00($out) + vmovups $inout1,0x10($out) + vmovups $inout2,0x20($out) + vmovups $inout3,0x30($out) + vmovups $inout4,0x40($out) + vmovups $inout5,0x50($out) + lea 0x60($out),$out + + ret +.align 32 +.Lhandle_ctr32_2: + vpshufb $Ii,$T1,$Z2 # byte-swap counter + vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb + vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb + vpaddd $Z1,$Z2,$inout2 + vpaddd $Z1,$inout1,$inout3 + vpshufb $Ii,$inout1,$inout1 + vpaddd $Z1,$inout2,$inout4 + vpshufb $Ii,$inout2,$inout2 + vpxor $Z0,$inout1,$inout1 + vpaddd $Z1,$inout3,$inout5 + vpshufb $Ii,$inout3,$inout3 + vpxor $Z0,$inout2,$inout2 + vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value + vpshufb $Ii,$inout4,$inout4 + vpxor $Z0,$inout3,$inout3 + vpshufb $Ii,$inout5,$inout5 + vpxor $Z0,$inout4,$inout4 + vpshufb $Ii,$T1,$T1 # next counter value + vpxor $Z0,$inout5,$inout5 + jmp .Loop_ctr32 +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,\@function,6 +.align 32 +aesni_gcm_encrypt: + xor $ret,$ret + + # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of + # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at + # least 96 more bytes of input. 
+ cmp \$0x60*3,$len # minimal accepted length + jb .Lgcm_enc_abort + + lea (%rsp),%rax # save stack pointer + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,-0xd8(%rax) + movaps %xmm7,-0xc8(%rax) + movaps %xmm8,-0xb8(%rax) + movaps %xmm9,-0xa8(%rax) + movaps %xmm10,-0x98(%rax) + movaps %xmm11,-0x88(%rax) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +.Lgcm_enc_body: +___ +$code.=<<___; + vzeroupper + + vmovdqu ($ivp),$T1 # input counter value + add \$-128,%rsp + mov 12($ivp),$counter + lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 + lea 0x80($key),$key # size optimization + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + and \$-128,%rsp # ensure stack alignment + mov 0xf0-0x80($key),$rounds + + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Lenc_no_key_aliasing + cmp \$768,$end0 + jnc .Lenc_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Lenc_no_key_aliasing: + + lea ($out),$in0 + + # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) + # bytes before the end of the input. Note, in particular, that this is + # correct even if |$len| is not an even multiple of 96 or 16. Unlike in + # the decryption case, there's no caveat that |$out| must not be near + # the very beginning of the address space, because we know that + # |$len| >= 3*96 from the check above, and so we know + # |$out| + |$len| >= 2*96 (0xc0). + lea -0xc0($out,$len),$end0 + + shr \$4,$len + + call _aesni_ctr32_6x + vpshufb $Ii,$inout0,$Xi # save bswapped output on stack + vpshufb $Ii,$inout1,$T2 + vmovdqu $Xi,0x70(%rsp) + vpshufb $Ii,$inout2,$Z0 + vmovdqu $T2,0x60(%rsp) + vpshufb $Ii,$inout3,$Z1 + vmovdqu $Z0,0x50(%rsp) + vpshufb $Ii,$inout4,$Z2 + vmovdqu $Z1,0x40(%rsp) + vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x + vmovdqu $Z2,0x30(%rsp) + + call _aesni_ctr32_6x + + vmovdqu ($Xip),$Xi # load Xi + lea 0x20+0x20($Xip),$Xip # size optimization + sub \$12,$len + mov \$0x60*2,$ret + vpshufb $Ii,$Xi,$Xi + + call _aesni_ctr32_ghash_6x + vmovdqu 0x20(%rsp),$Z3 # I[5] + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpunpckhqdq $Z3,$Z3,$T1 + vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK + vmovups $inout0,-0x60($out) # save output + vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy + vpxor $Z3,$T1,$T1 + vmovups $inout1,-0x50($out) + vpshufb $Ii,$inout1,$inout1 + vmovups $inout2,-0x40($out) + vpshufb $Ii,$inout2,$inout2 + vmovups $inout3,-0x30($out) + vpshufb $Ii,$inout3,$inout3 + vmovups $inout4,-0x20($out) + vpshufb $Ii,$inout4,$inout4 + vmovups $inout5,-0x10($out) + vpshufb $Ii,$inout5,$inout5 + vmovdqu $inout0,0x10(%rsp) # free $inout0 +___ +{ my ($HK,$T3)=($rndkey,$inout0); + +$code.=<<___; + vmovdqu 0x30(%rsp),$Z2 # I[4] + vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 + vpunpckhqdq $Z2,$Z2,$T2 + vpclmulqdq \$0x00,$Hkey,$Z3,$Z1 + vpxor $Z2,$T2,$T2 + vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 + vpclmulqdq \$0x00,$HK,$T1,$T1 + + vmovdqu 0x40(%rsp),$T3 # I[3] + vpclmulqdq \$0x00,$Ii,$Z2,$Z0 + vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $T3,$T3,$Z1 + vpclmulqdq \$0x11,$Ii,$Z2,$Z2 + vpxor $T3,$Z1,$Z1 + vpxor $Z3,$Z2,$Z2 + vpclmulqdq \$0x10,$HK,$T2,$T2 + vmovdqu 0x50-0x20($Xip),$HK + vpxor $T1,$T2,$T2 + + vmovdqu 0x50(%rsp),$T1 # I[2] + vpclmulqdq \$0x00,$Hkey,$T3,$Z3 + vmovdqu 
0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 + vpxor $Z0,$Z3,$Z3 + vpunpckhqdq $T1,$T1,$Z0 + vpclmulqdq \$0x11,$Hkey,$T3,$T3 + vpxor $T1,$Z0,$Z0 + vpxor $Z2,$T3,$T3 + vpclmulqdq \$0x00,$HK,$Z1,$Z1 + vpxor $T2,$Z1,$Z1 + + vmovdqu 0x60(%rsp),$T2 # I[1] + vpclmulqdq \$0x00,$Ii,$T1,$Z2 + vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 + vpxor $Z3,$Z2,$Z2 + vpunpckhqdq $T2,$T2,$Z3 + vpclmulqdq \$0x11,$Ii,$T1,$T1 + vpxor $T2,$Z3,$Z3 + vpxor $T3,$T1,$T1 + vpclmulqdq \$0x10,$HK,$Z0,$Z0 + vmovdqu 0x80-0x20($Xip),$HK + vpxor $Z1,$Z0,$Z0 + + vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0] + vpclmulqdq \$0x00,$Hkey,$T2,$Z1 + vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 + vpunpckhqdq $Xi,$Xi,$T3 + vpxor $Z2,$Z1,$Z1 + vpclmulqdq \$0x11,$Hkey,$T2,$T2 + vpxor $Xi,$T3,$T3 + vpxor $T1,$T2,$T2 + vpclmulqdq \$0x00,$HK,$Z3,$Z3 + vpxor $Z0,$Z3,$Z0 + + vpclmulqdq \$0x00,$Ii,$Xi,$Z2 + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpunpckhqdq $inout5,$inout5,$T1 + vpclmulqdq \$0x11,$Ii,$Xi,$Xi + vpxor $inout5,$T1,$T1 + vpxor $Z1,$Z2,$Z1 + vpclmulqdq \$0x10,$HK,$T3,$T3 + vmovdqu 0x20-0x20($Xip),$HK + vpxor $T2,$Xi,$Z3 + vpxor $Z0,$T3,$Z2 + + vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 + vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing + vpclmulqdq \$0x00,$Hkey,$inout5,$Z0 + vpxor $T3,$Z2,$Z2 + vpunpckhqdq $inout4,$inout4,$T2 + vpclmulqdq \$0x11,$Hkey,$inout5,$inout5 + vpxor $inout4,$T2,$T2 + vpslldq \$8,$Z2,$T3 + vpclmulqdq \$0x00,$HK,$T1,$T1 + vpxor $T3,$Z1,$Xi + vpsrldq \$8,$Z2,$Z2 + vpxor $Z2,$Z3,$Z3 + + vpclmulqdq \$0x00,$Ii,$inout4,$Z1 + vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 + vpxor $Z0,$Z1,$Z1 + vpunpckhqdq $inout3,$inout3,$T3 + vpclmulqdq \$0x11,$Ii,$inout4,$inout4 + vpxor $inout3,$T3,$T3 + vpxor $inout5,$inout4,$inout4 + vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase + vpclmulqdq \$0x10,$HK,$T2,$T2 + vmovdqu 0x50-0x20($Xip),$HK + vpxor $T1,$T2,$T2 + + vpclmulqdq \$0x00,$Hkey,$inout3,$Z0 + vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $inout2,$inout2,$T1 + vpclmulqdq \$0x11,$Hkey,$inout3,$inout3 + vpxor $inout2,$T1,$T1 + vpxor $inout4,$inout3,$inout3 + vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0 + vpclmulqdq \$0x00,$HK,$T3,$T3 + vpxor $T2,$T3,$T3 + + vpclmulqdq \$0x10,0x10($const),$Xi,$Xi + vxorps $inout5,$Xi,$Xi + + vpclmulqdq \$0x00,$Ii,$inout2,$Z1 + vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 + vpxor $Z0,$Z1,$Z1 + vpunpckhqdq $inout1,$inout1,$T2 + vpclmulqdq \$0x11,$Ii,$inout2,$inout2 + vpxor $inout1,$T2,$T2 + vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase + vpxor $inout3,$inout2,$inout2 + vpclmulqdq \$0x10,$HK,$T1,$T1 + vmovdqu 0x80-0x20($Xip),$HK + vpxor $T3,$T1,$T1 + + vxorps $Z3,$inout5,$inout5 + vpclmulqdq \$0x10,0x10($const),$Xi,$Xi + vxorps $inout5,$Xi,$Xi + + vpclmulqdq \$0x00,$Hkey,$inout1,$Z0 + vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $Xi,$Xi,$T3 + vpclmulqdq \$0x11,$Hkey,$inout1,$inout1 + vpxor $Xi,$T3,$T3 + vpxor $inout2,$inout1,$inout1 + vpclmulqdq \$0x00,$HK,$T2,$T2 + vpxor $T1,$T2,$T2 + + vpclmulqdq \$0x00,$Ii,$Xi,$Z1 + vpclmulqdq \$0x11,$Ii,$Xi,$Z3 + vpxor $Z0,$Z1,$Z1 + vpclmulqdq \$0x10,$HK,$T3,$Z2 + vpxor $inout1,$Z3,$Z3 + vpxor $T2,$Z2,$Z2 + + vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing + vpxor $Z0,$Z2,$Z2 + vpslldq \$8,$Z2,$T1 + vmovdqu 0x10($const),$Hkey # .Lpoly + vpsrldq \$8,$Z2,$Z2 + vpxor $T1,$Z1,$Xi + vpxor $Z2,$Z3,$Z3 + + vpalignr \$8,$Xi,$Xi,$T2 # 1st phase + vpclmulqdq \$0x10,$Hkey,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase + vpclmulqdq 
\$0x10,$Hkey,$Xi,$Xi + vpxor $Z3,$T2,$T2 + vpxor $T2,$Xi,$Xi +___ +} +$code.=<<___; + vpshufb ($const),$Xi,$Xi # .Lbswap_mask + vmovdqu $Xi,-0x40($Xip) # output Xi + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp # restore %rsp +.Lgcm_enc_abort: + mov $ret,%rax # return value + ret +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +___ + +$code.=<<___; +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: + .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by " +.align 64 +___ +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___ +.extern __imp_RtlVirtualUnwind +.type gcm_se_handler,\@abi-omnipotent +.align 16 +gcm_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 120($context),%rax # pull context->Rax + + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + mov %r15,240($context) + mov %r14,232($context) + mov %r13,224($context) + mov %r12,216($context) + mov %rbp,160($context) + mov %rbx,144($context) + + lea -0xd8(%rax),%rsi # %xmm save area + lea 512($context),%rdi # & context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size gcm_se_handler,.-gcm_se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_aesni_gcm_decrypt + .rva .LSEH_end_aesni_gcm_decrypt + .rva .LSEH_gcm_dec_info + + .rva 
.LSEH_begin_aesni_gcm_encrypt + .rva .LSEH_end_aesni_gcm_encrypt + .rva .LSEH_gcm_enc_info +.section .xdata +.align 8 +.LSEH_gcm_dec_info: + .byte 9,0,0,0 + .rva gcm_se_handler + .rva .Lgcm_dec_body,.Lgcm_dec_abort +.LSEH_gcm_enc_info: + .byte 9,0,0,0 + .rva gcm_se_handler + .rva .Lgcm_enc_body,.Lgcm_enc_abort +___ +} +}}} else {{{ +$code=<<___; # assembler is too old +.text + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,\@abi-omnipotent +aesni_gcm_encrypt: + xor %eax,%eax + ret +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,\@abi-omnipotent +aesni_gcm_decrypt: + xor %eax,%eax + ret +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ +}}} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-alpha.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-alpha.pl new file mode 100644 index 0000000..ccf6b2b --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/ghash-alpha.pl @@ -0,0 +1,467 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Even though +# loops are aggressively modulo-scheduled in respect to references to +# Htbl and Z.hi updates for 8 cycles per byte, measured performance is +# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic +# scheduling "glitch," because uprofile(1) indicates uniform sample +# distribution, as if all instruction bundles execute in 1.5 cycles. +# Meaning that it could have been even faster, yet 12 cycles is ~60% +# better than gcc-generated code and ~80% than code generated by vendor +# compiler. 
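For reference, the single multiplication in GF(2^128) that gcm_gmult_4bit and the other GHASH routines in this patch accelerate can be sketched bit by bit in portable C as below. This is a minimal illustration following the right-shift formulation in NIST SP 800-38D, not code from the patch; the type and function names are made up for the example. The "4-bit" variants above compute the same product a nibble at a time, with their rem_4bit tables supplying the corresponding reduction step.

  #include <stdint.h>

  /* One 128-bit GHASH block as two big-endian 64-bit halves:
   * hi holds bytes 0..7 of the block, lo holds bytes 8..15. */
  typedef struct { uint64_t hi, lo; } u128_ref;

  /* Z = X * H in GF(2^128) with the GCM polynomial
   * x^128 + x^7 + x^2 + x + 1, consuming X one bit at a time. */
  static u128_ref gf128_mul_ref(u128_ref X, u128_ref H)
  {
      u128_ref Z = { 0, 0 }, V = H;

      for (int i = 0; i < 128; i++) {
          /* bit i of X, counting from the MSB of byte 0 */
          uint64_t xbit = (i < 64) ? (X.hi >> (63 - i)) & 1
                                   : (X.lo >> (127 - i)) & 1;
          if (xbit) {                   /* Z ^= V when the bit is set */
              Z.hi ^= V.hi;
              Z.lo ^= V.lo;
          }
          /* V = V * x: shift right one bit; if a bit fell off the low
           * end, fold in the reduction constant 0xE1 << 120. */
          uint64_t carry = V.lo & 1;
          V.lo = (V.lo >> 1) | (V.hi << 63);
          V.hi >>= 1;
          if (carry)
              V.hi ^= 0xe100000000000000ULL;
      }
      return Z;
  }
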
+ +$cnt="v0"; # $0 +$t0="t0"; +$t1="t1"; +$t2="t2"; +$Thi0="t3"; # $4 +$Tlo0="t4"; +$Thi1="t5"; +$Tlo1="t6"; +$rem="t7"; # $8 +################# +$Xi="a0"; # $16, input argument block +$Htbl="a1"; +$inp="a2"; +$len="a3"; +$nlo="a4"; # $20 +$nhi="a5"; +$Zhi="t8"; +$Zlo="t9"; +$Xhi="t10"; # $24 +$Xlo="t11"; +$remp="t12"; +$rem_4bit="AT"; # $28 + +{ my $N; + sub loop() { + + $N++; +$code.=<<___; +.align 4 + extbl $Xlo,7,$nlo + and $nlo,0xf0,$nhi + sll $nlo,4,$nlo + and $nlo,0xf0,$nlo + + addq $nlo,$Htbl,$nlo + ldq $Zlo,8($nlo) + addq $nhi,$Htbl,$nhi + ldq $Zhi,0($nlo) + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + lda $cnt,6(zero) + extbl $Xlo,6,$nlo + + ldq $Tlo1,8($nhi) + s8addq $remp,$rem_4bit,$remp + ldq $Thi1,0($nhi) + srl $Zlo,4,$Zlo + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + and $nlo,0xf0,$nhi + + xor $Tlo1,$Zlo,$Zlo + sll $nlo,4,$nlo + xor $Thi1,$Zhi,$Zhi + and $nlo,0xf0,$nlo + + addq $nlo,$Htbl,$nlo + ldq $Tlo0,8($nlo) + addq $nhi,$Htbl,$nhi + ldq $Thi0,0($nlo) + +.Looplo$N: + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + subq $cnt,1,$cnt + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xlo,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + bne $cnt,.Looplo$N + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + lda $cnt,7(zero) + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xhi,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + unop + + +.Loophi$N: + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + subq $cnt,1,$cnt + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xhi,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + bne $cnt,.Loophi$N + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + + xor $Tlo0,$Zlo,$Zlo + xor $Thi0,$Zhi,$Zhi + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + xor 
$t0,$Zlo,$Zlo + xor $rem,$Zhi,$Zhi +___ +}} + +$code=<<___; +#ifdef __linux__ +#include +#else +#include +#include +#endif + +.text + +.set noat +.set noreorder +.globl gcm_gmult_4bit +.align 4 +.ent gcm_gmult_4bit +gcm_gmult_4bit: + .frame sp,0,ra + .prologue 0 + + ldq $Xlo,8($Xi) + ldq $Xhi,0($Xi) + + bsr $t0,picmeup + nop +___ + + &loop(); + +$code.=<<___; + srl $Zlo,24,$t0 # byte swap + srl $Zlo,8,$t1 + + sll $Zlo,8,$t2 + sll $Zlo,24,$Zlo + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + + zapnot $Zlo,0x88,$Zlo + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zlo,$t0,$Zlo + srl $Zhi,24,$t0 + srl $Zhi,8,$t1 + + or $Zlo,$t2,$Zlo + sll $Zhi,8,$t2 + sll $Zhi,24,$Zhi + + srl $Zlo,32,$Xlo + sll $Zlo,32,$Zlo + + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + stq $Xlo,8($Xi) + stq $Xhi,0($Xi) + + ret (ra) +.end gcm_gmult_4bit +___ + +$inhi="s0"; +$inlo="s1"; + +$code.=<<___; +.globl gcm_ghash_4bit +.align 4 +.ent gcm_ghash_4bit +gcm_ghash_4bit: + lda sp,-32(sp) + stq ra,0(sp) + stq s0,8(sp) + stq s1,16(sp) + .mask 0x04000600,-32 + .frame sp,32,ra + .prologue 0 + + ldq_u $inhi,0($inp) + ldq_u $Thi0,7($inp) + ldq_u $inlo,8($inp) + ldq_u $Tlo0,15($inp) + ldq $Xhi,0($Xi) + ldq $Xlo,8($Xi) + + bsr $t0,picmeup + nop + +.Louter: + extql $inhi,$inp,$inhi + extqh $Thi0,$inp,$Thi0 + or $inhi,$Thi0,$inhi + lda $inp,16($inp) + + extql $inlo,$inp,$inlo + extqh $Tlo0,$inp,$Tlo0 + or $inlo,$Tlo0,$inlo + subq $len,16,$len + + xor $Xlo,$inlo,$Xlo + xor $Xhi,$inhi,$Xhi +___ + + &loop(); + +$code.=<<___; + srl $Zlo,24,$t0 # byte swap + srl $Zlo,8,$t1 + + sll $Zlo,8,$t2 + sll $Zlo,24,$Zlo + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + + zapnot $Zlo,0x88,$Zlo + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zlo,$t0,$Zlo + srl $Zhi,24,$t0 + srl $Zhi,8,$t1 + + or $Zlo,$t2,$Zlo + sll $Zhi,8,$t2 + sll $Zhi,24,$Zhi + + srl $Zlo,32,$Xlo + sll $Zlo,32,$Zlo + beq $len,.Ldone + + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + ldq_u $inhi,0($inp) + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + ldq_u $Thi0,7($inp) + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + ldq_u $inlo,8($inp) + ldq_u $Tlo0,15($inp) + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + br zero,.Louter + +.Ldone: + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + + stq $Xlo,8($Xi) + stq $Xhi,0($Xi) + + .set noreorder + /*ldq ra,0(sp)*/ + ldq s0,8(sp) + ldq s1,16(sp) + lda sp,32(sp) + ret (ra) +.end gcm_ghash_4bit + +.align 4 +.ent picmeup +picmeup: + .frame sp,0,$t0 + .prologue 0 + br $rem_4bit,.Lpic +.Lpic: lda $rem_4bit,12($rem_4bit) + ret ($t0) +.end picmeup + nop +rem_4bit: + .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16 + .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16 + .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16 + .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16 +.ascii "GHASH for Alpha, CRYPTOGAMS by " +.align 4 + +___ +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT; + diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-armv4.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-armv4.pl new file mode 100644 index 0000000..7d880c9 --- /dev/null +++ 
b/openssl-1.1.0h/crypto/modes/asm/ghash-armv4.pl @@ -0,0 +1,554 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# April 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+32 bytes shared table]. There is no +# experimental performance data available yet. The only approximation +# that can be made at this point is based on code size. Inner loop is +# 32 instructions long and on single-issue core should execute in <40 +# cycles. Having verified that gcc 3.4 didn't unroll corresponding +# loop, this assembler loop body was found to be ~3x smaller than +# compiler-generated one... +# +# July 2010 +# +# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on +# Cortex A8 core and ~25 cycles per processed byte (which was observed +# to be ~3 times faster than gcc-generated code:-) +# +# February 2011 +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Cortex A8 core and ~23.5 cycles per byte. +# +# March 2011 +# +# Add NEON implementation featuring polynomial multiplication, i.e. no +# lookup tables involved. On Cortex A8 it was measured to process one +# byte in 15 cycles or 55% faster than integer-only code. +# +# April 2014 +# +# Switch to multiplication algorithm suggested in paper referred +# below and combine it with reduction algorithm from x86 module. +# Performance improvement over previous version varies from 65% on +# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 +# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63, +# Snapdragon S4 - in 9.33. +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf + +# ==================================================================== +# Note about "528B" variant. In ARM case it makes lesser sense to +# implement it for following reasons: +# +# - performance improvement won't be anywhere near 50%, because 128- +# bit shift operation is neatly fused with 128-bit xor here, and +# "538B" variant would eliminate only 4-5 instructions out of 32 +# in the inner loop (meaning that estimated improvement is ~15%); +# - ARM-based systems are often embedded ones and extra memory +# consumption might be unappreciated (for so little improvement); +# +# Byte order [in]dependence. ========================================= +# +# Caller is expected to maintain specific *dword* order in Htable, +# namely with *least* significant dword of 128-bit value at *lower* +# address. This differs completely from C code and has everything to +# do with ldm instruction and order in which dwords are "consumed" by +# algorithm. 
*Byte* order within these dwords in turn is whatever +# *native* byte order on current platform. See gcm128.c for working +# example... + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$Xi="r0"; # argument block +$Htbl="r1"; +$inp="r2"; +$len="r3"; + +$Zll="r4"; # variables +$Zlh="r5"; +$Zhl="r6"; +$Zhh="r7"; +$Tll="r8"; +$Tlh="r9"; +$Thl="r10"; +$Thh="r11"; +$nlo="r12"; +################# r13 is stack pointer +$nhi="r14"; +################# r15 is program counter + +$rem_4bit=$inp; # used in gcm_gmult_4bit +$cnt=$len; + +sub Zsmash() { + my $i=12; + my @args=@_; + for ($Zll,$Zlh,$Zhl,$Zhh) { + $code.=<<___; +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev $_,$_ + str $_,[$Xi,#$i] +#elif defined(__ARMEB__) + str $_,[$Xi,#$i] +#else + mov $Tlh,$_,lsr#8 + strb $_,[$Xi,#$i+3] + mov $Thl,$_,lsr#16 + strb $Tlh,[$Xi,#$i+2] + mov $Thh,$_,lsr#24 + strb $Thl,[$Xi,#$i+1] + strb $Thh,[$Xi,#$i] +#endif +___ + $code.="\t".shift(@args)."\n"; + $i-=4; + } +} + +$code=<<___; +#include "arm_arch.h" + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +#ifdef __clang__ +#define ldrplb ldrbpl +#define ldrneb ldrbne +#endif + +.type rem_4bit,%object +.align 5 +rem_4bit: +.short 0x0000,0x1C20,0x3840,0x2460 +.short 0x7080,0x6CA0,0x48C0,0x54E0 +.short 0xE100,0xFD20,0xD940,0xC560 +.short 0x9180,0x8DA0,0xA9C0,0xB5E0 +.size rem_4bit,.-rem_4bit + +.type rem_4bit_get,%function +rem_4bit_get: +#if defined(__thumb2__) + adr $rem_4bit,rem_4bit +#else + sub $rem_4bit,pc,#8+32 @ &rem_4bit +#endif + b .Lrem_4bit_got + nop + nop +.size rem_4bit_get,.-rem_4bit_get + +.global gcm_ghash_4bit +.type gcm_ghash_4bit,%function +.align 4 +gcm_ghash_4bit: +#if defined(__thumb2__) + adr r12,rem_4bit +#else + sub r12,pc,#8+48 @ &rem_4bit +#endif + add $len,$inp,$len @ $len to point at the end + stmdb sp!,{r3-r11,lr} @ save $len/end too + + ldmia r12,{r4-r11} @ copy rem_4bit ... + stmdb sp!,{r4-r11} @ ... 
to stack + + ldrb $nlo,[$inp,#15] + ldrb $nhi,[$Xi,#15] +.Louter: + eor $nlo,$nlo,$nhi + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + mov $cnt,#14 + + add $Zhh,$Htbl,$nlo,lsl#4 + ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] + add $Thh,$Htbl,$nhi + ldrb $nlo,[$inp,#14] + + and $nhi,$Zll,#0xf @ rem + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + add $nhi,$nhi,$nhi + eor $Zll,$Tll,$Zll,lsr#4 + ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] + eor $Zll,$Zll,$Zlh,lsl#28 + ldrb $nhi,[$Xi,#14] + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + eor $nlo,$nlo,$nhi + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tll,lsl#16 + +.Linner: + add $Thh,$Htbl,$nlo,lsl#4 + and $nlo,$Zll,#0xf @ rem + subs $cnt,$cnt,#1 + add $nlo,$nlo,$nlo + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] + eor $Zhl,$Thl,$Zhl,lsr#4 +#ifdef __thumb2__ + it pl +#endif + ldrplb $nlo,[$inp,$cnt] + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + add $nhi,$nhi,$nhi + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + eor $Zll,$Tll,$Zll,lsr#4 +#ifdef __thumb2__ + it pl +#endif + ldrplb $Tll,[$Xi,$cnt] + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + ldrh $Tlh,[sp,$nhi] + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 +#ifdef __thumb2__ + it pl +#endif + eorpl $nlo,$nlo,$Tll + eor $Zhh,$Thh,$Zhh,lsr#4 +#ifdef __thumb2__ + itt pl +#endif + andpl $nhi,$nlo,#0xf0 + andpl $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] + bpl .Linner + + ldr $len,[sp,#32] @ re-load $len/end + add $inp,$inp,#16 + mov $nhi,$Zll +___ + &Zsmash("cmp\t$inp,$len","\n". + "#ifdef __thumb2__\n". + " it ne\n". + "#endif\n". 
+ " ldrneb $nlo,[$inp,#15]"); +$code.=<<___; + bne .Louter + + add sp,sp,#36 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size gcm_ghash_4bit,.-gcm_ghash_4bit + +.global gcm_gmult_4bit +.type gcm_gmult_4bit,%function +gcm_gmult_4bit: + stmdb sp!,{r4-r11,lr} + ldrb $nlo,[$Xi,#15] + b rem_4bit_get +.Lrem_4bit_got: + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + mov $cnt,#14 + + add $Zhh,$Htbl,$nlo,lsl#4 + ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] + ldrb $nlo,[$Xi,#14] + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + add $nhi,$nhi,$nhi + eor $Zll,$Tll,$Zll,lsr#4 + ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + and $nhi,$nlo,#0xf0 + eor $Zhh,$Zhh,$Tll,lsl#16 + and $nlo,$nlo,#0x0f + +.Loop: + add $Thh,$Htbl,$nlo,lsl#4 + and $nlo,$Zll,#0xf @ rem + subs $cnt,$cnt,#1 + add $nlo,$nlo,$nlo + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] + eor $Zhl,$Thl,$Zhl,lsr#4 +#ifdef __thumb2__ + it pl +#endif + ldrplb $nlo,[$Xi,$cnt] + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + add $nhi,$nhi,$nhi + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 +#ifdef __thumb2__ + itt pl +#endif + andpl $nhi,$nlo,#0xf0 + andpl $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + bpl .Loop +___ + &Zsmash(); +$code.=<<___; +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size gcm_gmult_4bit,.-gcm_gmult_4bit +___ +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); +my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); + +sub clmul64x64 { +my ($r,$a,$b)=@_; +$code.=<<___; + vext.8 $t0#lo, $a, $a, #1 @ A1 + vmull.p8 $t0, $t0#lo, $b @ F = A1*B + vext.8 $r#lo, $b, $b, #1 @ B1 + vmull.p8 $r, $a, $r#lo @ E = A*B1 + vext.8 $t1#lo, $a, $a, #2 @ A2 + vmull.p8 $t1, $t1#lo, $b @ H = A2*B + vext.8 $t3#lo, $b, $b, #2 @ B2 + vmull.p8 $t3, $a, $t3#lo @ G = A*B2 + vext.8 $t2#lo, $a, $a, #3 @ A3 + veor $t0, $t0, $r @ L = E + F + vmull.p8 $t2, $t2#lo, $b @ J = A3*B + vext.8 $r#lo, $b, $b, #3 @ B3 + veor $t1, $t1, $t3 @ M = G + H + vmull.p8 $r, $a, $r#lo @ I = A*B3 + veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 + vand $t0#hi, $t0#hi, $k48 + vext.8 $t3#lo, $b, $b, #4 @ B4 + veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 + vand $t1#hi, $t1#hi, $k32 + vmull.p8 $t3, $a, $t3#lo @ K = A*B4 + veor $t2, $t2, $r @ N = I + J + veor $t0#lo, $t0#lo, $t0#hi + veor $t1#lo, $t1#lo, $t1#hi + veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 + vand $t2#hi, $t2#hi, $k16 + vext.8 $t0, $t0, $t0, #15 + veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 + vmov.i64 $t3#hi, #0 + vext.8 $t1, $t1, $t1, #14 + veor $t2#lo, $t2#lo, $t2#hi + 
vmull.p8 $r, $a, $b @ D = A*B + vext.8 $t3, $t3, $t3, #12 + vext.8 $t2, $t2, $t2, #13 + veor $t0, $t0, $t1 + veor $t2, $t2, $t3 + veor $r, $r, $t0 + veor $r, $r, $t2 +___ +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + vld1.64 $IN#hi,[r1]! @ load H + vmov.i8 $t0,#0xe1 + vld1.64 $IN#lo,[r1] + vshl.i64 $t0#hi,#57 + vshr.u64 $t0#lo,#63 @ t0=0xc2....01 + vdup.8 $t1,$IN#hi[7] + vshr.u64 $Hlo,$IN#lo,#63 + vshr.s8 $t1,#7 @ broadcast carry bit + vshl.i64 $IN,$IN,#1 + vand $t0,$t0,$t1 + vorr $IN#hi,$Hlo @ H<<<=1 + veor $IN,$IN,$t0 @ twisted H + vstmia r0,{$IN} + + ret @ bx lr +.size gcm_init_neon,.-gcm_init_neon + +.global gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + vld1.64 $IN#hi,[$Xi]! @ load Xi + vld1.64 $IN#lo,[$Xi]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing + mov $len,#16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.global gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + vld1.64 $Xl#hi,[$Xi]! @ load Xi + vld1.64 $Xl#lo,[$Xi]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 $Xl,$Xl +#endif + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing + +.Loop_neon: + vld1.64 $IN#hi,[$inp]! @ load inp + vld1.64 $IN#lo,[$inp]! +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + veor $IN,$Xl @ inp^=Xi +.Lgmult_neon: +___ + &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo +$code.=<<___; + veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing +___ + &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi) + &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi +$code.=<<___; + veor $Xm,$Xm,$Xl @ Karatsuba post-processing + veor $Xm,$Xm,$Xh + veor $Xl#hi,$Xl#hi,$Xm#lo + veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result + + @ equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 $t1,$Xl,#57 @ 1st phase + vshl.i64 $t2,$Xl,#62 + veor $t2,$t2,$t1 @ + vshl.i64 $t1,$Xl,#63 + veor $t2, $t2, $t1 @ + veor $Xl#hi,$Xl#hi,$t2#lo @ + veor $Xh#lo,$Xh#lo,$t2#hi + + vshr.u64 $t2,$Xl,#1 @ 2nd phase + veor $Xh,$Xh,$Xl + veor $Xl,$Xl,$t2 @ + vshr.u64 $t2,$t2,#6 + vshr.u64 $Xl,$Xl,#1 @ + veor $Xl,$Xl,$Xh @ + veor $Xl,$Xl,$t2 @ + + subs $len,#16 + bne .Loop_neon + +#ifdef __ARMEL__ + vrev64.8 $Xl,$Xl +#endif + sub $Xi,#16 + vst1.64 $Xl#hi,[$Xi]! @ write out Xi + vst1.64 $Xl#lo,[$Xi] + + ret @ bx lr +.size gcm_ghash_neon,.-gcm_ghash_neon +#endif +___ +} +$code.=<<___; +.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by " +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-c64xplus.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-c64xplus.pl new file mode 100644 index 0000000..3cadda3 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/ghash-c64xplus.pl @@ -0,0 +1,247 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). 
You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# December 2011 +# +# The module implements GCM GHASH function and underlying single +# multiplication operation in GF(2^128). Even though subroutines +# have _4bit suffix, they are not using any tables, but rely on +# hardware Galois Field Multiply support. Streamed GHASH processes +# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven +# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are +# comparing apples vs. oranges, but compiler surely could have done +# better, because theoretical [though not necessarily achievable] +# estimate for "4-bit" table-driven implementation is ~12 cycles. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # arguments + +($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3, + $H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27)); +($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y, + $H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27)); +($FF000000,$E10000)=("B30","B31"); +($xip,$x0,$x1,$xib)=map("B$_",(6..9)); # $xip zaps $len + $xia="A9"; +($rem,$res)=("B4","B5"); # $rem zaps $Htable + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg gcm_gmult_1bit,_gcm_gmult_1bit + .asg gcm_gmult_4bit,_gcm_gmult_4bit + .asg gcm_ghash_4bit,_gcm_ghash_4bit + .endif + + .asg B3,RA + + .if 0 + .global _gcm_gmult_1bit +_gcm_gmult_1bit: + ADDAD $Htable,2,$Htable + .endif + .global _gcm_gmult_4bit +_gcm_gmult_4bit: + .asmfunc + LDDW *${Htable}[-1],$H1:$H0 ; H.lo + LDDW *${Htable}[-2],$H3:$H2 ; H.hi +|| MV $Xip,${xip} ; reassign Xi +|| MVK 15,B1 ; SPLOOPD constant + + MVK 0xE1,$E10000 +|| LDBU *++${xip}[15],$x1 ; Xi[15] + MVK 0xFF,$FF000000 +|| LDBU *--${xip},$x0 ; Xi[14] + SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial + SHL $FF000000,24,$FF000000 ; upper byte mask +|| BNOP ghash_loop? 
+|| MVK 1,B0 ; take a single spin + + PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes + AND $H2,$FF000000,$H2u ; H2's upper byte + AND $H3,$FF000000,$H3u ; H3's upper byte +|| SHRU $H2u,8,$H2u + SHRU $H3u,8,$H3u +|| ZERO $Z1:$Z0 + SHRU2 $xia,8,$H01u +|| ZERO $Z3:$Z2 + .endasmfunc + + .global _gcm_ghash_4bit +_gcm_ghash_4bit: + .asmfunc + LDDW *${Htable}[-1],$H1:$H0 ; H.lo +|| SHRU $len,4,B0 ; reassign len + LDDW *${Htable}[-2],$H3:$H2 ; H.hi +|| MV $Xip,${xip} ; reassign Xi +|| MVK 15,B1 ; SPLOOPD constant + + MVK 0xE1,$E10000 +|| [B0] LDNDW *${inp}[1],$H1x:$H0x + MVK 0xFF,$FF000000 +|| [B0] LDNDW *${inp}++[2],$H3x:$H2x + SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial +|| LDDW *${xip}[1],$Z1:$Z0 + SHL $FF000000,24,$FF000000 ; upper byte mask +|| LDDW *${xip}[0],$Z3:$Z2 + + PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes + AND $H2,$FF000000,$H2u ; H2's upper byte + AND $H3,$FF000000,$H3u ; H3's upper byte +|| SHRU $H2u,8,$H2u + SHRU $H3u,8,$H3u + SHRU2 $xia,8,$H01u + +|| [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp +|| [B0] XOR $H1x,$Z1,$Z1 + .if .LITTLE_ENDIAN + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z1,16,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .else + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z0,8,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .endif + STDW $Z3:$Z2,*${xip}[0] +|| [B0] ZERO $Z3:$Z2 +|| [B0] MV $xia,$x1 + [B0] ADDK 14,${xip} + +ghash_loop?: + SPLOOPD 6 ; 6*16+7 +|| MVC B1,ILC +|| [B0] SUB B0,1,B0 +|| ZERO A0 +|| ADD $x1,$x1,$xib ; SHL $x1,1,$xib +|| SHL $x1,1,$xia +___ + +########____________________________ +# 0 D2. M1 M2 | +# 1 M1 | +# 2 M1 M2 | +# 3 D1. M1 M2 | +# 4 S1. L1 | +# 5 S2 S1x L1 D2 L2 |____________________________ +# 6/0 L1 S1 L2 S2x |D2. M1 M2 | +# 7/1 L1 S1 D1x S2 M2 | M1 | +# 8/2 S1 L1x S2 | M1 M2 | +# 9/3 S1 L1x | D1. M1 M2 | +# 10/4 D1x | S1. L1 | +# 11/5 |S2 S1x L1 D2 L2 |____________ +# 12/6/0 D1x __| L1 S1 L2 S2x |D2. .... +# 7/1 L1 S1 D1x S2 M2 | .... +# 8/2 S1 L1x S2 | .... +#####... ................|............ +$code.=<<___; + XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1) +|| XORMPY $H01u,$xib,$H01y +|| [A0] LDBU *--${xip},$x0 + XORMPY $H1,$xia,$H1x ; 1 + XORMPY $H2,$xia,$H2x ; 2 +|| XORMPY $H2u,$xib,$H2y + XORMPY $H3,$xia,$H3x ; 3 +|| XORMPY $H3u,$xib,$H3y +||[!A0] MVK.D 15,A0 ; *--${xip} counter + XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1) +|| [A0] SUB.S A0,1,A0 + XOR.L $H1x,$Z1,$Z1 ; 5 +|| AND.D $H01y,$FF000000,$H0z +|| SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y +|| SHL $x0,1,$xib +|| SHL $x0,1,$xia + + XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue +|| SHL $Z0,1,$rem ; ; rem=Z<<1 +|| SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8 +|| AND.L $H1y,$FF000000,$H1z + XOR.L $H3x,$Z3,$Z3 ; 7/1 +|| SHRMB.S $Z2,$Z1,$Z1 +|| XOR.D $H0z,$Z0,$Z0 ; merge upper byte products +|| AND.S $H2y,$FF000000,$H2z +|| XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE + XOR.L $H1z,$Z1,$Z1 ; 8/2 +|| SHRMB.S $Z3,$Z2,$Z2 +|| AND.S $H3y,$FF000000,$H3z + XOR.L $H2z,$Z2,$Z2 ; 9/3 +|| SHRU $Z3,8,$Z3 + XOR.D $H3z,$Z3,$Z3 ; 10/4 + NOP ; 11/5 + + SPKERNEL 0,2 +|| XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res + + ; input pre-fetch is possible where D1 slot is available... 
+ [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/- + [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/- + NOP ; 10/- + .if .LITTLE_ENDIAN + SWAP2 $Z0,$Z1 ; 11/- +|| SWAP4 $Z1,$Z0 + SWAP4 $Z1,$Z1 ; 12/- +|| SWAP2 $Z0,$Z0 + SWAP2 $Z2,$Z3 +|| SWAP4 $Z3,$Z2 +||[!B0] BNOP RA + SWAP4 $Z3,$Z3 +|| SWAP2 $Z2,$Z2 +|| [B0] BNOP ghash_loop? + [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp +|| [B0] XOR $H1x,$Z1,$Z1 + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z1,16,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .else + [!B0] BNOP RA ; 11/- + [B0] BNOP ghash_loop? ; 12/- + [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp +|| [B0] XOR $H1x,$Z1,$Z1 + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z0,8,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .endif + STDW $Z3:$Z2,*${xip}[0] +|| [B0] ZERO $Z3:$Z2 +|| [B0] MV $xia,$x1 + [B0] ADDK 14,${xip} + .endasmfunc + + .sect .const + .cstring "GHASH for C64x+, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-ia64.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-ia64.pl new file mode 100755 index 0000000..81e75f7 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/ghash-ia64.pl @@ -0,0 +1,470 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Streamed +# GHASH performance was measured to be 6.67 cycles per processed byte +# on Itanium 2, which is >90% better than Microsoft compiler generated +# code. To anchor to something else sha1-ia64.pl module processes one +# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per +# byte. + +# September 2010 +# +# It was originally thought that it makes lesser sense to implement +# "528B" variant on Itanium 2 for following reason. Because number of +# functional units is naturally limited, it appeared impossible to +# implement "528B" loop in 4 cycles, only in 5. This would mean that +# theoretically performance improvement couldn't be more than 20%. +# But occasionally you prove yourself wrong:-) I figured out a way to +# fold couple of instructions and having freed yet another instruction +# slot by unrolling the loop... Resulting performance is 4.45 cycles +# per processed byte and 50% better than "256B" version. On original +# Itanium performance should remain the same as the "256B" version, +# i.e. ~8.5 cycles. 
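For reference, the "4-bit" method that this module and the other table-driven modules in this directory implement is the nibble-at-a-time GHASH multiplication: Htable[] holds the sixteen nibble multiples of H, Xi is consumed from byte 15 downwards, and the four bits shifted out of the accumulator on each step are folded back in through the shared rem_4bit table. The following is a minimal C sketch only (64-bit, byte-addressed Xi; names loosely follow the portable gcm_gmult_4bit in crypto/modes/gcm128.c, and gmult_4bit/u128 here are just local sketch names), not part of the vendored files:

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128;   /* one GF(2^128) element */

    /* rem_4bit[i]: reduction of the 4 bits shifted out of Z, pre-shifted
     * into the top 16 bits, matching the per-module rem_4bit tables. */
    static const uint64_t rem_4bit[16] = {
        (uint64_t)0x0000 << 48, (uint64_t)0x1C20 << 48,
        (uint64_t)0x3840 << 48, (uint64_t)0x2460 << 48,
        (uint64_t)0x7080 << 48, (uint64_t)0x6CA0 << 48,
        (uint64_t)0x48C0 << 48, (uint64_t)0x54E0 << 48,
        (uint64_t)0xE100 << 48, (uint64_t)0xFD20 << 48,
        (uint64_t)0xD940 << 48, (uint64_t)0xC560 << 48,
        (uint64_t)0x9180 << 48, (uint64_t)0x8DA0 << 48,
        (uint64_t)0xA9C0 << 48, (uint64_t)0xB5E0 << 48
    };

    /* Xi <- Xi*H, with Htable[i] holding the product of H and the 4-bit
     * value i; Xi is the 16-byte hash value in big-endian byte order. */
    static void gmult_4bit(uint8_t Xi[16], const u128 Htable[16])
    {
        u128 Z;
        int cnt = 15;
        size_t nlo, nhi, rem;

        nlo = Xi[15];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        for (;;) {
            /* fold in the high nibble of the current byte */
            rem  = Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4) ^ rem_4bit[rem] ^ Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;

            if (--cnt < 0)
                break;

            /* then the low nibble of the next byte */
            nlo = Xi[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            rem  = Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4) ^ rem_4bit[rem] ^ Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
        }

        for (cnt = 0; cnt < 8; cnt++) {     /* store Z back big-endian */
            Xi[cnt]     = (uint8_t)(Z.hi >> (56 - 8 * cnt));
            Xi[cnt + 8] = (uint8_t)(Z.lo >> (56 - 8 * cnt));
        }
    }

The streamed gcm_ghash_4bit entry points differ mainly in that each 16-byte input block is first XORed into Xi before this multiplication, which is what the `Xi^=inp` steps in the .Louter loops above and below do.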
+ +$output=pop and (open STDOUT,">$output" or die "can't open $output: $!"); + +if ($^O eq "hpux") { + $ADDP="addp4"; + for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } +} else { $ADDP="add"; } +for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); + $big_endian=0 if (/\-DL_ENDIAN/); } +if (!defined($big_endian)) + { $big_endian=(unpack('L',pack('N',1))==1); } + +sub loop() { +my $label=shift; +my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp + +# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e. +# in scalable manner;-) Naturally assuming data in L1 cache... +# Special note about 'dep' instruction, which is used to construct +# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128 +# bytes boundary and lower 7 bits of its address are guaranteed to +# be zero. +$code.=<<___; +$label: +{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8 + (p19) dep rem=Zlo,rem_4bitp,3,4 } +{ .mfi; (p19) xor Zhi=Zhi,Hhi + ($p17) xor xi[1]=xi[1],in[1] };; +{ .mfi; (p18) ld8 Hhi=[Hi[1]] + (p19) shrp Zlo=Zhi,Zlo,4 } +{ .mfi; (p19) ld8 rem=[rem] + (p18) and Hi[1]=mask0xf0,xi[2] };; +{ .mmi; ($p16) ld1 in[0]=[inp],-1 + (p18) xor Zlo=Zlo,Hlo + (p19) shr.u Zhi=Zhi,4 } +{ .mib; (p19) xor Hhi=Hhi,rem + (p18) add Hi[1]=Htbl,Hi[1] };; + +{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8 + (p18) dep rem=Zlo,rem_4bitp,3,4 } +{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0 + (p18) xor Zhi=Zhi,Hhi };; +{ .mfi; (p18) ld8 Hhi=[Hi[1]] + (p18) shrp Zlo=Zhi,Zlo,4 } +{ .mfi; (p18) ld8 rem=[rem] + (p17) and Hi[0]=mask0xf0,Hi[0] };; +{ .mmi; (p16) ld1 xi[0]=[Xi],-1 + (p18) xor Zlo=Zlo,Hlo + (p18) shr.u Zhi=Zhi,4 } +{ .mib; (p18) xor Hhi=Hhi,rem + (p17) add Hi[0]=Htbl,Hi[0] + br.ctop.sptk $label };; +___ +} + +$code=<<___; +.explicit +.text + +prevfs=r2; prevlc=r3; prevpr=r8; +mask0xf0=r21; +rem=r22; rem_4bitp=r23; +Xi=r24; Htbl=r25; +inp=r26; end=r27; +Hhi=r28; Hlo=r29; +Zhi=r30; Zlo=r31; + +.align 128 +.skip 16 // aligns loop body +.global gcm_gmult_4bit# +.proc gcm_gmult_4bit# +gcm_gmult_4bit: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,2,6,0,8 + $ADDP Xi=15,in0 // &Xi[15] + mov rem_4bitp=ip } +{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo + .save ar.lc,prevlc + mov prevlc=ar.lc + .save pr,prevpr + mov prevpr=pr };; + + .body + .rotr in[3],xi[3],Hi[2] + +{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15] + mov mask0xf0=0xf0 + brp.loop.imp .Loop1,.Lend1-16};; +{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14] + };; +{ .mii; shladd Hi[1]=xi[2],4,r0 + mov pr.rot=0x7<<16 + mov ar.lc=13 };; +{ .mii; and Hi[1]=mask0xf0,Hi[1] + mov ar.ec=3 + xor Zlo=Zlo,Zlo };; +{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo + add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp + xor Zhi=Zhi,Zhi };; +___ + &loop (".Loop1",1); +$code.=<<___; +.Lend1: +{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact +{ .mib; mux1 Zlo=Zlo,\@rev };; +{ .mib; mux1 Zhi=Zhi,\@rev };; +{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent + add Hhi=1,Xi };; // pipeline flush on Itanium +{ .mib; st8 [Hlo]=Zlo + mov pr=prevpr,0x1ffff };; +{ .mib; st8 [Hhi]=Zhi + mov ar.lc=prevlc + br.ret.sptk.many b0 };; +.endp gcm_gmult_4bit# +___ + +###################################################################### +# "528B" (well, "512B" actualy) streamed GHASH +# +$Xip="in0"; +$Htbl="in1"; +$inp="in2"; +$len="in3"; +$rem_8bit="loc0"; +$mask0xff="loc1"; +($sum,$rum) = $big_endian ? 
("nop.m","nop.m") : ("sum","rum"); + +sub load_htable() { + for (my $i=0;$i<8;$i++) { + $code.=<<___; +{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi + ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo +{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi + ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo +___ + $code.=shift if (($i+$#_)==7); + $code.="\t};;\n" + } +} + +$code.=<<___; +prevsp=r3; + +.align 32 +.skip 16 // aligns loop body +.global gcm_ghash_4bit# +.proc gcm_ghash_4bit# +gcm_ghash_4bit: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,4,2,0,0 + .vframe prevsp + mov prevsp=sp + mov $rem_8bit=ip };; + .body +{ .mfi; $ADDP r8=0+0,$Htbl + $ADDP r9=0+8,$Htbl } +{ .mfi; $ADDP r10=128+0,$Htbl + $ADDP r11=128+8,$Htbl };; +___ + &load_htable( + " $ADDP $Xip=15,$Xip", # &Xi[15] + " $ADDP $len=$len,$inp", # &inp[len] + " $ADDP $inp=15,$inp", # &inp[15] + " mov $mask0xff=0xff", + " add sp=-512,sp", + " andcm sp=sp,$mask0xff", # align stack frame + " add r14=0,sp", + " add r15=8,sp"); +$code.=<<___; +{ .mmi; $sum 1<<1 // go big-endian + add r8=256+0,sp + add r9=256+8,sp } +{ .mmi; add r10=256+128+0,sp + add r11=256+128+8,sp + add $len=-17,$len };; +___ +for($i=0;$i<8;$i++) { # generate first half of Hshr4[] +my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1)); +$code.=<<___; +{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo + st8 [r9]=$rhi,16 // Htable[$i].hi + shrp $rlo=$rhi,$rlo,4 }//;; +{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo + stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi + shr.u $rhi=$rhi,4 };; +{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4 + st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4 +___ +} +$code.=<<___; +{ .mmi; ld8 r16=[r8],16 // Htable[8].lo + ld8 r17=[r9],16 };; // Htable[8].hi +{ .mmi; ld8 r18=[r8],16 // Htable[9].lo + ld8 r19=[r9],16 } // Htable[9].hi +{ .mmi; rum 1<<5 // clear um.mfh + shrp r16=r17,r16,4 };; +___ +for($i=0;$i<6;$i++) { # generate second half of Hshr4[] +$code.=<<___; +{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo + ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi + shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; +{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 + st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 + shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } +___ +} +$code.=<<___; +{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; +{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 + st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 + shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } +{ .mmi; add $Htbl=256,sp // &Htable[0] + add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit + shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };; +{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4 + st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4 +___ + +$in="r15"; +@xi=("r16","r17"); +@rem=("r18","r19"); +($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25"); +($Atbl,$Btbl)=("r26","r27"); + +$code.=<<___; # (p16) +{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp-- + ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + cmp.eq p0,p6=r0,r0 };; // clear p6 +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +$code.=<<___; # (p16),(p17) +{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mii; ld1 $in=[$inp],-1 //(p16) *inp-- + dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +.align 32 +.LOOP: +{ .mmi; +(p6) st8 [$Xip]=$Zhi,13 + xor $Zlo=$Zlo,$Zlo + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # 
"rotate" registers + +$code.=<<___; # (p16),(p17),(p18) +{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo +{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 + xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo +{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi + ld1 $in=[$inp],-1 } //(p16) *inp-- +{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) + mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi + ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) +{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +for ($i=1;$i<14;$i++) { +# Above and below fragments are derived from this one by removing +# unsuitable (p??) instructions. +$code.=<<___; # (p16),(p17),(p18),(p19) +{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo + shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 +{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] + dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo +{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 + xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo + xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi +{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi + ld1 $in=[$inp],-1 //(p16) *inp-- + shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 +{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) + xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi + ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) +{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers +} + +$code.=<<___; # (p17),(p18),(p19) +{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo + shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 +{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] + dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo +{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 + xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo + xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi +{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi + shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 +{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) + xor $Zhi=$Zhi,$Ahi //(p18) 
Z.hi^=Htable[nlo].hi + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi + shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) +{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +$code.=<<___; # (p18),(p19) +{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 +{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo +{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo +{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] + xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi +{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi + shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 +{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4 + xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi +{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi + shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4) +{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48 +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +$code.=<<___; # (p19) +{ .mmi; cmp.ltu p6,p0=$inp,$len + add $inp=32,$inp + shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4 +{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo + add $Xip=9,$Xip };; // &Xi.lo +{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] +(p6) ld1 $in=[$inp],-1 //[p16] *inp-- +(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14] +{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi +(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15] +{ .mmi; st8 [$Xip]=$Zlo,-8 +(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i] + shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48 +{ .mmi; +(p6) ld1 $in=[$inp],-1 //[p16] *inp-- + xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 +(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo +{ .mib; +(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0 +(p6) br.cond.dptk.many .LOOP };; + +{ .mib; st8 [$Xip]=$Zhi };; +{ .mib; $rum 1<<1 // return to little-endian + .restore sp + mov sp=prevsp + br.ret.sptk.many b0 };; +.endp gcm_ghash_4bit# +___ +$code.=<<___; +.align 128 +.type rem_4bit#,\@object +rem_4bit: + data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 + data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 + data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 + data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 +.size rem_4bit#,128 +.type rem_8bit#,\@object +rem_8bit: + data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E + data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E + data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E + data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E + data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E + data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E + data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E + data1 0x2A,0x70, 0x2B,0xB2, 
0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E + data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE + data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE + data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE + data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE + data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E + data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E + data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE + data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE + data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E + data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E + data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E + data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E + data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E + data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E + data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E + data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E + data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE + data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE + data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE + data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE + data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E + data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E + data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE + data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE +.size rem_8bit#,512 +stringz "GHASH for IA64, CRYPTOGAMS by " +___ + +$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-parisc.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-parisc.pl new file mode 100644 index 0000000..1d62545 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/ghash-parisc.pl @@ -0,0 +1,738 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. 
+# ==================================================================== +# +# April 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC +# it processes one byte in 19.6 cycles, which is more than twice as +# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for +# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per +# processed byte. This is ~2.2x faster than 64-bit code generated by +# vendor compiler (which used to be very hard to beat:-). +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; + $NREGS =6; +} else { + $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; + $NREGS =11; +} + +$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker + # [+ argument transfer] + +################# volatile registers +$Xi="%r26"; # argument block +$Htbl="%r25"; +$inp="%r24"; +$len="%r23"; +$Hhh=$Htbl; # variables +$Hll="%r22"; +$Zhh="%r21"; +$Zll="%r20"; +$cnt="%r19"; +$rem_4bit="%r28"; +$rem="%r29"; +$mask0xf0="%r31"; + +################# preserved registers +$Thh="%r1"; +$Tll="%r2"; +$nlo="%r3"; +$nhi="%r4"; +$byte="%r5"; +if ($SIZE_T==4) { + $Zhl="%r6"; + $Zlh="%r7"; + $Hhl="%r8"; + $Hlh="%r9"; + $Thl="%r10"; + $Tlh="%r11"; +} +$rem2="%r6"; # used in PA-RISC 2.0 code + +$code.=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 64 +gcm_gmult_4bit + .PROC + .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) +___ +$code.=<<___ if ($SIZE_T==4); + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) +___ +$code.=<<___; + blr %r0,$rem_4bit + ldi 3,$rem +L\$pic_gmult + andcm $rem_4bit,$rem,$rem_4bit + addl $inp,$len,$len + ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit + ldi 0xf0,$mask0xf0 +___ +$code.=<<___ if ($SIZE_T==4); + ldi 31,$rem + mtctl $rem,%cr11 + extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 + b L\$parisc1_gmult + nop +___ + +$code.=<<___; + ldb 15($Xi),$nlo + ldo 8($Htbl),$Hll + + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + ldd $nlo($Hll),$Zll + ldd $nlo($Hhh),$Zhh + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldb 14($Xi),$nlo + + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + b L\$oop_gmult_pa2 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_gmult_pa2 + xor $rem,$Zhh,$Zhh ; moved here to work around gas bug + depd,z $Zll,60,4,$rem + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem + 
ldbx $cnt($Xi),$nlo + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + ldd $rem($rem_4bit),$rem + + xor $Tll,$Zll,$Zll + addib,uv -1,$cnt,L\$oop_gmult_pa2 + xor $Thh,$Zhh,$Zhh + + xor $rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + std $Zll,8($Xi) + std $Zhh,0($Xi) +___ + +$code.=<<___ if ($SIZE_T==4); + b L\$done_gmult + nop + +L\$parisc1_gmult + ldb 15($Xi),$nlo + ldo 12($Htbl),$Hll + ldo 8($Htbl),$Hlh + ldo 4($Htbl),$Hhl + + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + ldwx $nlo($Hll),$Zll + ldwx $nlo($Hlh),$Zlh + ldwx $nlo($Hhl),$Zhl + ldwx $nlo($Hhh),$Zhh + zdep $Zll,28,4,$rem + ldb 14($Xi),$nlo + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhl,$Zlh,4,$Zlh + ldwx $nhi($Hlh),$Tlh + shrpw $Zhh,$Zhl,4,$Zhl + ldwx $nhi($Hhl),$Thl + extru $Zhh,27,28,$Zhh + ldwx $nhi($Hhh),$Thh + xor $rem,$Zhh,$Zhh + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + xor $Tll,$Zll,$Zll + ldwx $nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $Thl,$Zhl,$Zhl + b L\$oop_gmult_pa1 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_gmult_pa1 + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh + shrpw $Zhl,$Zlh,4,$Zlh + ldbx $cnt($Xi),$nlo + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $rem,$Zhh,$Zhh + zdep $Zll,28,4,$rem + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + shrpw $Zlh,$Zll,4,$Zll + ldwx $rem($rem_4bit),$rem + shrpw $Zhl,$Zlh,4,$Zlh + shrpw $Zhh,$Zhl,4,$Zhl + and $mask0xf0,$nlo,$nhi + extru $Zhh,27,28,$Zhh + zdep $nlo,27,4,$nlo + xor $Tll,$Zll,$Zll + ldwx $nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $rem,$Zhh,$Zhh + addib,uv -1,$cnt,L\$oop_gmult_pa1 + xor $Thl,$Zhl,$Zhl + + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh + shrpw $Zhl,$Zlh,4,$Zlh + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $rem,$Zhh,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + zdep $Zll,28,4,$rem + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + shrpw $Zhl,$Zlh,4,$Zlh + shrpw $Zhh,$Zhl,4,$Zhl + extru $Zhh,27,28,$Zhh + xor $Tll,$Zll,$Zll + xor $Tlh,$Zlh,$Zlh + xor $rem,$Zhh,$Zhh + stw $Zll,12($Xi) + xor $Thl,$Zhl,$Zhl + stw $Zlh,8($Xi) + xor $Thh,$Zhh,$Zhh + stw $Zhl,4($Xi) + stw $Zhh,0($Xi) +___ +$code.=<<___; +L\$done_gmult + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 +___ +$code.=<<___ if ($SIZE_T==4); + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 +___ +$code.=<<___; 
+ bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR + .ALIGN 64 +gcm_ghash_4bit + .PROC + .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) +___ +$code.=<<___ if ($SIZE_T==4); + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) +___ +$code.=<<___; + blr %r0,$rem_4bit + ldi 3,$rem +L\$pic_ghash + andcm $rem_4bit,$rem,$rem_4bit + addl $inp,$len,$len + ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit + ldi 0xf0,$mask0xf0 +___ +$code.=<<___ if ($SIZE_T==4); + ldi 31,$rem + mtctl $rem,%cr11 + extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 + b L\$parisc1_ghash + nop +___ + +$code.=<<___; + ldb 15($Xi),$nlo + ldo 8($Htbl),$Hll + +L\$outer_ghash_pa2 + ldb 15($inp),$nhi + xor $nhi,$nlo,$nlo + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + ldd $nlo($Hll),$Zll + ldd $nlo($Hhh),$Zhh + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldb 14($Xi),$nlo + ldb 14($inp),$byte + + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + xor $byte,$nlo,$nlo + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + b L\$oop_ghash_pa2 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_ghash_pa2 + xor $rem,$Zhh,$Zhh ; moved here to work around gas bug + depd,z $Zll,60,4,$rem2 + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldbx $cnt($Xi),$nlo + ldbx $cnt($inp),$byte + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + ldd $rem2($rem_4bit),$rem2 + + xor $rem2,$Zhh,$Zhh + xor $byte,$nlo,$nlo + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + extrd,u $Zhh,59,60,$Zhh + xor $Tll,$Zll,$Zll + + ldd $rem($rem_4bit),$rem + addib,uv -1,$cnt,L\$oop_ghash_pa2 + xor $Thh,$Zhh,$Zhh + + xor $rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem2 + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + ldd $rem2($rem_4bit),$rem2 + + xor $rem2,$Zhh,$Zhh + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + extrd,u $Zhh,59,60,$Zhh + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + std $Zll,8($Xi) + ldo 16($inp),$inp + std $Zhh,0($Xi) + cmpb,*<> $inp,$len,L\$outer_ghash_pa2 + copy $Zll,$nlo +___ + +$code.=<<___ if ($SIZE_T==4); + b L\$done_ghash + nop + +L\$parisc1_ghash + ldb 15($Xi),$nlo + ldo 12($Htbl),$Hll + ldo 8($Htbl),$Hlh + ldo 4($Htbl),$Hhl + +L\$outer_ghash_pa1 + ldb 15($inp),$byte + xor $byte,$nlo,$nlo + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + ldwx $nlo($Hll),$Zll + ldwx $nlo($Hlh),$Zlh + ldwx $nlo($Hhl),$Zhl + ldwx $nlo($Hhh),$Zhh + zdep $Zll,28,4,$rem + ldb 14($Xi),$nlo + ldb 14($inp),$byte + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhl,$Zlh,4,$Zlh + ldwx $nhi($Hlh),$Tlh + shrpw $Zhh,$Zhl,4,$Zhl + ldwx $nhi($Hhl),$Thl + extru $Zhh,27,28,$Zhh + ldwx $nhi($Hhh),$Thh + xor $byte,$nlo,$nlo + xor $rem,$Zhh,$Zhh + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + xor $Tll,$Zll,$Zll + ldwx 
$nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $Thl,$Zhl,$Zhl + b L\$oop_ghash_pa1 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_ghash_pa1 + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh + shrpw $Zhl,$Zlh,4,$Zlh + ldbx $cnt($Xi),$nlo + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + ldbx $cnt($inp),$byte + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $rem,$Zhh,$Zhh + zdep $Zll,28,4,$rem + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + shrpw $Zlh,$Zll,4,$Zll + ldwx $rem($rem_4bit),$rem + shrpw $Zhl,$Zlh,4,$Zlh + xor $byte,$nlo,$nlo + shrpw $Zhh,$Zhl,4,$Zhl + and $mask0xf0,$nlo,$nhi + extru $Zhh,27,28,$Zhh + zdep $nlo,27,4,$nlo + xor $Tll,$Zll,$Zll + ldwx $nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $rem,$Zhh,$Zhh + addib,uv -1,$cnt,L\$oop_ghash_pa1 + xor $Thl,$Zhl,$Zhl + + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh + shrpw $Zhl,$Zlh,4,$Zlh + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $rem,$Zhh,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + zdep $Zll,28,4,$rem + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + shrpw $Zhl,$Zlh,4,$Zlh + shrpw $Zhh,$Zhl,4,$Zhl + extru $Zhh,27,28,$Zhh + xor $Tll,$Zll,$Zll + xor $Tlh,$Zlh,$Zlh + xor $rem,$Zhh,$Zhh + stw $Zll,12($Xi) + xor $Thl,$Zhl,$Zhl + stw $Zlh,8($Xi) + xor $Thh,$Zhh,$Zhh + stw $Zhl,4($Xi) + ldo 16($inp),$inp + stw $Zhh,0($Xi) + comb,<> $inp,$len,L\$outer_ghash_pa1 + copy $Zll,$nlo +___ +$code.=<<___; +L\$done_ghash + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 +___ +$code.=<<___ if ($SIZE_T==4); + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 +___ +$code.=<<___; + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .ALIGN 64 +L\$rem_4bit + .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 + .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 + .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 + .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 + .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by " + .ALIGN 64 +___ + +# Explicitly encode PA-RISC 2.0 instructions used in this module, so +# that it can be compiled with .LEVEL 1.0. It should be noted that I +# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 +# directive... 
+ +my $ldd = sub { + my ($mod,$args) = @_; + my $orig = "ldd$mod\t$args"; + + if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 + { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 + { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; + $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset + $opcode|=(1<<5) if ($mod =~ /^,m/); + $opcode|=(1<<13) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $std = sub { + my ($mod,$args) = @_; + my $orig = "std$mod\t$args"; + + if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices + { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $extrd = sub { + my ($mod,$args) = @_; + my $orig = "extrd$mod\t$args"; + + # I only have ",u" completer, it's implicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 + { my $opcode=(0x36<<26)|($1<<21)|($4<<16); + my $len=32-$3; + $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 + { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); + my $len=32-$2; + $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len + $opcode |= (1<<13) if ($mod =~ /,\**=/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $shrpd = sub { + my ($mod,$args) = @_; + my $orig = "shrpd$mod\t$args"; + + if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 + { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; + my $cpos=63-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 + { sprintf "\t.WORD\t0x%08x\t; %s", + (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; + } + else { "\t".$orig; } +}; + +my $depd = sub { + my ($mod,$args) = @_; + my $orig = "depd$mod\t$args"; + + # I only have ",z" completer, it's impicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16 + { my $opcode=(0x3c<<26)|($4<<21)|($1<<16); + my $cpos=63-$2; + my $len=32-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +sub assemble { + my ($mnemonic,$mod,$args)=@_; + my $opcode = eval("\$$mnemonic"); + + ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + if ($SIZE_T==4) { + s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e; + s/cmpb,\*/comb,/; + s/,\*/,/; + } + s/\bbv\b/bve/ if ($SIZE_T==8); + print $_,"\n"; +} + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-s390x.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-s390x.pl new file mode 100644 index 0000000..6e628d8 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/ghash-s390x.pl @@ -0,0 +1,258 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# September 2010. +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Performance +# was measured to be ~18 cycles per processed byte on z10, which is +# almost 40% better than gcc-generated code. It should be noted that +# 18 cycles is worse result than expected: loop is scheduled for 12 +# and the result should be close to 12. In the lack of instruction- +# level profiling data it's impossible to tell why... + +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 2.8x better than 32-bit code generated by gcc 4.3. + +# March 2011. +# +# Support for hardware KIMD-GHASH is verified to produce correct +# result and therefore is engaged. On z196 it was measured to process +# 8KB buffer ~7 faster than software implementation. It's not as +# impressive for smaller buffer sizes and for smallest 16-bytes buffer +# it's actually almost 2 times slower. Which is the reason why +# KIMD-GHASH is not used in gcm_gmult_4bit. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$softonly=0; + +$Zhi="%r0"; +$Zlo="%r1"; + +$Xi="%r2"; # argument block +$Htbl="%r3"; +$inp="%r4"; +$len="%r5"; + +$rem0="%r6"; # variables +$rem1="%r7"; +$nlo="%r8"; +$nhi="%r9"; +$xi="%r10"; +$cnt="%r11"; +$tmp="%r12"; +$x78="%r13"; +$rem_4bit="%r14"; + +$sp="%r15"; + +$code.=<<___; +.text + +.globl gcm_gmult_4bit +.align 32 +gcm_gmult_4bit: +___ +$code.=<<___ if(!$softonly && 0); # hardware is slow for single block... 
+ larl %r1,OPENSSL_s390xcap_P + lghi %r0,0 + lg %r1,24(%r1) # load second word of kimd capabilities vector + tmhh %r1,0x4000 # check for function 65 + jz .Lsoft_gmult + stg %r0,16($sp) # arrange 16 bytes of zero input + stg %r0,24($sp) + lghi %r0,65 # function 65 + la %r1,0($Xi) # H lies right after Xi in gcm128_context + la $inp,16($sp) + lghi $len,16 + .long 0xb93e0004 # kimd %r0,$inp + brc 1,.-4 # pay attention to "partial completion" + br %r14 +.align 32 +.Lsoft_gmult: +___ +$code.=<<___; + stm${g} %r6,%r14,6*$SIZE_T($sp) + + aghi $Xi,-1 + lghi $len,1 + lghi $x78,`0xf<<3` + larl $rem_4bit,rem_4bit + + lg $Zlo,8+1($Xi) # Xi + j .Lgmult_shortcut +.type gcm_gmult_4bit,\@function +.size gcm_gmult_4bit,(.-gcm_gmult_4bit) + +.globl gcm_ghash_4bit +.align 32 +gcm_ghash_4bit: +___ +$code.=<<___ if(!$softonly); + larl %r1,OPENSSL_s390xcap_P + lg %r0,24(%r1) # load second word of kimd capabilities vector + tmhh %r0,0x4000 # check for function 65 + jz .Lsoft_ghash + lghi %r0,65 # function 65 + la %r1,0($Xi) # H lies right after Xi in gcm128_context + .long 0xb93e0004 # kimd %r0,$inp + brc 1,.-4 # pay attention to "partial completion" + br %r14 +.align 32 +.Lsoft_ghash: +___ +$code.=<<___ if ($flavour =~ /3[12]/); + llgfr $len,$len +___ +$code.=<<___; + stm${g} %r6,%r14,6*$SIZE_T($sp) + + aghi $Xi,-1 + srlg $len,$len,4 + lghi $x78,`0xf<<3` + larl $rem_4bit,rem_4bit + + lg $Zlo,8+1($Xi) # Xi + lg $Zhi,0+1($Xi) + lghi $tmp,0 +.Louter: + xg $Zhi,0($inp) # Xi ^= inp + xg $Zlo,8($inp) + xgr $Zhi,$tmp + stg $Zlo,8+1($Xi) + stg $Zhi,0+1($Xi) + +.Lgmult_shortcut: + lghi $tmp,0xf0 + sllg $nlo,$Zlo,4 + srlg $xi,$Zlo,8 # extract second byte + ngr $nlo,$tmp + lgr $nhi,$Zlo + lghi $cnt,14 + ngr $nhi,$tmp + + lg $Zlo,8($nlo,$Htbl) + lg $Zhi,0($nlo,$Htbl) + + sllg $nlo,$xi,4 + sllg $rem0,$Zlo,3 + ngr $nlo,$tmp + ngr $rem0,$x78 + ngr $xi,$tmp + + sllg $tmp,$Zhi,60 + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nhi,$Htbl) + xg $Zhi,0($nhi,$Htbl) + lgr $nhi,$xi + sllg $rem1,$Zlo,3 + xgr $Zlo,$tmp + ngr $rem1,$x78 + sllg $tmp,$Zhi,60 + j .Lghash_inner +.align 16 +.Lghash_inner: + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nlo,$Htbl) + llgc $xi,0($cnt,$Xi) + xg $Zhi,0($nlo,$Htbl) + sllg $nlo,$xi,4 + xg $Zhi,0($rem0,$rem_4bit) + nill $nlo,0xf0 + sllg $rem0,$Zlo,3 + xgr $Zlo,$tmp + ngr $rem0,$x78 + nill $xi,0xf0 + + sllg $tmp,$Zhi,60 + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nhi,$Htbl) + xg $Zhi,0($nhi,$Htbl) + lgr $nhi,$xi + xg $Zhi,0($rem1,$rem_4bit) + sllg $rem1,$Zlo,3 + xgr $Zlo,$tmp + ngr $rem1,$x78 + sllg $tmp,$Zhi,60 + brct $cnt,.Lghash_inner + + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nlo,$Htbl) + xg $Zhi,0($nlo,$Htbl) + sllg $xi,$Zlo,3 + xg $Zhi,0($rem0,$rem_4bit) + xgr $Zlo,$tmp + ngr $xi,$x78 + + sllg $tmp,$Zhi,60 + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nhi,$Htbl) + xg $Zhi,0($nhi,$Htbl) + xgr $Zlo,$tmp + xg $Zhi,0($rem1,$rem_4bit) + + lg $tmp,0($xi,$rem_4bit) + la $inp,16($inp) + sllg $tmp,$tmp,4 # correct last rem_4bit[rem] + brctg $len,.Louter + + xgr $Zhi,$tmp + stg $Zlo,8+1($Xi) + stg $Zhi,0+1($Xi) + lm${g} %r6,%r14,6*$SIZE_T($sp) + br %r14 +.type gcm_ghash_4bit,\@function +.size gcm_ghash_4bit,(.-gcm_ghash_4bit) + +.align 64 +rem_4bit: + .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0 + .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0 + .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0 + .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0 +.type rem_4bit,\@object +.size rem_4bit,(.-rem_4bit) +.string 
"GHASH for s390x, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-sparcv9.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-sparcv9.pl new file mode 100644 index 0000000..c4eb3b1 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/ghash-sparcv9.pl @@ -0,0 +1,581 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# March 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Performance +# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU +# and are expressed in cycles per processed byte, less is better: +# +# gcc 3.3.x cc 5.2 this assembler +# +# 32-bit build 81.4 43.3 12.6 (+546%/+244%) +# 64-bit build 20.2 21.2 12.6 (+60%/+68%) +# +# Here is data collected on UltraSPARC T1 system running Linux: +# +# gcc 4.4.1 this assembler +# +# 32-bit build 566 50 (+1000%) +# 64-bit build 56 50 (+12%) +# +# I don't quite understand why difference between 32-bit and 64-bit +# compiler-generated code is so big. Compilers *were* instructed to +# generate code for UltraSPARC and should have used 64-bit registers +# for Z vector (see C code) even in 32-bit build... Oh well, it only +# means more impressive improvement coefficients for this assembler +# module;-) Loops are aggressively modulo-scheduled in respect to +# references to input data and Z.hi updates to achieve 12 cycles +# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6 +# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1. +# +# October 2012 +# +# Add VIS3 lookup-table-free implementation using polynomial +# multiplication xmulx[hi] and extended addition addxc[cc] +# instructions. 4.52/7.63x improvement on T3/T4 or in absolute +# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark +# saturates at ~15.5x single-process result on 8-core processor, +# or ~20.5GBps per 2.85GHz socket. 
+ +$output=pop; +open STDOUT,">$output"; + +$frame="STACK_FRAME"; +$bias="STACK_BIAS"; + +$Zhi="%o0"; # 64-bit values +$Zlo="%o1"; +$Thi="%o2"; +$Tlo="%o3"; +$rem="%o4"; +$tmp="%o5"; + +$nhi="%l0"; # small values and pointers +$nlo="%l1"; +$xi0="%l2"; +$xi1="%l3"; +$rem_4bit="%l4"; +$remi="%l5"; +$Htblo="%l6"; +$cnt="%l7"; + +$Xi="%i0"; # input argument block +$Htbl="%i1"; +$inp="%i2"; +$len="%i3"; + +$code.=<<___; +#include "sparc_arch.h" + +#ifdef __arch64__ +.register %g2,#scratch +.register %g3,#scratch +#endif + +.section ".text",#alloc,#execinstr + +.align 64 +rem_4bit: + .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 + .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 + .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 + .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 +.type rem_4bit,#object +.size rem_4bit,(.-rem_4bit) + +.globl gcm_ghash_4bit +.align 32 +gcm_ghash_4bit: + save %sp,-$frame,%sp + ldub [$inp+15],$nlo + ldub [$Xi+15],$xi0 + ldub [$Xi+14],$xi1 + add $len,$inp,$len + add $Htbl,8,$Htblo + +1: call .+8 + add %o7,rem_4bit-1b,$rem_4bit + +.Louter: + xor $xi0,$nlo,$nlo + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + sll $nlo,4,$nlo + ldx [$Htblo+$nlo],$Zlo + ldx [$Htbl+$nlo],$Zhi + + ldub [$inp+14],$nlo + + ldx [$Htblo+$nhi],$Tlo + and $Zlo,0xf,$remi + ldx [$Htbl+$nhi],$Thi + sll $remi,3,$remi + ldx [$rem_4bit+$remi],$rem + srlx $Zlo,4,$Zlo + mov 13,$cnt + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + + xor $xi1,$nlo,$nlo + and $Zlo,0xf,$remi + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + ba .Lghash_inner + sll $nlo,4,$nlo +.align 32 +.Lghash_inner: + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + ldub [$inp+$cnt],$nlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + ldub [$Xi+$cnt],$xi1 + xor $Thi,$Zhi,$Zhi + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $xi1,$nlo,$nlo + srlx $Zhi,4,$Zhi + and $nlo,0xf0,$nhi + addcc $cnt,-1,$cnt + xor $Zlo,$tmp,$Zlo + and $nlo,0x0f,$nlo + xor $Tlo,$Zlo,$Zlo + sll $nlo,4,$nlo + blu .Lghash_inner + and $Zlo,0xf,$remi + + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + + add $inp,16,$inp + cmp $inp,$len + be,pn SIZE_T_CC,.Ldone + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + ldub [$inp+15],$nlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + stx $Zlo,[$Xi+8] + xor $rem,$Zhi,$Zhi + stx $Zhi,[$Xi] + srl $Zlo,8,$xi1 + and $Zlo,0xff,$xi0 + ba .Louter + and $xi1,0xff,$xi1 +.align 32 +.Ldone: + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + stx $Zlo,[$Xi+8] + xor $rem,$Zhi,$Zhi + stx $Zhi,[$Xi] + + ret + restore +.type gcm_ghash_4bit,#function +.size gcm_ghash_4bit,(.-gcm_ghash_4bit) +___ + +undef $inp; 
+undef $len; + +$code.=<<___; +.globl gcm_gmult_4bit +.align 32 +gcm_gmult_4bit: + save %sp,-$frame,%sp + ldub [$Xi+15],$nlo + add $Htbl,8,$Htblo + +1: call .+8 + add %o7,rem_4bit-1b,$rem_4bit + + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + sll $nlo,4,$nlo + ldx [$Htblo+$nlo],$Zlo + ldx [$Htbl+$nlo],$Zhi + + ldub [$Xi+14],$nlo + + ldx [$Htblo+$nhi],$Tlo + and $Zlo,0xf,$remi + ldx [$Htbl+$nhi],$Thi + sll $remi,3,$remi + ldx [$rem_4bit+$remi],$rem + srlx $Zlo,4,$Zlo + mov 13,$cnt + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + + and $Zlo,0xf,$remi + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + ba .Lgmult_inner + sll $nlo,4,$nlo +.align 32 +.Lgmult_inner: + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + ldub [$Xi+$cnt],$nlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + srlx $Zhi,4,$Zhi + and $nlo,0xf0,$nhi + addcc $cnt,-1,$cnt + xor $Zlo,$tmp,$Zlo + and $nlo,0x0f,$nlo + xor $Tlo,$Zlo,$Zlo + sll $nlo,4,$nlo + blu .Lgmult_inner + and $Zlo,0xf,$remi + + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + stx $Zlo,[$Xi+8] + xor $rem,$Zhi,$Zhi + stx $Zhi,[$Xi] + + ret + restore +.type gcm_gmult_4bit,#function +.size gcm_gmult_4bit,(.-gcm_gmult_4bit) +___ + +{{{ +# Straightforward 128x128-bit multiplication using Karatsuba algorithm +# followed by pair of 64-bit reductions [with a shortcut in first one, +# which allowed to break dependency between reductions and remove one +# multiplication from critical path]. While it might be suboptimal +# with regard to sheer number of multiplications, other methods [such +# as aggregate reduction] would require more 64-bit registers, which +# we don't have in 32-bit application context. + +($Xip,$Htable,$inp,$len)=map("%i$_",(0..3)); + +($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)= + (map("%o$_",(0..5,7)),map("%g$_",(1..5))); + +($shl,$shr)=map("%l$_",(0..7)); + +# For details regarding "twisted H" see ghash-x86.pl. +$code.=<<___; +.globl gcm_init_vis3 +.align 32 +gcm_init_vis3: + save %sp,-$frame,%sp + + ldx [%i1+0],$Hhi + ldx [%i1+8],$Hlo + mov 0xE1,$Xhi + mov 1,$Xlo + sllx $Xhi,57,$Xhi + srax $Hhi,63,$C0 ! broadcast carry + addcc $Hlo,$Hlo,$Hlo ! H<<=1 + addxc $Hhi,$Hhi,$Hhi + and $C0,$Xlo,$Xlo + and $C0,$Xhi,$Xhi + xor $Xlo,$Hlo,$Hlo + xor $Xhi,$Hhi,$Hhi + stx $Hlo,[%i0+8] ! save twisted H + stx $Hhi,[%i0+0] + + sethi %hi(0xA0406080),$V + sethi %hi(0x20C0E000),%l0 + or $V,%lo(0xA0406080),$V + or %l0,%lo(0x20C0E000),%l0 + sllx $V,32,$V + or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000 + stx $V,[%i0+16] + + ret + restore +.type gcm_init_vis3,#function +.size gcm_init_vis3,.-gcm_init_vis3 + +.globl gcm_gmult_vis3 +.align 32 +gcm_gmult_vis3: + save %sp,-$frame,%sp + + ldx [$Xip+8],$Xlo ! 
load Xi + ldx [$Xip+0],$Xhi + ldx [$Htable+8],$Hlo ! load twisted H + ldx [$Htable+0],$Hhi + + mov 0xE1,%l7 + sllx %l7,57,$xE1 ! 57 is not a typo + ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000 + + xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing + xmulx $Xlo,$Hlo,$C0 + xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing + xmulx $C2,$Hhl,$C1 + xmulxhi $Xlo,$Hlo,$Xlo + xmulxhi $C2,$Hhl,$C2 + xmulxhi $Xhi,$Hhi,$C3 + xmulx $Xhi,$Hhi,$Xhi + + sll $C0,3,$sqr + srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)] + xor $C0,$sqr,$sqr + sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f] + + xor $C0,$C1,$C1 ! Karatsuba post-processing + xor $Xlo,$C2,$C2 + xor $sqr,$Xlo,$Xlo ! real destination is $C1 + xor $C3,$C2,$C2 + xor $Xlo,$C1,$C1 + xor $Xhi,$C2,$C2 + xor $Xhi,$C1,$C1 + + xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56 + xor $C0,$C2,$C2 + xmulx $C1,$xE1,$C0 + xor $C1,$C3,$C3 + xmulxhi $C1,$xE1,$C1 + + xor $Xlo,$C2,$C2 + xor $C0,$C2,$C2 + xor $C1,$C3,$C3 + + stx $C2,[$Xip+8] ! save Xi + stx $C3,[$Xip+0] + + ret + restore +.type gcm_gmult_vis3,#function +.size gcm_gmult_vis3,.-gcm_gmult_vis3 + +.globl gcm_ghash_vis3 +.align 32 +gcm_ghash_vis3: + save %sp,-$frame,%sp + nop + srln $len,0,$len ! needed on v8+, "nop" on v9 + + ldx [$Xip+8],$C2 ! load Xi + ldx [$Xip+0],$C3 + ldx [$Htable+8],$Hlo ! load twisted H + ldx [$Htable+0],$Hhi + + mov 0xE1,%l7 + sllx %l7,57,$xE1 ! 57 is not a typo + ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000 + + and $inp,7,$shl + andn $inp,7,$inp + sll $shl,3,$shl + prefetch [$inp+63], 20 + sub %g0,$shl,$shr + + xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing +.Loop: + ldx [$inp+8],$Xlo + brz,pt $shl,1f + ldx [$inp+0],$Xhi + + ldx [$inp+16],$C1 ! align data + srlx $Xlo,$shr,$C0 + sllx $Xlo,$shl,$Xlo + sllx $Xhi,$shl,$Xhi + srlx $C1,$shr,$C1 + or $C0,$Xhi,$Xhi + or $C1,$Xlo,$Xlo +1: + add $inp,16,$inp + sub $len,16,$len + xor $C2,$Xlo,$Xlo + xor $C3,$Xhi,$Xhi + prefetch [$inp+63], 20 + + xmulx $Xlo,$Hlo,$C0 + xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing + xmulx $C2,$Hhl,$C1 + xmulxhi $Xlo,$Hlo,$Xlo + xmulxhi $C2,$Hhl,$C2 + xmulxhi $Xhi,$Hhi,$C3 + xmulx $Xhi,$Hhi,$Xhi + + sll $C0,3,$sqr + srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)] + xor $C0,$sqr,$sqr + sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f] + + xor $C0,$C1,$C1 ! Karatsuba post-processing + xor $Xlo,$C2,$C2 + xor $sqr,$Xlo,$Xlo ! real destination is $C1 + xor $C3,$C2,$C2 + xor $Xlo,$C1,$C1 + xor $Xhi,$C2,$C2 + xor $Xhi,$C1,$C1 + + xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56 + xor $C0,$C2,$C2 + xmulx $C1,$xE1,$C0 + xor $C1,$C3,$C3 + xmulxhi $C1,$xE1,$C1 + + xor $Xlo,$C2,$C2 + xor $C0,$C2,$C2 + brnz,pt $len,.Loop + xor $C1,$C3,$C3 + + stx $C2,[$Xip+8] ! save Xi + stx $C3,[$Xip+0] + + ret + restore +.type gcm_ghash_vis3,#function +.size gcm_ghash_vis3,.-gcm_ghash_vis3 +___ +}}} +$code.=<<___; +.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by " +.align 4 +___ + + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. 
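+#
+# For example, hand-tracing the arithmetic of unvis3() below (an
+# illustration of what the encoder emits, not an authoritative SPARC
+# opcode reference): "xmulx %o0,%o1,%o2" gives rs1=8, rs2=9, rd=10
+# after the register bias and opf=0x115, so it is encoded as
+# ".word 0x95b222a9" = 0x81b00000|10<<25|8<<14|0x115<<5|9.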
+sub unvis3 { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my ($ref,$opf); +my %visopf = ( "addxc" => 0x011, + "addxccc" => 0x013, + "xmulx" => 0x115, + "xmulxhi" => 0x116 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%([goli])([0-9])/); + $_=$bias{$1}+$2; + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unvis3($1,$2,$3,$4) + /ge; + + print $_,"\n"; +} + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-x86.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-x86.pl new file mode 100644 index 0000000..cd84582 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/ghash-x86.pl @@ -0,0 +1,1405 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March, May, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two +# code paths: vanilla x86 and vanilla SSE. Former will be executed on +# 486 and Pentium, latter on all others. SSE GHASH features so called +# "528B" variant of "4-bit" method utilizing additional 256+16 bytes +# of per-key storage [+512 bytes shared table]. Performance results +# are for streamed GHASH subroutine and are expressed in cycles per +# processed byte, less is better: +# +# gcc 2.95.3(*) SSE assembler x86 assembler +# +# Pentium 105/111(**) - 50 +# PIII 68 /75 12.2 24 +# P4 125/125 17.8 84(***) +# Opteron 66 /70 10.1 30 +# Core2 54 /67 8.4 18 +# Atom 105/105 16.8 53 +# VIA Nano 69 /71 13.0 27 +# +# (*) gcc 3.4.x was observed to generate few percent slower code, +# which is one of reasons why 2.95.3 results were chosen, +# another reason is lack of 3.4.x results for older CPUs; +# comparison with SSE results is not completely fair, because C +# results are for vanilla "256B" implementation, while +# assembler results are for "528B";-) +# (**) second number is result for code compiled with -fPIC flag, +# which is actually more relevant, because assembler code is +# position-independent; +# (***) see comment in non-MMX routine for further details; +# +# To summarize, it's >2-5 times faster than gcc-generated code. To +# anchor it to something else SHA1 assembler processes one byte in +# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE +# in particular, see comment at the end of the file... + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.10 cycles per processed byte. +# The question is how close is it to theoretical limit? 
The pclmulqdq
+# instruction latency appears to be 14 cycles and there can't be more
+# than 2 of them executing at any given time. This means that single
+# Karatsuba multiplication would take 28 cycles *plus* few cycles for
+# pre- and post-processing. Then multiplication has to be followed by
+# modulo-reduction. Given that aggregated reduction method [see
+# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
+# white paper by Intel] allows you to perform reduction only once in
+# a while, we can assume that asymptotic performance can be estimated
+# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
+# and Naggr is the aggregation factor.
+#
+# Before we proceed to this implementation let's have closer look at
+# the best-performing code suggested by Intel in their white paper.
+# By tracing inter-register dependencies Tmod is estimated as ~19
+# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
+# processed byte. As implied, this is quite optimistic estimate,
+# because it does not account for Karatsuba pre- and post-processing,
+# which for a single multiplication is ~5 cycles. Unfortunately Intel
+# does not provide performance data for GHASH alone. But benchmarking
+# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
+# alone resulted in 2.46 cycles per byte out of 16KB buffer. Note that
+# the result accounts even for pre-computing of degrees of the hash
+# key H, but its portion is negligible at 16KB buffer size.
+#
+# Moving on to the implementation in question. Tmod is estimated as
+# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
+# 2.16. How is it possible that measured performance is better than
+# optimistic theoretical estimate? There is one thing Intel failed
+# to recognize. By serializing GHASH with CTR in same subroutine
+# former's performance is really limited to above (Tmul + Tmod/Naggr)
+# equation. But if GHASH procedure is detached, the modulo-reduction
+# can be interleaved with Naggr-1 multiplications at instruction level
+# and under ideal conditions even disappear from the equation. So that
+# optimistic theoretical estimate for this implementation is ...
+# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
+# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
+# where Tproc is time required for Karatsuba pre- and post-processing,
+# is more realistic estimate. In this case it gives ... 1.91 cycles.
+# Or in other words, depending on how well we can interleave reduction
+# and one of the two multiplications the performance should be between
+# 1.91 and 2.16. As already mentioned, this implementation processes
+# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
+# - in 2.02. x86_64 performance is better, because larger register
+# bank allows to interleave reduction and multiplication better.
+#
+# Does it make sense to increase Naggr? To start with it's virtually
+# impossible in 32-bit mode, because of limited register bank
+# capacity. Otherwise improvement has to be weighed against slower
+# setup, as well as code size and complexity increase. As even
+# optimistic estimate doesn't promise 30% performance improvement,
+# there are currently no plans to increase Naggr.
+#
+# Special thanks to David Woodhouse for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
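+#
+# As a quick arithmetic cross-check of the estimates above [illustrative
+# stand-alone one-liners, not part of this script or of the build]:
+#
+#	perl -e 'printf("%.2f\n",(28+19/4)/16)'	# ~2.05, Intel: Tmod~19, Naggr=4
+#	perl -e 'printf("%.2f\n",(28+13/2)/16)'	# ~2.16, here: Tmod~13, Naggr=2
+#	perl -e 'printf("%.2f\n",28/16)'	# 1.75, reduction fully hidden
+#	perl -e 'printf("%.2f\n",(28+5/2)/16)'	# ~1.91, only Tproc~5 exposed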
+ +# January 2010 +# +# Tweaked to optimize transitions between integer and FP operations +# on same XMM register, PCLMULQDQ subroutine was measured to process +# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere. +# The minor regression on Westmere is outweighed by ~15% improvement +# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in +# similar manner resulted in almost 20% degradation on Sandy Bridge, +# where original 64-bit code processes one byte in 1.95 cycles. + +##################################################################### +# For reference, AMD Bulldozer processes one byte in 1.98 cycles in +# 32-bit mode and 1.89 in 64-bit. + +# February 2013 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9. Resulting performance is 1.96 cycles per byte on +# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx"); +$inp = "edi"; +$Htbl = "esi"; + +$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse + # than unrolled, which has to be weighted against + # 2.5x x86-specific code size reduction. + +sub x86_loop { + my $off = shift; + my $rem = "eax"; + + &mov ($Zhh,&DWP(4,$Htbl,$Zll)); + &mov ($Zhl,&DWP(0,$Htbl,$Zll)); + &mov ($Zlh,&DWP(12,$Htbl,$Zll)); + &mov ($Zll,&DWP(8,$Htbl,$Zll)); + &xor ($rem,$rem); # avoid partial register stalls on PIII + + # shrd practically kills P4, 2.5x deterioration, but P4 has + # MMX code-path to execute. shrd runs tad faster [than twice + # the shifts, move's and or's] on pre-MMX Pentium (as well as + # PIII and Core2), *but* minimizes code size, spares register + # and thus allows to fold the loop... 
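+	#
+	# [Illustration with made-up values: &shrd($Zll,$Zlh,4) shifts
+	# $Zll right by 4 and feeds the low 4 bits of $Zlh into its top
+	# bits, e.g. $Zlh:$Zll = 0x00000001:0x23456789 leaves $Zll =
+	# 0x12345678; the three shrd's plus the final shr thus move the
+	# whole 128-bit Z right by 4 bits per iteration.]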
+ if (!$unroll) { + my $cnt = $inp; + &mov ($cnt,15); + &jmp (&label("x86_loop")); + &set_label("x86_loop",16); + for($i=1;$i<=2;$i++) { + &mov (&LB($rem),&LB($Zll)); + &shrd ($Zll,$Zlh,4); + &and (&LB($rem),0xf); + &shrd ($Zlh,$Zhl,4); + &shrd ($Zhl,$Zhh,4); + &shr ($Zhh,4); + &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); + + &mov (&LB($rem),&BP($off,"esp",$cnt)); + if ($i&1) { + &and (&LB($rem),0xf0); + } else { + &shl (&LB($rem),4); + } + + &xor ($Zll,&DWP(8,$Htbl,$rem)); + &xor ($Zlh,&DWP(12,$Htbl,$rem)); + &xor ($Zhl,&DWP(0,$Htbl,$rem)); + &xor ($Zhh,&DWP(4,$Htbl,$rem)); + + if ($i&1) { + &dec ($cnt); + &js (&label("x86_break")); + } else { + &jmp (&label("x86_loop")); + } + } + &set_label("x86_break",16); + } else { + for($i=1;$i<32;$i++) { + &comment($i); + &mov (&LB($rem),&LB($Zll)); + &shrd ($Zll,$Zlh,4); + &and (&LB($rem),0xf); + &shrd ($Zlh,$Zhl,4); + &shrd ($Zhl,$Zhh,4); + &shr ($Zhh,4); + &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); + + if ($i&1) { + &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); + &and (&LB($rem),0xf0); + } else { + &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); + &shl (&LB($rem),4); + } + + &xor ($Zll,&DWP(8,$Htbl,$rem)); + &xor ($Zlh,&DWP(12,$Htbl,$rem)); + &xor ($Zhl,&DWP(0,$Htbl,$rem)); + &xor ($Zhh,&DWP(4,$Htbl,$rem)); + } + } + &bswap ($Zll); + &bswap ($Zlh); + &bswap ($Zhl); + if (!$x86only) { + &bswap ($Zhh); + } else { + &mov ("eax",$Zhh); + &bswap ("eax"); + &mov ($Zhh,"eax"); + } +} + +if ($unroll) { + &function_begin_B("_x86_gmult_4bit_inner"); + &x86_loop(4); + &ret (); + &function_end_B("_x86_gmult_4bit_inner"); +} + +sub deposit_rem_4bit { + my $bias = shift; + + &mov (&DWP($bias+0, "esp"),0x0000<<16); + &mov (&DWP($bias+4, "esp"),0x1C20<<16); + &mov (&DWP($bias+8, "esp"),0x3840<<16); + &mov (&DWP($bias+12,"esp"),0x2460<<16); + &mov (&DWP($bias+16,"esp"),0x7080<<16); + &mov (&DWP($bias+20,"esp"),0x6CA0<<16); + &mov (&DWP($bias+24,"esp"),0x48C0<<16); + &mov (&DWP($bias+28,"esp"),0x54E0<<16); + &mov (&DWP($bias+32,"esp"),0xE100<<16); + &mov (&DWP($bias+36,"esp"),0xFD20<<16); + &mov (&DWP($bias+40,"esp"),0xD940<<16); + &mov (&DWP($bias+44,"esp"),0xC560<<16); + &mov (&DWP($bias+48,"esp"),0x9180<<16); + &mov (&DWP($bias+52,"esp"),0x8DA0<<16); + &mov (&DWP($bias+56,"esp"),0xA9C0<<16); + &mov (&DWP($bias+60,"esp"),0xB5E0<<16); +} + +$suffix = $x86only ? 
"" : "_x86"; + +&function_begin("gcm_gmult_4bit".$suffix); + &stack_push(16+4+1); # +1 for stack alignment + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &mov ($Zhh,&DWP(0,$inp)); # load Xi[16] + &mov ($Zhl,&DWP(4,$inp)); + &mov ($Zlh,&DWP(8,$inp)); + &mov ($Zll,&DWP(12,$inp)); + + &deposit_rem_4bit(16); + + &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(12,"esp"),$Zll); + &shr ($Zll,20); + &and ($Zll,0xf0); + + if ($unroll) { + &call ("_x86_gmult_4bit_inner"); + } else { + &x86_loop(0); + &mov ($inp,&wparam(0)); + } + + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(0,$inp),$Zhh); + &stack_pop(16+4+1); +&function_end("gcm_gmult_4bit".$suffix); + +&function_begin("gcm_ghash_4bit".$suffix); + &stack_push(16+4+1); # +1 for 64-bit alignment + &mov ($Zll,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + &mov ($inp,&wparam(2)); # load in + &mov ("ecx",&wparam(3)); # load len + &add ("ecx",$inp); + &mov (&wparam(3),"ecx"); + + &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] + &mov ($Zhl,&DWP(4,$Zll)); + &mov ($Zlh,&DWP(8,$Zll)); + &mov ($Zll,&DWP(12,$Zll)); + + &deposit_rem_4bit(16); + + &set_label("x86_outer_loop",16); + &xor ($Zll,&DWP(12,$inp)); # xor with input + &xor ($Zlh,&DWP(8,$inp)); + &xor ($Zhl,&DWP(4,$inp)); + &xor ($Zhh,&DWP(0,$inp)); + &mov (&DWP(12,"esp"),$Zll); # dump it on stack + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(0,"esp"),$Zhh); + + &shr ($Zll,20); + &and ($Zll,0xf0); + + if ($unroll) { + &call ("_x86_gmult_4bit_inner"); + } else { + &x86_loop(0); + &mov ($inp,&wparam(2)); + } + &lea ($inp,&DWP(16,$inp)); + &cmp ($inp,&wparam(3)); + &mov (&wparam(2),$inp) if (!$unroll); + &jb (&label("x86_outer_loop")); + + &mov ($inp,&wparam(0)); # load Xi + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(0,$inp),$Zhh); + &stack_pop(16+4+1); +&function_end("gcm_ghash_4bit".$suffix); + +if (!$x86only) {{{ + +&static_label("rem_4bit"); + +if (!$sse2) {{ # pure-MMX "May" version... + +$S=12; # shift factor for rem_4bit + +&function_begin_B("_mmx_gmult_4bit_inner"); +# MMX version performs 3.5 times better on P4 (see comment in non-MMX +# routine for further details), 100% better on Opteron, ~70% better +# on Core2 and PIII... In other words effort is considered to be well +# spent... Since initial release the loop was unrolled in order to +# "liberate" register previously used as loop counter. Instead it's +# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'. +# The path involves move of Z.lo from MMX to integer register, +# effective address calculation and finally merge of value to Z.hi. +# Reference to rem_4bit is scheduled so late that I had to >>4 +# rem_4bit elements. This resulted in 20-45% procent improvement +# on contemporary µ-archs. +{ + my $cnt; + my $rem_4bit = "eax"; + my @rem = ($Zhh,$Zll); + my $nhi = $Zhl; + my $nlo = $Zlh; + + my ($Zlo,$Zhi) = ("mm0","mm1"); + my $tmp = "mm2"; + + &xor ($nlo,$nlo); # avoid partial register stalls on PIII + &mov ($nhi,$Zll); + &mov (&LB($nlo),&LB($nhi)); + &shl (&LB($nlo),4); + &and ($nhi,0xf0); + &movq ($Zlo,&QWP(8,$Htbl,$nlo)); + &movq ($Zhi,&QWP(0,$Htbl,$nlo)); + &movd ($rem[0],$Zlo); + + for ($cnt=28;$cnt>=-2;$cnt--) { + my $odd = $cnt&1; + my $nix = $odd ? 
$nlo : $nhi; + + &shl (&LB($nlo),4) if ($odd); + &psrlq ($Zlo,4); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nix)); + &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0); + &psllq ($tmp,60); + &and ($nhi,0xf0) if ($odd); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28); + &and ($rem[0],0xf); + &pxor ($Zhi,&QWP(0,$Htbl,$nix)); + &mov ($nhi,$nlo) if (!$odd && $cnt>=0); + &movd ($rem[1],$Zlo); + &pxor ($Zlo,$tmp); + + push (@rem,shift(@rem)); # "rotate" registers + } + + &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem] + + &psrlq ($Zlo,32); # lower part of Zlo is already there + &movd ($Zhl,$Zhi); + &psrlq ($Zhi,32); + &movd ($Zlh,$Zlo); + &movd ($Zhh,$Zhi); + &shl ($inp,4); # compensate for rem_4bit[i] being >>4 + + &bswap ($Zll); + &bswap ($Zhl); + &bswap ($Zlh); + &xor ($Zhh,$inp); + &bswap ($Zhh); + + &ret (); +} +&function_end_B("_mmx_gmult_4bit_inner"); + +&function_begin("gcm_gmult_4bit_mmx"); + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &movz ($Zll,&BP(15,$inp)); + + &call ("_mmx_gmult_4bit_inner"); + + &mov ($inp,&wparam(0)); # load Xi + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); +&function_end("gcm_gmult_4bit_mmx"); + +# Streamed version performs 20% better on P4, 7% on Opteron, +# 10% on Core2 and PIII... +&function_begin("gcm_ghash_4bit_mmx"); + &mov ($Zhh,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + &mov ($inp,&wparam(2)); # load in + &mov ($Zlh,&wparam(3)); # load len + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &add ($Zlh,$inp); + &mov (&wparam(3),$Zlh); # len to point at the end of input + &stack_push(4+1); # +1 for stack alignment + + &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] + &mov ($Zhl,&DWP(4,$Zhh)); + &mov ($Zlh,&DWP(8,$Zhh)); + &mov ($Zhh,&DWP(0,$Zhh)); + &jmp (&label("mmx_outer_loop")); + + &set_label("mmx_outer_loop",16); + &xor ($Zll,&DWP(12,$inp)); + &xor ($Zhl,&DWP(4,$inp)); + &xor ($Zlh,&DWP(8,$inp)); + &xor ($Zhh,&DWP(0,$inp)); + &mov (&wparam(2),$inp); + &mov (&DWP(12,"esp"),$Zll); + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(0,"esp"),$Zhh); + + &mov ($inp,"esp"); + &shr ($Zll,24); + + &call ("_mmx_gmult_4bit_inner"); + + &mov ($inp,&wparam(2)); + &lea ($inp,&DWP(16,$inp)); + &cmp ($inp,&wparam(3)); + &jb (&label("mmx_outer_loop")); + + &mov ($inp,&wparam(0)); # load Xi + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); + + &stack_pop(4+1); +&function_end("gcm_ghash_4bit_mmx"); + +}} else {{ # "June" MMX version... + # ... has slower "April" gcm_gmult_4bit_mmx with folded + # loop. This is done to conserve code size... +$S=16; # shift factor for rem_4bit + +sub mmx_loop() { +# MMX version performs 2.8 times better on P4 (see comment in non-MMX +# routine for further details), 40% better on Opteron and Core2, 50% +# better on PIII... In other words effort is considered to be well +# spent... 
+ my $inp = shift; + my $rem_4bit = shift; + my $cnt = $Zhh; + my $nhi = $Zhl; + my $nlo = $Zlh; + my $rem = $Zll; + + my ($Zlo,$Zhi) = ("mm0","mm1"); + my $tmp = "mm2"; + + &xor ($nlo,$nlo); # avoid partial register stalls on PIII + &mov ($nhi,$Zll); + &mov (&LB($nlo),&LB($nhi)); + &mov ($cnt,14); + &shl (&LB($nlo),4); + &and ($nhi,0xf0); + &movq ($Zlo,&QWP(8,$Htbl,$nlo)); + &movq ($Zhi,&QWP(0,$Htbl,$nlo)); + &movd ($rem,$Zlo); + &jmp (&label("mmx_loop")); + + &set_label("mmx_loop",16); + &psrlq ($Zlo,4); + &and ($rem,0xf); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); + &mov (&LB($nlo),&BP(0,$inp,$cnt)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &dec ($cnt); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &mov ($nhi,$nlo); + &pxor ($Zlo,$tmp); + &js (&label("mmx_break")); + + &shl (&LB($nlo),4); + &and ($rem,0xf); + &psrlq ($Zlo,4); + &and ($nhi,0xf0); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); + &pxor ($Zlo,$tmp); + &jmp (&label("mmx_loop")); + + &set_label("mmx_break",16); + &shl (&LB($nlo),4); + &and ($rem,0xf); + &psrlq ($Zlo,4); + &and ($nhi,0xf0); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); + &pxor ($Zlo,$tmp); + + &psrlq ($Zlo,4); + &and ($rem,0xf); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &pxor ($Zlo,$tmp); + + &psrlq ($Zlo,32); # lower part of Zlo is already there + &movd ($Zhl,$Zhi); + &psrlq ($Zhi,32); + &movd ($Zlh,$Zlo); + &movd ($Zhh,$Zhi); + + &bswap ($Zll); + &bswap ($Zhl); + &bswap ($Zlh); + &bswap ($Zhh); +} + +&function_begin("gcm_gmult_4bit_mmx"); + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &movz ($Zll,&BP(15,$inp)); + + &mmx_loop($inp,"eax"); + + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); +&function_end("gcm_gmult_4bit_mmx"); + +###################################################################### +# Below subroutine is "528B" variant of "4-bit" GCM GHASH function +# (see gcm128.c for details). It provides further 20-40% performance +# improvement over above mentioned "May" version. + +&static_label("rem_8bit"); + +&function_begin("gcm_ghash_4bit_mmx"); +{ my ($Zlo,$Zhi) = ("mm7","mm6"); + my $rem_8bit = "esi"; + my $Htbl = "ebx"; + + # parameter block + &mov ("eax",&wparam(0)); # Xi + &mov ("ebx",&wparam(1)); # Htable + &mov ("ecx",&wparam(2)); # inp + &mov ("edx",&wparam(3)); # len + &mov ("ebp","esp"); # original %esp + &call (&label("pic_point")); + &set_label ("pic_point"); + &blindpop ($rem_8bit); + &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit)); + + &sub ("esp",512+16+16); # allocate stack frame... 
+ &and ("esp",-64); # ...and align it + &sub ("esp",16); # place for (u8)(H[]<<4) + + &add ("edx","ecx"); # pointer to the end of input + &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi + &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len + &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp + + { my @lo = ("mm0","mm1","mm2"); + my @hi = ("mm3","mm4","mm5"); + my @tmp = ("mm6","mm7"); + my ($off1,$off2,$i) = (0,0,); + + &add ($Htbl,128); # optimize for size + &lea ("edi",&DWP(16+128,"esp")); + &lea ("ebp",&DWP(16+256+128,"esp")); + + # decompose Htable (low and high parts are kept separately), + # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack... + for ($i=0;$i<18;$i++) { + + &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16); + &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16); + &psllq ($tmp[1],60) if ($i>1); + &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16); + &por ($lo[2],$tmp[1]) if ($i>1); + &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17); + &psrlq ($lo[1],4) if ($i>0 && $i<17); + &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17); + &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17); + &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1); + &psrlq ($hi[1],4) if ($i>0 && $i<17); + &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1); + &shl ("edx",4) if ($i<16); + &mov (&BP($i,"esp"),&LB("edx")) if ($i<16); + + unshift (@lo,pop(@lo)); # "rotate" registers + unshift (@hi,pop(@hi)); + unshift (@tmp,pop(@tmp)); + $off1 += 8 if ($i>0); + $off2 += 8 if ($i>1); + } + } + + &movq ($Zhi,&QWP(0,"eax")); + &mov ("ebx",&DWP(8,"eax")); + &mov ("edx",&DWP(12,"eax")); # load Xi + +&set_label("outer",16); + { my $nlo = "eax"; + my $dat = "edx"; + my @nhi = ("edi","ebp"); + my @rem = ("ebx","ecx"); + my @red = ("mm0","mm1","mm2"); + my $tmp = "mm3"; + + &xor ($dat,&DWP(12,"ecx")); # merge input data + &xor ("ebx",&DWP(8,"ecx")); + &pxor ($Zhi,&QWP(0,"ecx")); + &lea ("ecx",&DWP(16,"ecx")); # inp+=16 + #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi + &mov (&DWP(528+8,"esp"),"ebx"); + &movq (&QWP(528+0,"esp"),$Zhi); + &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp + + &xor ($nlo,$nlo); + &rol ($dat,8); + &mov (&LB($nlo),&LB($dat)); + &mov ($nhi[1],$nlo); + &and (&LB($nlo),0x0f); + &shr ($nhi[1],4); + &pxor ($red[0],$red[0]); + &rol ($dat,8); # next byte + &pxor ($red[1],$red[1]); + &pxor ($red[2],$red[2]); + + # Just like in "May" version modulo-schedule for critical path in + # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor' + # is scheduled so late that rem_8bit[] has to be shifted *right* + # by 16, which is why last argument to pinsrw is 2, which + # corresponds to <<32=<<48>>16... 
+ for ($j=11,$i=0;$i<15;$i++) { + + if ($i>0) { + &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] + &rol ($dat,8); # next byte + &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); + &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) + } else { + &movq ($Zlo,&QWP(16,"esp",$nlo,8)); + &movq ($Zhi,&QWP(16+128,"esp",$nlo,8)); + } + + &mov (&LB($nlo),&LB($dat)); + &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0); + + &movd ($rem[0],$Zlo); + &movz ($rem[1],&LB($rem[1])) if ($i>0); + &psrlq ($Zlo,8); # Z>>=8 + + &movq ($tmp,$Zhi); + &mov ($nhi[0],$nlo); + &psrlq ($Zhi,8); + + &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4 + &and (&LB($nlo),0x0f); + &psllq ($tmp,56); + + &pxor ($Zhi,$red[1]) if ($i>1); + &shr ($nhi[0],4); + &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0); + + unshift (@red,pop(@red)); # "rotate" registers + unshift (@rem,pop(@rem)); + unshift (@nhi,pop(@nhi)); + } + + &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] + &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); + &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); + &movz ($rem[1],&LB($rem[1])); + + &pxor ($red[2],$red[2]); # clear 2nd word + &psllq ($red[1],4); + + &movd ($rem[0],$Zlo); + &psrlq ($Zlo,4); # Z>>=4 + + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &shl ($rem[0],4); # rem<<4 + + &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi] + &psllq ($tmp,60); + &movz ($rem[0],&LB($rem[0])); + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8)); + + &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2); + &pxor ($Zhi,$red[1]); + + &movd ($dat,$Zlo); + &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48 + + &psllq ($red[0],12); # correct by <<16>>4 + &pxor ($Zhi,$red[0]); + &psrlq ($Zlo,32); + &pxor ($Zhi,$red[2]); + + &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp + &movd ("ebx",$Zlo); + &movq ($tmp,$Zhi); # 01234567 + &psllw ($Zhi,8); # 1.3.5.7. + &psrlw ($tmp,8); # .0.2.4.6 + &por ($Zhi,$tmp); # 10325476 + &bswap ($dat); + &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 + &bswap ("ebx"); + + &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? + &jne (&label("outer")); + } + + &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi + &mov (&DWP(12,"eax"),"edx"); + &mov (&DWP(8,"eax"),"ebx"); + &movq (&QWP(0,"eax"),$Zhi); + + &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp + &emms (); +} +&function_end("gcm_ghash_4bit_mmx"); +}} + +if ($sse2) {{ +###################################################################### +# PCLMULQDQ version. + +$Xip="eax"; +$Htbl="edx"; +$const="ecx"; +$inp="esi"; +$len="ebx"; + +($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; +($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); +($Xn,$Xhn)=("xmm6","xmm7"); + +&static_label("bswap"); + +sub clmul64x64_T2 { # minimal "register" pressure +my ($Xhi,$Xi,$Hkey,$HK)=@_; + + &movdqa ($Xhi,$Xi); # + &pshufd ($T1,$Xi,0b01001110); + &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK)); + &pxor ($T1,$Xi); # + &pxor ($T2,$Hkey) if (!defined($HK)); + $HK=$T2 if (!defined($HK)); + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T1,$HK,0x00); ####### + &xorps ($T1,$Xi); # + &xorps ($T1,$Xhi); # + + &movdqa ($T2,$T1); # + &psrldq ($T1,8); + &pslldq ($T2,8); # + &pxor ($Xhi,$T1); + &pxor ($Xi,$T2); # +} + +sub clmul64x64_T3 { +# Even though this subroutine offers visually better ILP, it +# was empirically found to be a tad slower than above version. 
+# At least in gcm_ghash_clmul context. But it's just as well, +# because loop modulo-scheduling is possible only thanks to +# minimized "register" pressure... +my ($Xhi,$Xi,$Hkey)=@_; + + &movdqa ($T1,$Xi); # + &movdqa ($Xhi,$Xi); + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pshufd ($T2,$T1,0b01001110); # + &pshufd ($T3,$Hkey,0b01001110); + &pxor ($T2,$T1); # + &pxor ($T3,$Hkey); + &pclmulqdq ($T2,$T3,0x00); ####### + &pxor ($T2,$Xi); # + &pxor ($T2,$Xhi); # + + &movdqa ($T3,$T2); # + &psrldq ($T2,8); + &pslldq ($T3,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T3); # +} + +if (1) { # Algorithm 9 with <<1 twist. + # Reduction is shorter and uses only two + # temporary registers, which makes it better + # candidate for interleaving with 64x64 + # multiplication. Pre-modulo-scheduled loop + # was found to be ~20% faster than Algorithm 5 + # below. Algorithm 9 was therefore chosen for + # further optimization... + +sub reduction_alg9 { # 17/11 times faster than Intel version +my ($Xhi,$Xi) = @_; + + # 1st phase + &movdqa ($T2,$Xi); # + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &psllq ($Xi,57); # + &movdqa ($T1,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # + + # 2nd phase + &movdqa ($T2,$Xi); + &psrlq ($Xi,1); + &pxor ($Xhi,$T2); # + &pxor ($T2,$Xi); + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$Xhi) # +} + +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap + + # <<1 twist + &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword + &movdqa ($T1,$Hkey); + &psllq ($Hkey,1); + &pxor ($T3,$T3); # + &psrlq ($T1,63); + &pcmpgtd ($T3,$T2); # broadcast carry bit + &pslldq ($T1,8); + &por ($Hkey,$T1); # H<<=1 + + # magic reduction + &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial + &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); + + &pshufd ($T1,$Hkey,0b01001110); + &pshufd ($T2,$Xi,0b01001110); + &pxor ($T1,$Hkey); # Karatsuba pre-processing + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &pxor ($T2,$Xi); # Karatsuba pre-processing + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + &palignr ($T2,$T1,8); # low part is H.lo^H.hi + &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt" + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin_B("gcm_gmult_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movups ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + &movups ($T2,&QWP(32,$Htbl)); + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); + &reduction_alg9 ($Xhi,$Xi); + + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); + + &ret (); +&function_end_B("gcm_gmult_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa 
($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &movdqu ($T3,&QWP(32,$Htbl)); + &pxor ($Xi,$T1); # Ii+Xi + + &pshufd ($T1,$Xn,0b01001110); # H*Ii+1 + &movdqa ($Xhn,$Xn); + &pxor ($T1,$Xn); # + &lea ($inp,&DWP(32,$inp)); # i+=2 + + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &pclmulqdq ($T1,$T3,0x00); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &nop (); + + &sub ($len,0x20); + &jbe (&label("even_tail")); + &jmp (&label("mod_loop")); + +&set_label("mod_loop",32); + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # + &nop (); + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T2,$T3,0x10); ####### + &movups ($Hkey,&QWP(0,$Htbl)); # load H + + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &movdqa ($T3,&QWP(0,$const)); + &xorps ($Xhi,$Xhn); + &movdqu ($Xhn,&QWP(0,$inp)); # Ii + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pxor ($T1,$Xhi); # + + &pshufb ($Xhn,$T3); + &pxor ($T2,$T1); # + + &movdqa ($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # + &pshufb ($Xn,$T3); + &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early + + &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 + &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &movups ($T3,&QWP(32,$Htbl)); + &psllq ($Xi,57); # + &movdqa ($T1,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # + &pshufd ($T1,$Xhn,0b01001110); + &movdqa ($T2,$Xi); # 2nd phase + &psrlq ($Xi,1); + &pxor ($T1,$Xhn); + &pxor ($Xhi,$T2); # + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &pxor ($T2,$Xi); + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$Xhi) # + &pclmulqdq ($T1,$T3,0x00); ####### + + &lea ($inp,&DWP(32,$inp)); + &sub ($len,0x20); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T2,$T3,0x10); ####### + &movdqa ($T3,&QWP(0,$const)); + + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &xorps ($Xhi,$Xhn); + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &pxor ($T1,$Xhi); # + + &pxor ($T2,$T1); # + + &movdqa ($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # + + &reduction_alg9 ($Xhi,$Xi); + + &test ($len,$len); + &jnz (&label("done")); + + &movups ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); + +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} else { # Algorithm 5. Kept for reference purposes. 
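+#
+# (As far as one can tell from the two code paths, the practical
+# difference is that this variant does not pre-twist H by <<1 at
+# key-setup time; the <<1 shift is folded into reduction_alg5 below
+# instead, at the cost of one more temporary register and a longer
+# reduction than reduction_alg9.)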
+ +sub reduction_alg5 { # 19/16 times faster than Intel version +my ($Xhi,$Xi)=@_; + + # <<1 + &movdqa ($T1,$Xi); # + &movdqa ($T2,$Xhi); + &pslld ($Xi,1); + &pslld ($Xhi,1); # + &psrld ($T1,31); + &psrld ($T2,31); # + &movdqa ($T3,$T1); + &pslldq ($T1,4); + &psrldq ($T3,12); # + &pslldq ($T2,4); + &por ($Xhi,$T3); # + &por ($Xi,$T1); + &por ($Xhi,$T2); # + + # 1st phase + &movdqa ($T1,$Xi); + &movdqa ($T2,$Xi); + &movdqa ($T3,$Xi); # + &pslld ($T1,31); + &pslld ($T2,30); + &pslld ($Xi,25); # + &pxor ($T1,$T2); + &pxor ($T1,$Xi); # + &movdqa ($T2,$T1); # + &pslldq ($T1,12); + &psrldq ($T2,4); # + &pxor ($T3,$T1); + + # 2nd phase + &pxor ($Xhi,$T3); # + &movdqa ($Xi,$T3); + &movdqa ($T1,$T3); + &psrld ($Xi,1); # + &psrld ($T1,2); + &psrld ($T3,7); # + &pxor ($Xi,$T1); + &pxor ($Xhi,$T2); + &pxor ($Xi,$T3); # + &pxor ($Xi,$Xhi); # +} + +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap + + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); + &reduction_alg5 ($Xhi,$Xi); + + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin_B("gcm_gmult_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($Xn,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$Xn); + + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); + &reduction_alg5 ($Xhi,$Xi); + + &pshufb ($Xi,$Xn); + &movdqu (&QWP(0,$Xip),$Xi); + + &ret (); +&function_end_B("gcm_gmult_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &sub ($len,0x20); + &lea ($inp,&DWP(32,$inp)); # i+=2 + &jbe (&label("even_tail")); + +&set_label("mod_loop"); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg5 ($Xhi,$Xi); + + ####### + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &sub ($len,0x20); + &lea ($inp,&DWP(32,$inp)); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg5 ($Xhi,$Xi); + + 
&movdqa ($T3,&QWP(0,$const)); + &test ($len,$len); + &jnz (&label("done")); + + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg5 ($Xhi,$Xi); + + &movdqa ($T3,&QWP(0,$const)); +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} + +&set_label("bswap",64); + &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial +&set_label("rem_8bit",64); + &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); + &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); + &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E); + &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E); + &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E); + &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E); + &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E); + &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E); + &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE); + &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE); + &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE); + &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE); + &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E); + &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E); + &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE); + &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE); + &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E); + &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E); + &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E); + &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E); + &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E); + &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E); + &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E); + &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E); + &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE); + &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE); + &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE); + &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE); + &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E); + &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); + &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); + &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); +}} # $sse2 + +&set_label("rem_4bit",64); + &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); + &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); + &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); + &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); +}}} # !$x86only + +&asciz("GHASH for x86, CRYPTOGAMS by "); +&asm_finish(); + +close STDOUT; + +# A question was risen about choice of vanilla MMX. Or rather why wasn't +# SSE2 chosen instead? 
In addition to the fact that MMX runs on legacy +# CPUs such as PIII, "4-bit" MMX version was observed to provide better +# performance than *corresponding* SSE2 one even on contemporary CPUs. +# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2 +# implementation featuring full range of lookup-table sizes, but with +# per-invocation lookup table setup. Latter means that table size is +# chosen depending on how much data is to be hashed in every given call, +# more data - larger table. Best reported result for Core2 is ~4 cycles +# per processed byte out of 64KB block. This number accounts even for +# 64KB table setup overhead. As discussed in gcm128.c we choose to be +# more conservative in respect to lookup table sizes, but how do the +# results compare? Minimalistic "256B" MMX version delivers ~11 cycles +# on same platform. As also discussed in gcm128.c, next in line "8-bit +# Shoup's" or "4KB" method should deliver twice the performance of +# "256B" one, in other words not worse than ~6 cycles per byte. It +# should be also be noted that in SSE2 case improvement can be "super- +# linear," i.e. more than twice, mostly because >>8 maps to single +# instruction on SSE2 register. This is unlike "4-bit" case when >>4 +# maps to same amount of instructions in both MMX and SSE2 cases. +# Bottom line is that switch to SSE2 is considered to be justifiable +# only in case we choose to implement "8-bit" method... diff --git a/openssl-1.1.0h/crypto/modes/asm/ghash-x86_64.pl b/openssl-1.1.0h/crypto/modes/asm/ghash-x86_64.pl new file mode 100644 index 0000000..387e3f8 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/ghash-x86_64.pl @@ -0,0 +1,1762 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that +# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH +# function features so called "528B" variant utilizing additional +# 256+16 bytes of per-key storage [+512 bytes shared table]. +# Performance results are for this streamed GHASH subroutine and are +# expressed in cycles per processed byte, less is better: +# +# gcc 3.4.x(*) assembler +# +# P4 28.6 14.0 +100% +# Opteron 19.3 7.7 +150% +# Core2 17.8 8.1(**) +120% +# Atom 31.6 16.8 +88% +# VIA Nano 21.8 10.1 +115% +# +# (*) comparison is not completely fair, because C results are +# for vanilla "256B" implementation, while assembler results +# are for "528B";-) +# (**) it's mystery [to me] why Core2 result is not same as for +# Opteron; + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. +# See ghash-x86.pl for background information and details about coding +# techniques. 
+# +# Special thanks to David Woodhouse for +# providing access to a Westmere-based system on behalf of Intel +# Open Source Technology Centre. + +# December 2012 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9, increase reduction aggregate factor to 4x. As for +# the latter. ghash-x86.pl discusses that it makes lesser sense to +# increase aggregate factor. Then why increase here? Critical path +# consists of 3 independent pclmulqdq instructions, Karatsuba post- +# processing and reduction. "On top" of this we lay down aggregated +# multiplication operations, triplets of independent pclmulqdq's. As +# issue rate for pclmulqdq is limited, it makes lesser sense to +# aggregate more multiplications than it takes to perform remaining +# non-multiplication operations. 2x is near-optimal coefficient for +# contemporary Intel CPUs (therefore modest improvement coefficient), +# but not for Bulldozer. Latter is because logical SIMD operations +# are twice as slow in comparison to Intel, so that critical path is +# longer. A CPU with higher pclmulqdq issue rate would also benefit +# from higher aggregate factor... +# +# Westmere 1.78(+13%) +# Sandy Bridge 1.80(+8%) +# Ivy Bridge 1.80(+7%) +# Haswell 0.55(+93%) (if system doesn't support AVX) +# Broadwell 0.45(+110%)(if system doesn't support AVX) +# Skylake 0.44(+110%)(if system doesn't support AVX) +# Bulldozer 1.49(+27%) +# Silvermont 2.88(+13%) +# Goldmont 1.08(+24%) + +# March 2013 +# +# ... 8x aggregate factor AVX code path is using reduction algorithm +# suggested by Shay Gueron[1]. Even though contemporary AVX-capable +# CPUs such as Sandy and Ivy Bridge can execute it, the code performs +# sub-optimally in comparison to above mentioned version. But thanks +# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that +# it performs in 0.41 cycles per byte on Haswell processor, in +# 0.29 on Broadwell, and in 0.36 on Skylake. 
+# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.20) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$do4xaggr=1; + +# common register layout +$nlo="%rax"; +$nhi="%rbx"; +$Zlo="%r8"; +$Zhi="%r9"; +$tmp="%r10"; +$rem_4bit = "%r11"; + +$Xi="%rdi"; +$Htbl="%rsi"; + +# per-function register layout +$cnt="%rcx"; +$rem="%rdx"; + +sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or + $r =~ s/%[er]([sd]i)/%\1l/ or + $r =~ s/%[er](bp)/%\1l/ or + $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +{ my $N; + sub loop() { + my $inp = shift; + + $N++; +$code.=<<___; + xor $nlo,$nlo + xor $nhi,$nhi + mov `&LB("$Zlo")`,`&LB("$nlo")` + mov `&LB("$Zlo")`,`&LB("$nhi")` + shl \$4,`&LB("$nlo")` + mov \$14,$cnt + mov 8($Htbl,$nlo),$Zlo + mov ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + mov $Zlo,$rem + jmp .Loop$N + +.align 16 +.Loop$N: + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + mov ($inp,$cnt),`&LB("$nlo")` + shr \$4,$Zhi + xor 8($Htbl,$nhi),$Zlo + shl \$60,$tmp + xor ($Htbl,$nhi),$Zhi + mov `&LB("$nlo")`,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + shl \$4,`&LB("$nlo")` + xor $tmp,$Zlo + dec $cnt + js .Lbreak$N + + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nlo),$Zlo + shl \$60,$tmp + xor ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + xor $tmp,$Zlo + jmp .Loop$N + +.align 16 +.Lbreak$N: + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nlo),$Zlo + shl \$60,$tmp + xor ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + xor $tmp,$Zlo + + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nhi),$Zlo + shl \$60,$tmp + xor ($Htbl,$nhi),$Zhi + xor $tmp,$Zlo + xor ($rem_4bit,$rem,8),$Zhi + + bswap $Zlo + bswap $Zhi +___ +}} + +$code=<<___; +.text +.extern OPENSSL_ia32cap_P + +.globl gcm_gmult_4bit +.type gcm_gmult_4bit,\@function,2 +.align 16 +gcm_gmult_4bit: + push %rbx + push %rbp # %rbp and %r12 are pushed exclusively in + push %r12 # order to reuse Win64 exception handler... 
+.Lgmult_prologue: + + movzb 15($Xi),$Zlo + lea .Lrem_4bit(%rip),$rem_4bit +___ + &loop ($Xi); +$code.=<<___; + mov $Zlo,8($Xi) + mov $Zhi,($Xi) + + mov 16(%rsp),%rbx + lea 24(%rsp),%rsp +.Lgmult_epilogue: + ret +.size gcm_gmult_4bit,.-gcm_gmult_4bit +___ + +# per-function register layout +$inp="%rdx"; +$len="%rcx"; +$rem_8bit=$rem_4bit; + +$code.=<<___; +.globl gcm_ghash_4bit +.type gcm_ghash_4bit,\@function,4 +.align 16 +gcm_ghash_4bit: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + sub \$280,%rsp +.Lghash_prologue: + mov $inp,%r14 # reassign couple of args + mov $len,%r15 +___ +{ my $inp="%r14"; + my $dat="%edx"; + my $len="%r15"; + my @nhi=("%ebx","%ecx"); + my @rem=("%r12","%r13"); + my $Hshr4="%rbp"; + + &sub ($Htbl,-128); # size optimization + &lea ($Hshr4,"16+128(%rsp)"); + { my @lo =($nlo,$nhi); + my @hi =($Zlo,$Zhi); + + &xor ($dat,$dat); + for ($i=0,$j=-2;$i<18;$i++,$j++) { + &mov ("$j(%rsp)",&LB($dat)) if ($i>1); + &or ($lo[0],$tmp) if ($i>1); + &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); + &shr ($lo[1],4) if ($i>0 && $i<17); + &mov ($tmp,$hi[1]) if ($i>0 && $i<17); + &shr ($hi[1],4) if ($i>0 && $i<17); + &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); + &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); + &shl (&LB($dat),4) if ($i>0 && $i<17); + &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); + &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); + &shl ($tmp,60) if ($i>0 && $i<17); + + push (@lo,shift(@lo)); + push (@hi,shift(@hi)); + } + } + &add ($Htbl,-128); + &mov ($Zlo,"8($Xi)"); + &mov ($Zhi,"0($Xi)"); + &add ($len,$inp); # pointer to the end of data + &lea ($rem_8bit,".Lrem_8bit(%rip)"); + &jmp (".Louter_loop"); + +$code.=".align 16\n.Louter_loop:\n"; + &xor ($Zhi,"($inp)"); + &mov ("%rdx","8($inp)"); + &lea ($inp,"16($inp)"); + &xor ("%rdx",$Zlo); + &mov ("($Xi)",$Zhi); + &mov ("8($Xi)","%rdx"); + &shr ("%rdx",32); + + &xor ($nlo,$nlo); + &rol ($dat,8); + &mov (&LB($nlo),&LB($dat)); + &movz ($nhi[0],&LB($dat)); + &shl (&LB($nlo),4); + &shr ($nhi[0],4); + + for ($j=11,$i=0;$i<15;$i++) { + &rol ($dat,8); + &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); + &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); + &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); + &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); + + &mov (&LB($nlo),&LB($dat)); + &xor ($Zlo,$tmp) if ($i>0); + &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); + + &movz ($nhi[1],&LB($dat)); + &shl (&LB($nlo),4); + &movzb ($rem[0],"(%rsp,$nhi[0])"); + + &shr ($nhi[1],4) if ($i<14); + &and ($nhi[1],0xf0) if ($i==14); + &shl ($rem[1],48) if ($i>0); + &xor ($rem[0],$Zlo); + + &mov ($tmp,$Zhi); + &xor ($Zhi,$rem[1]) if ($i>0); + &shr ($Zlo,8); + + &movz ($rem[0],&LB($rem[0])); + &mov ($dat,"$j($Xi)") if (--$j%4==0); + &shr ($Zhi,8); + + &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); + &shl ($tmp,56); + &xor ($Zhi,"($Hshr4,$nhi[0],8)"); + + unshift (@nhi,pop(@nhi)); # "rotate" registers + unshift (@rem,pop(@rem)); + } + &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); + &xor ($Zlo,"8($Htbl,$nlo)"); + &xor ($Zhi,"($Htbl,$nlo)"); + + &shl ($rem[1],48); + &xor ($Zlo,$tmp); + + &xor ($Zhi,$rem[1]); + &movz ($rem[0],&LB($Zlo)); + &shr ($Zlo,4); + + &mov ($tmp,$Zhi); + &shl (&LB($rem[0]),4); + &shr ($Zhi,4); + + &xor ($Zlo,"8($Htbl,$nhi[0])"); + &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); + &shl ($tmp,60); + + &xor ($Zhi,"($Htbl,$nhi[0])"); + &xor ($Zlo,$tmp); + &shl ($rem[0],48); + + &bswap ($Zlo); + &xor ($Zhi,$rem[0]); + + &bswap ($Zhi); + &cmp ($inp,$len); + &jb (".Louter_loop"); +} +$code.=<<___; + mov $Zlo,8($Xi) + mov $Zhi,($Xi) + + lea 280(%rsp),%rsi + mov 
0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lghash_epilogue: + ret +.size gcm_ghash_4bit,.-gcm_ghash_4bit +___ + +###################################################################### +# PCLMULQDQ version. + +@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order + +($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; +($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); + +sub clmul64x64_T2 { # minimal register pressure +my ($Xhi,$Xi,$Hkey,$HK)=@_; + +if (!defined($HK)) { $HK = $T2; +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey,$T2 + pxor $Xi,$T1 # + pxor $Hkey,$T2 +___ +} else { +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 # +___ +} +$code.=<<___; + pclmulqdq \$0x00,$Hkey,$Xi ####### + pclmulqdq \$0x11,$Hkey,$Xhi ####### + pclmulqdq \$0x00,$HK,$T1 ####### + pxor $Xi,$T1 # + pxor $Xhi,$T1 # + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ +} + +sub reduction_alg9 { # 17/11 times faster than Intel version +my ($Xhi,$Xi) = @_; + +$code.=<<___; + # 1st phase + movdqa $Xi,$T2 # + movdqa $Xi,$T1 + psllq \$5,$Xi + pxor $Xi,$T1 # + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$57,$Xi # + movdqa $Xi,$T1 # + pslldq \$8,$Xi + psrldq \$8,$T1 # + pxor $T2,$Xi + pxor $T1,$Xhi # + + # 2nd phase + movdqa $Xi,$T2 + psrlq \$1,$Xi + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + pxor $T2,$Xi # + psrlq \$1,$Xi # + pxor $Xhi,$Xi # +___ +} + +{ my ($Htbl,$Xip)=@_4args; + my $HK="%xmm6"; + +$code.=<<___; +.globl gcm_init_clmul +.type gcm_init_clmul,\@abi-omnipotent +.align 16 +gcm_init_clmul: +.L_init_clmul: +___ +$code.=<<___ if ($win64); +.LSEH_begin_gcm_init_clmul: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) +___ +$code.=<<___; + movdqu ($Xip),$Hkey + pshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + movdqa $Hkey,$T1 + psllq \$1,$Hkey + pxor $T3,$T3 # + psrlq \$63,$T1 + pcmpgtd $T2,$T3 # broadcast carry bit + pslldq \$8,$T1 + por $T1,$Hkey # H<<=1 + + # magic reduction + pand .L0x1c2_polynomial(%rip),$T3 + pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + pshufd \$0b01001110,$Hkey,$HK + movdqa $Hkey,$Xi + pxor $Hkey,$HK +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufd \$0b01001110,$Hkey,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $Hkey,$T1 # Karatsuba pre-processing + movdqu $Hkey,0x00($Htbl) # save H + pxor $Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x10($Htbl) # save H^2 + palignr \$8,$T1,$T2 # low part is H.lo^H.hi... + movdqu $T2,0x20($Htbl) # save Karatsuba "salt" +___ +if ($do4xaggr) { + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + movdqa $Xi,$T3 +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufd \$0b01001110,$T3,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $T3,$T1 # Karatsuba pre-processing + movdqu $T3,0x30($Htbl) # save H^3 + pxor $Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x40($Htbl) # save H^4 + palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... 
+ movdqu $T2,0x50($Htbl) # save Karatsuba "salt" +___ +} +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + lea 0x18(%rsp),%rsp +.LSEH_end_gcm_init_clmul: +___ +$code.=<<___; + ret +.size gcm_init_clmul,.-gcm_init_clmul +___ +} + +{ my ($Xip,$Htbl)=@_4args; + +$code.=<<___; +.globl gcm_gmult_clmul +.type gcm_gmult_clmul,\@abi-omnipotent +.align 16 +gcm_gmult_clmul: +.L_gmult_clmul: + movdqu ($Xip),$Xi + movdqa .Lbswap_mask(%rip),$T3 + movdqu ($Htbl),$Hkey + movdqu 0x20($Htbl),$T2 + pshufb $T3,$Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); +$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); + # experimental alternative. special thing about is that there + # no dependency between the two multiplications... + mov \$`0xE1<<1`,%eax + mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff + mov \$0x07,%r11d + movq %rax,$T1 + movq %r10,$T2 + movq %r11,$T3 # borrow $T3 + pand $Xi,$T3 + pshufb $T3,$T2 # ($Xi&7)·0xE0 + movq %rax,$T3 + pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1) + pxor $Xi,$T2 + pslldq \$15,$T2 + paddd $T2,$T2 # <<(64+56+1) + pxor $T2,$Xi + pclmulqdq \$0x01,$T3,$Xi + movdqa .Lbswap_mask(%rip),$T3 # reload $T3 + psrldq \$1,$T1 + pxor $T1,$Xhi + pslldq \$7,$Xi + pxor $Xhi,$Xi +___ +$code.=<<___; + pshufb $T3,$Xi + movdqu $Xi,($Xip) + ret +.size gcm_gmult_clmul,.-gcm_gmult_clmul +___ +} + +{ my ($Xip,$Htbl,$inp,$len)=@_4args; + my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); + my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); + +$code.=<<___; +.globl gcm_ghash_clmul +.type gcm_ghash_clmul,\@abi-omnipotent +.align 32 +gcm_ghash_clmul: +.L_ghash_clmul: +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp),%rax +.LSEH_begin_gcm_ghash_clmul: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) + .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) +___ +$code.=<<___; + movdqa .Lbswap_mask(%rip),$T3 + + movdqu ($Xip),$Xi + movdqu ($Htbl),$Hkey + movdqu 0x20($Htbl),$HK + pshufb $T3,$Xi + + sub \$0x10,$len + jz .Lodd_tail + + movdqu 0x10($Htbl),$Hkey2 +___ +if ($do4xaggr) { +my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); + +$code.=<<___; + mov OPENSSL_ia32cap_P+4(%rip),%eax + cmp \$0x30,$len + jb .Lskip4x + + and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE + cmp \$`1<<22`,%eax # check for MOVBE without XSAVE + je .Lskip4x + + sub \$0x30,$len + mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff + movdqu 0x30($Htbl),$Hkey3 + movdqu 0x40($Htbl),$Hkey4 + + ####### + # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P + # + movdqu 0x30($inp),$Xln + movdqu 0x20($inp),$Xl + pshufb $T3,$Xln + pshufb $T3,$Xl + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn + + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey2,$Xl + pclmulqdq \$0x11,$Hkey2,$Xh + pclmulqdq \$0x10,$HK,$Xm + xorps $Xl,$Xln + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + xorps $Xm,$Xmn + + movdqu 0x10($inp),$Xl + movdqu 0($inp),$T1 + 
pshufb $T3,$Xl + pshufb $T3,$T1 + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $T1,$Xi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + movdqa $Xi,$Xhi + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + pclmulqdq \$0x11,$Hkey3,$Xh + pclmulqdq \$0x00,$HK,$Xm + xorps $Xl,$Xln + xorps $Xh,$Xhn + + lea 0x40($inp),$inp + sub \$0x40,$len + jc .Ltail4x + + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: + pclmulqdq \$0x00,$Hkey4,$Xi + xorps $Xm,$Xmn + movdqu 0x30($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x11,$Hkey4,$Xhi + xorps $Xln,$Xi + movdqu 0x20($inp),$Xln + movdqa $Xl,$Xh + pclmulqdq \$0x10,$HK,$T1 + pshufd \$0b01001110,$Xl,$Xm + xorps $Xhn,$Xhi + pxor $Xl,$Xm + pshufb $T3,$Xln + movups 0x20($Htbl),$HK + xorps $Xmn,$T1 + pclmulqdq \$0x00,$Hkey,$Xl + pshufd \$0b01001110,$Xln,$Xmn + + pxor $Xi,$T1 # aggregated Karatsuba post-processing + movdqa $Xln,$Xhn + pxor $Xhi,$T1 # + pxor $Xln,$Xmn + movdqa $T1,$T2 # + pclmulqdq \$0x11,$Hkey,$Xh + pslldq \$8,$T1 + psrldq \$8,$T2 # + pxor $T1,$Xi + movdqa .L7_mask(%rip),$T1 + pxor $T2,$Xhi # + movq %rax,$T2 + + pand $Xi,$T1 # 1st phase + pshufb $T1,$T2 # + pxor $Xi,$T2 # + pclmulqdq \$0x00,$HK,$Xm + psllq \$57,$T2 # + movdqa $T2,$T1 # + pslldq \$8,$T2 + pclmulqdq \$0x00,$Hkey2,$Xln + psrldq \$8,$T1 # + pxor $T2,$Xi + pxor $T1,$Xhi # + movdqu 0($inp),$T1 + + movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhn + xorps $Xl,$Xln + movdqu 0x10($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x10,$HK,$Xmn + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + pshufb $T3,$T1 + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + + movdqa $Xl,$Xh + pxor $Xm,$Xmn + pshufd \$0b01001110,$Xl,$Xm + pxor $T2,$Xi # + pxor $T1,$Xhi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + psrlq \$1,$Xi # + pxor $Xhi,$Xi # + movdqa $Xi,$Xhi + pclmulqdq \$0x11,$Hkey3,$Xh + xorps $Xl,$Xln + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + + pclmulqdq \$0x00,$HK,$Xm + xorps $Xh,$Xhn + + lea 0x40($inp),$inp + sub \$0x40,$len + jnc .Lmod4_loop + +.Ltail4x: + pclmulqdq \$0x00,$Hkey4,$Xi + pclmulqdq \$0x11,$Hkey4,$Xhi + pclmulqdq \$0x10,$HK,$T1 + xorps $Xm,$Xmn + xorps $Xln,$Xi + xorps $Xhn,$Xhi + pxor $Xi,$Xhi # aggregated Karatsuba post-processing + pxor $Xmn,$T1 + + pxor $Xhi,$T1 # + pxor $Xi,$Xhi + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ + &reduction_alg9($Xhi,$Xi); +$code.=<<___; + add \$0x40,$len + jz .Ldone + movdqu 0x20($Htbl),$HK + sub \$0x10,$len + jz .Lodd_tail +.Lskip4x: +___ +} +$code.=<<___; + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + movdqu ($inp),$T1 # Ii + movdqu 16($inp),$Xln # Ii+1 + pshufb $T3,$T1 + pshufb $T3,$Xln + pxor $T1,$Xi # Ii+Xi + + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn + + lea 32($inp),$inp # i+=2 + nop + sub \$0x20,$len + jbe .Leven_tail + nop + jmp .Lmod_loop + +.align 32 +.Lmod_loop: + movdqa $Xi,$Xhi + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # + + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + pclmulqdq \$0x10,$HK,$Xmn + + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + movdqu ($inp),$T2 # Ii + pxor $Xi,$T1 # aggregated Karatsuba post-processing + pshufb $T3,$T2 + movdqu 16($inp),$Xln # Ii+1 + + pxor $Xhi,$T1 + pxor $T2,$Xhi # "Ii+Xi", consume early + pxor $T1,$Xmn + pshufb $T3,$Xln + movdqa $Xmn,$T1 # + psrldq \$8,$T1 + pslldq \$8,$Xmn # + pxor $T1,$Xhi + pxor $Xmn,$Xi # + 
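+	# From here on, the two-phase modular reduction of the value just
+	# accumulated in Xi:Xhi is interleaved with the H*I[i+1]
+	# multiplications for the next iteration (the pclmulqdq on
+	# Xln/Xhn/Xmn below), keeping the carry-less multiplier busy while
+	# the shift/xor reduction chain runs.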
+ movdqa $Xln,$Xhn # + + movdqa $Xi,$T2 # 1st phase + movdqa $Xi,$T1 + psllq \$5,$Xi + pxor $Xi,$T1 # + pclmulqdq \$0x00,$Hkey,$Xln ####### + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$57,$Xi # + movdqa $Xi,$T1 # + pslldq \$8,$Xi + psrldq \$8,$T1 # + pxor $T2,$Xi + pshufd \$0b01001110,$Xhn,$Xmn + pxor $T1,$Xhi # + pxor $Xhn,$Xmn # + + movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey,$Xhn ####### + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + pxor $T2,$Xi # + lea 32($inp),$inp + psrlq \$1,$Xi # + pclmulqdq \$0x00,$HK,$Xmn ####### + pxor $Xhi,$Xi # + + sub \$0x20,$len + ja .Lmod_loop + +.Leven_tail: + movdqa $Xi,$Xhi + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # + + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + pclmulqdq \$0x10,$HK,$Xmn + + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + pxor $Xi,$T1 + pxor $Xhi,$T1 + pxor $T1,$Xmn + movdqa $Xmn,$T1 # + psrldq \$8,$T1 + pslldq \$8,$Xmn # + pxor $T1,$Xhi + pxor $Xmn,$Xi # +___ + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + test $len,$len + jnz .Ldone + +.Lodd_tail: + movdqu ($inp),$T1 # Ii + pshufb $T3,$T1 + pxor $T1,$Xi # Ii+Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; +.Ldone: + pshufb $T3,$Xi + movdqu $Xi,($Xip) +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp +.LSEH_end_gcm_ghash_clmul: +___ +$code.=<<___; + ret +.size gcm_ghash_clmul,.-gcm_ghash_clmul +___ +} + +$code.=<<___; +.globl gcm_init_avx +.type gcm_init_avx,\@abi-omnipotent +.align 32 +gcm_init_avx: +___ +if ($avx) { +my ($Htbl,$Xip)=@_4args; +my $HK="%xmm6"; + +$code.=<<___ if ($win64); +.LSEH_begin_gcm_init_avx: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) +___ +$code.=<<___; + vzeroupper + + vmovdqu ($Xip),$Hkey + vpshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + vpsrlq \$63,$Hkey,$T1 + vpsllq \$1,$Hkey,$Hkey + vpxor $T3,$T3,$T3 # + vpcmpgtd $T2,$T3,$T3 # broadcast carry bit + vpslldq \$8,$T1,$T1 + vpor $T1,$Hkey,$Hkey # H<<=1 + + # magic reduction + vpand .L0x1c2_polynomial(%rip),$T3,$T3 + vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial + + vpunpckhqdq $Hkey,$Hkey,$HK + vmovdqa $Hkey,$Xi + vpxor $Hkey,$HK,$HK + mov \$4,%r10 # up to H^8 + jmp .Linit_start_avx +___ + +sub clmul64x64_avx { +my ($Xhi,$Xi,$Hkey,$HK)=@_; + +if (!defined($HK)) { $HK = $T2; +$code.=<<___; + vpunpckhqdq $Xi,$Xi,$T1 + vpunpckhqdq $Hkey,$Hkey,$T2 + vpxor $Xi,$T1,$T1 # + vpxor $Hkey,$T2,$T2 +___ +} else { +$code.=<<___; + vpunpckhqdq $Xi,$Xi,$T1 + vpxor $Xi,$T1,$T1 # +___ +} +$code.=<<___; + vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### + vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### + vpclmulqdq \$0x00,$HK,$T1,$T1 ####### + vpxor $Xi,$Xhi,$T2 # + vpxor $T2,$T1,$T1 # + + vpslldq \$8,$T1,$T2 # + vpsrldq \$8,$T1,$T1 + vpxor $T2,$Xi,$Xi # + vpxor $T1,$Xhi,$Xhi +___ +} + +sub reduction_avx { +my ($Xhi,$Xi) = @_; + +$code.=<<___; + vpsllq \$57,$Xi,$T1 # 1st phase + vpsllq \$62,$Xi,$T2 + vpxor $T1,$T2,$T2 # + vpsllq \$63,$Xi,$T1 + vpxor $T1,$T2,$T2 # + vpslldq \$8,$T2,$T1 # + vpsrldq \$8,$T2,$T2 + vpxor $T1,$Xi,$Xi # + vpxor $T2,$Xhi,$Xhi + + vpsrlq 
\$1,$Xi,$T2 # 2nd phase + vpxor $Xi,$Xhi,$Xhi + vpxor $T2,$Xi,$Xi # + vpsrlq \$5,$T2,$T2 + vpxor $T2,$Xi,$Xi # + vpsrlq \$1,$Xi,$Xi # + vpxor $Xhi,$Xi,$Xi # +___ +} + +$code.=<<___; +.align 32 +.Linit_loop_avx: + vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... + vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" +___ + &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 + &reduction_avx ($Xhi,$Xi); +$code.=<<___; +.Linit_start_avx: + vmovdqa $Xi,$T3 +___ + &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 + &reduction_avx ($Xhi,$Xi); +$code.=<<___; + vpshufd \$0b01001110,$T3,$T1 + vpshufd \$0b01001110,$Xi,$T2 + vpxor $T3,$T1,$T1 # Karatsuba pre-processing + vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 + vpxor $Xi,$T2,$T2 # Karatsuba pre-processing + vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 + lea 0x30($Htbl),$Htbl + sub \$1,%r10 + jnz .Linit_loop_avx + + vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped + vmovdqu $T3,-0x10($Htbl) + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + lea 0x18(%rsp),%rsp +.LSEH_end_gcm_init_avx: +___ +$code.=<<___; + ret +.size gcm_init_avx,.-gcm_init_avx +___ +} else { +$code.=<<___; + jmp .L_init_clmul +.size gcm_init_avx,.-gcm_init_avx +___ +} + +$code.=<<___; +.globl gcm_gmult_avx +.type gcm_gmult_avx,\@abi-omnipotent +.align 32 +gcm_gmult_avx: + jmp .L_gmult_clmul +.size gcm_gmult_avx,.-gcm_gmult_avx +___ + +$code.=<<___; +.globl gcm_ghash_avx +.type gcm_ghash_avx,\@abi-omnipotent +.align 32 +gcm_ghash_avx: +___ +if ($avx) { +my ($Xip,$Htbl,$inp,$len)=@_4args; +my ($Xlo,$Xhi,$Xmi, + $Zlo,$Zhi,$Zmi, + $Hkey,$HK,$T1,$T2, + $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); + +$code.=<<___ if ($win64); + lea -0x88(%rsp),%rax +.LSEH_begin_gcm_ghash_avx: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) + .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) +___ +$code.=<<___; + vzeroupper + + vmovdqu ($Xip),$Xi # load $Xi + lea .L0x1c2_polynomial(%rip),%r10 + lea 0x40($Htbl),$Htbl # size optimization + vmovdqu .Lbswap_mask(%rip),$bswap + vpshufb $bswap,$Xi,$Xi + cmp \$0x80,$len + jb .Lshort_avx + sub \$0x80,$len + + vmovdqu 0x70($inp),$Ii # I[7] + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vpshufb $bswap,$Ii,$Ii + vmovdqu 0x20-0x40($Htbl),$HK + + vpunpckhqdq $Ii,$Ii,$T2 + vmovdqu 0x60($inp),$Ij # I[6] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Ii,$T2,$T2 + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpunpckhqdq $Ij,$Ij,$T1 + vmovdqu 0x50($inp),$Ii # I[5] + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpxor $Ii,$T2,$T2 + vmovdqu 0x40($inp),$Ij # I[4] + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0x50-0x40($Htbl),$HK + + vpshufb $bswap,$Ij,$Ij + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpunpckhqdq 
$Ij,$Ij,$T1 + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vmovdqu 0x30($inp),$Ii # I[3] + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Zhi,$Xhi,$Xhi + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpxor $Zmi,$Xmi,$Xmi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0x80-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu 0x20($inp),$Ij # I[2] + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpxor $Xmi,$Zmi,$Zmi + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vmovdqu 0x10($inp),$Ii # I[1] + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Zhi,$Xhi,$Xhi + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpxor $Zmi,$Xmi,$Xmi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0xb0-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu ($inp),$Ij # I[0] + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x10,$HK,$T2,$Xmi + + lea 0x80($inp),$inp + cmp \$0x80,$len + jb .Ltail_avx + + vpxor $Xi,$Ij,$Ij # accumulate $Xi + sub \$0x80,$len + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq $Ij,$Ij,$T1 + vmovdqu 0x70($inp),$Ii # I[7] + vpxor $Xlo,$Zlo,$Zlo + vpxor $Ij,$T1,$T1 + vpclmulqdq \$0x00,$Hkey,$Ij,$Xi + vpshufb $bswap,$Ii,$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xo + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Tred + vmovdqu 0x20-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu 0x60($inp),$Ij # I[6] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Zlo,$Xi,$Xi # collect result + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vxorps $Zhi,$Xo,$Xo + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vpxor $Zmi,$Tred,$Tred + vxorps $Ij,$T1,$T1 + + vmovdqu 0x50($inp),$Ii # I[5] + vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Xo,$Tred,$Tred + vpslldq \$8,$Tred,$T2 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vpsrldq \$8,$Tred,$Tred + vpxor $T2, $Xi, $Xi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpshufb $bswap,$Ii,$Ii + vxorps $Tred,$Xo, $Xo + vpxor $Xhi,$Zhi,$Zhi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0x50-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu 0x40($inp),$Ij # I[4] + vpalignr \$8,$Xi,$Xi,$Tred # 1st phase + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Zhi,$Xhi,$Xhi + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vxorps $Ij,$T1,$T1 + vpxor $Zmi,$Xmi,$Xmi + + vmovdqu 0x30($inp),$Ii # I[3] + vpclmulqdq \$0x10,(%r10),$Xi,$Xi + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpshufb $bswap,$Ii,$Ii + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x10,$HK, 
$T1,$Zmi + vmovdqu 0x80-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu 0x20($inp),$Ij # I[2] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Zhi,$Xhi,$Xhi + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vpxor $Ij,$T1,$T1 + vpxor $Zmi,$Xmi,$Xmi + vxorps $Tred,$Xi,$Xi + + vmovdqu 0x10($inp),$Ii # I[1] + vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpshufb $bswap,$Ii,$Ii + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpclmulqdq \$0x10,(%r10),$Xi,$Xi + vxorps $Xo,$Tred,$Tred + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0xb0-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu ($inp),$Ij # I[0] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 + vpxor $Tred,$Ij,$Ij + vpclmulqdq \$0x10,$HK, $T2,$Xmi + vpxor $Xi,$Ij,$Ij # accumulate $Xi + + lea 0x80($inp),$inp + sub \$0x80,$len + jnc .Loop8x_avx + + add \$0x80,$len + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -0x10($inp,$len),$Ii # very last word + lea ($inp,$len),$inp + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vmovdqu 0x20-0x40($Htbl),$HK + vpshufb $bswap,$Ii,$Ij + + vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, + vmovdqa $Xhi,$Zhi # $Zhi and + vmovdqa $Xmi,$Zmi # $Zmi + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x20($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x30($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovdqu 0x50-0x40($Htbl),$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x40($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x50($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovdqu 0x80-0x40($Htbl),$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x60($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor 
$Ij,$T1,$T1 + vmovdqu -0x70($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovq 0xb8-0x40($Htbl),$HK + sub \$0x10,$len + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor $Xi,$Ij,$Ij # accumulate $Xi +.Ltail_no_xor_avx: + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + + vmovdqu (%r10),$Tred + + vpxor $Xlo,$Zlo,$Xi + vpxor $Xhi,$Zhi,$Xo + vpxor $Xmi,$Zmi,$Zmi + + vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing + vpxor $Xo, $Zmi,$Zmi + vpslldq \$8, $Zmi,$T2 + vpsrldq \$8, $Zmi,$Zmi + vpxor $T2, $Xi, $Xi + vpxor $Zmi,$Xo, $Xo + + vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase + vpalignr \$8,$Xi,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase + vpalignr \$8,$Xi,$Xi,$Xi + vpxor $Xo,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + cmp \$0,$len + jne .Lshort_avx + + vpshufb $bswap,$Xi,$Xi + vmovdqu $Xi,($Xip) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp +.LSEH_end_gcm_ghash_avx: +___ +$code.=<<___; + ret +.size gcm_ghash_avx,.-gcm_ghash_avx +___ +} else { +$code.=<<___; + jmp .L_ghash_clmul +.size gcm_ghash_avx,.-gcm_ghash_avx +___ +} + +$code.=<<___; +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: + .long 7,0,7,0 +.L7_mask_poly: + .long 7,0,`0xE1<<1`,0 +.align 64 +.type .Lrem_4bit,\@object +.Lrem_4bit: + .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` + .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` + .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16` + .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16` +.type .Lrem_8bit,\@object +.Lrem_8bit: + .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E + .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E + .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E + .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E + .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E + .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E + .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E + .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E + .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE + .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE + .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE + .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE + .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E + .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E + .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE + .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE + .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E + .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E + .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E + .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E + 
.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E + .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E + .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E + .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E + .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE + .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE + .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE + .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE + .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E + .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E + .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE + .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE + +.asciz "GHASH for x86_64, CRYPTOGAMS by " +.align 64 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + lea 24(%rax),%rax # adjust "rsp" + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_gcm_gmult_4bit + .rva .LSEH_end_gcm_gmult_4bit + .rva .LSEH_info_gcm_gmult_4bit + + .rva .LSEH_begin_gcm_ghash_4bit + .rva .LSEH_end_gcm_ghash_4bit + .rva .LSEH_info_gcm_ghash_4bit + + .rva .LSEH_begin_gcm_init_clmul + .rva .LSEH_end_gcm_init_clmul + .rva .LSEH_info_gcm_init_clmul + + .rva .LSEH_begin_gcm_ghash_clmul + .rva .LSEH_end_gcm_ghash_clmul + .rva .LSEH_info_gcm_ghash_clmul +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_gcm_init_avx + .rva .LSEH_end_gcm_init_avx + .rva .LSEH_info_gcm_init_clmul + + .rva .LSEH_begin_gcm_ghash_avx + 
.rva .LSEH_end_gcm_ghash_avx + .rva .LSEH_info_gcm_ghash_clmul +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_gcm_gmult_4bit: + .byte 9,0,0,0 + .rva se_handler + .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData +.LSEH_info_gcm_ghash_4bit: + .byte 9,0,0,0 + .rva se_handler + .rva .Lghash_prologue,.Lghash_epilogue # HandlerData +.LSEH_info_gcm_init_clmul: + .byte 0x01,0x08,0x03,0x00 + .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 + .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18 +.LSEH_info_gcm_ghash_clmul: + .byte 0x01,0x33,0x16,0x00 + .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 + .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 + .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 + .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 + .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 + .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 + .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 + .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 + .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl b/openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl new file mode 100755 index 0000000..f0598cb --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl @@ -0,0 +1,670 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for for PowerISA v2.07. +# +# July 2014 +# +# Accurate performance measurements are problematic, because it's +# always virtualized setup with possibly throttled processor. +# Relative comparison is therefore more informative. This initial +# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x +# faster than "4-bit" integer-only compiler-generated 64-bit code. +# "Initial version" means that there is room for futher improvement. + +# May 2016 +# +# 2x aggregated reduction improves performance by 50% (resulting +# performance on POWER8 is 1 cycle per processed byte), and 4x +# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb). 
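+#
+# For reference, "aggregated reduction" folds several input blocks into a
+# single modular reduction; the same identities are spelled out in the
+# x86_64 and ARMv8 GHASH modules. With Xi the running hash and Ii, Ii+1,...
+# the next input blocks:
+#
+#	2x:	Xi+2 = [ H^2*(Ii+Xi) + H*Ii+1 ] mod P
+#	4x:	Xi+4 = [ H^4*(Ii+Xi) + H^3*Ii+1 + H^2*Ii+2 + H*Ii+3 ] mod P
+#
+# which is why gcm_init_p8 below pre-computes and stores H^2, H^3 and H^4
+# alongside H.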
+ +$flavour=shift; +$output =shift; + +if ($flavour =~ /64/) { + $SIZE_T=8; + $LRSAVE=2*$SIZE_T; + $STU="stdu"; + $POP="ld"; + $PUSH="std"; + $UCMP="cmpld"; + $SHRI="srdi"; +} elsif ($flavour =~ /32/) { + $SIZE_T=4; + $LRSAVE=$SIZE_T; + $STU="stwu"; + $POP="lwz"; + $PUSH="stw"; + $UCMP="cmplw"; + $SHRI="srwi"; +} else { die "nonsense $flavour"; } + +$sp="r1"; +$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; + +my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block + +my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3)); +my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12)); +my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19)); +my $vrsave="r12"; + +$code=<<___; +.machine "any" + +.text + +.globl .gcm_init_p8 +.align 5 +.gcm_init_p8: + li r0,-4096 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $H,0,r4 # load H + + vspltisb $xC2,-16 # 0xf0 + vspltisb $t0,1 # one + vaddubm $xC2,$xC2,$xC2 # 0xe0 + vxor $zero,$zero,$zero + vor $xC2,$xC2,$t0 # 0xe1 + vsldoi $xC2,$xC2,$zero,15 # 0xe1... + vsldoi $t1,$zero,$t0,1 # ...1 + vaddubm $xC2,$xC2,$xC2 # 0xc2... + vspltisb $t2,7 + vor $xC2,$xC2,$t1 # 0xc2....01 + vspltb $t1,$H,0 # most significant byte + vsl $H,$H,$t0 # H<<=1 + vsrab $t1,$t1,$t2 # broadcast carry bit + vand $t1,$t1,$xC2 + vxor $IN,$H,$t1 # twisted H + + vsldoi $H,$IN,$IN,8 # twist even more ... + vsldoi $xC2,$zero,$xC2,8 # 0xc2.0 + vsldoi $Hl,$zero,$H,8 # ... and split + vsldoi $Hh,$H,$zero,8 + + stvx_u $xC2,0,r3 # save pre-computed table + stvx_u $Hl,r8,r3 + li r8,0x40 + stvx_u $H, r9,r3 + li r9,0x50 + stvx_u $Hh,r10,r3 + li r10,0x60 + + vpmsumd $Xl,$IN,$Hl # H.lo·H.lo + vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi + vpmsumd $Xh,$IN,$Hh # H.hi·H.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $IN1,$Xl,$t1 + + vsldoi $H2,$IN1,$IN1,8 + vsldoi $H2l,$zero,$H2,8 + vsldoi $H2h,$H2,$zero,8 + + stvx_u $H2l,r8,r3 # save H^2 + li r8,0x70 + stvx_u $H2,r9,r3 + li r9,0x80 + stvx_u $H2h,r10,r3 + li r10,0x90 +___ +{ +my ($t4,$t5,$t6) = ($Hl,$H,$Hh); +$code.=<<___; + vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo + vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo + vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi + vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi + vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi + vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vsldoi $t4,$Xm1,$zero,8 + vsldoi $t5,$zero,$Xm1,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + vxor $Xl1,$Xl1,$t4 + vxor $Xh1,$Xh1,$t5 + + vsldoi $Xl,$Xl,$Xl,8 + vsldoi $Xl1,$Xl1,$Xl1,8 + vxor $Xl,$Xl,$t2 + vxor $Xl1,$Xl1,$t6 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vpmsumd $Xl1,$Xl1,$xC2 + vxor $t1,$t1,$Xh + vxor $t5,$t5,$Xh1 + vxor $Xl,$Xl,$t1 + vxor $Xl1,$Xl1,$t5 + + vsldoi $H,$Xl,$Xl,8 + vsldoi $H2,$Xl1,$Xl1,8 + vsldoi $Hl,$zero,$H,8 + vsldoi $Hh,$H,$zero,8 + vsldoi $H2l,$zero,$H2,8 + vsldoi $H2h,$H2,$zero,8 + + stvx_u $Hl,r8,r3 
# save H^3 + li r8,0xa0 + stvx_u $H,r9,r3 + li r9,0xb0 + stvx_u $Hh,r10,r3 + li r10,0xc0 + stvx_u $H2l,r8,r3 # save H^4 + stvx_u $H2,r9,r3 + stvx_u $H2h,r10,r3 + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_init_p8,.-.gcm_init_p8 +___ +} +$code.=<<___; +.globl .gcm_gmult_p8 +.align 5 +.gcm_gmult_p8: + lis r0,0xfff8 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $IN,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $IN,$IN,$IN,$lemask + vxor $zero,$zero,$zero + + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $Xl,$Xl,$t1 + + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_gmult_p8,.-.gcm_gmult_p8 + +.globl .gcm_ghash_p8 +.align 5 +.gcm_ghash_p8: + li r0,-4096 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $Xl,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + li r8,0x40 + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + li r9,0x50 + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + li r10,0x60 + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $Xl,$Xl,$Xl,$lemask + vxor $zero,$zero,$zero + + ${UCMP}i $len,64 + bge Lgcm_ghash_p8_4x + + lvx_u $IN,0,$inp + addi $inp,$inp,16 + subic. $len,$len,16 + le?vperm $IN,$IN,$IN,$lemask + vxor $IN,$IN,$Xl + beq Lshort + + lvx_u $H2l,r8,$Htbl # load H^2 + li r8,16 + lvx_u $H2, r9,$Htbl + add r9,$inp,$len # end of input + lvx_u $H2h,r10,$Htbl + be?b Loop_2x + +.align 5 +Loop_2x: + lvx_u $IN1,0,$inp + le?vperm $IN1,$IN1,$IN1,$lemask + + subic $len,$len,32 + vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo + vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo + subfe r0,r0,r0 # borrow?-1:0 + vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi + vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi + and r0,r0,$len + vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi + vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi + add $inp,$inp,r0 + + vxor $Xl,$Xl,$Xl1 + vxor $Xm,$Xm,$Xm1 + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xh,$Xh,$Xh1 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + lvx_u $IN,r8,$inp + addi $inp,$inp,32 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + le?vperm $IN,$IN,$IN,$lemask + vxor $t1,$t1,$Xh + vxor $IN,$IN,$t1 + vxor $IN,$IN,$Xl + $UCMP r9,$inp + bgt Loop_2x # done yet? 
+ + cmplwi $len,0 + bne Leven + +Lshort: + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + +Leven: + vxor $Xl,$Xl,$t1 + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,4,0 + .long 0 +___ +{ +my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h, + $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31)); +my $IN0=$IN; +my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h); + +$code.=<<___; +.align 5 +.gcm_ghash_p8_4x: +Lgcm_ghash_p8_4x: + $STU $sp,-$FRAME($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + li r10,0x60 + stvx v31,r11,$sp + li r0,-1 + stw $vrsave,`$FRAME-4`($sp) # save vrsave + mtspr 256,r0 # preserve all AltiVec registers + + lvsl $t0,0,r8 # 0x0001..0e0f + #lvx_u $H2l,r8,$Htbl # load H^2 + li r8,0x70 + lvx_u $H2, r9,$Htbl + li r9,0x80 + vspltisb $t1,8 # 0x0808..0808 + #lvx_u $H2h,r10,$Htbl + li r10,0x90 + lvx_u $H3l,r8,$Htbl # load H^3 + li r8,0xa0 + lvx_u $H3, r9,$Htbl + li r9,0xb0 + lvx_u $H3h,r10,$Htbl + li r10,0xc0 + lvx_u $H4l,r8,$Htbl # load H^4 + li r8,0x10 + lvx_u $H4, r9,$Htbl + li r9,0x20 + lvx_u $H4h,r10,$Htbl + li r10,0x30 + + vsldoi $t2,$zero,$t1,8 # 0x0000..0808 + vaddubm $hiperm,$t0,$t2 # 0x0001..1617 + vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f + + $SHRI $len,$len,4 # this allows to use sign bit + # as carry + lvx_u $IN0,0,$inp # load input + lvx_u $IN1,r8,$inp + subic. $len,$len,8 + lvx_u $IN2,r9,$inp + lvx_u $IN3,r10,$inp + addi $inp,$inp,0x40 + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + le?vperm $IN3,$IN3,$IN3,$lemask + + vxor $Xh,$IN0,$Xl + + vpmsumd $Xl1,$IN1,$H3l + vpmsumd $Xm1,$IN1,$H3 + vpmsumd $Xh1,$IN1,$H3h + + vperm $H21l,$H2,$H,$hiperm + vperm $t0,$IN2,$IN3,$loperm + vperm $H21h,$H2,$H,$loperm + vperm $t1,$IN2,$IN3,$hiperm + vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo + vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo + vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi + vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi + + vxor $Xm2,$Xm2,$Xm1 + vxor $Xl3,$Xl3,$Xl1 + vxor $Xm3,$Xm3,$Xm2 + vxor $Xh3,$Xh3,$Xh1 + + blt Ltail_4x + +Loop_4x: + lvx_u $IN0,0,$inp + lvx_u $IN1,r8,$inp + subic. 
$len,$len,4 + lvx_u $IN2,r9,$inp + lvx_u $IN3,r10,$inp + addi $inp,$inp,0x40 + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + le?vperm $IN3,$IN3,$IN3,$lemask + le?vperm $IN0,$IN0,$IN0,$lemask + + vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo + vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi + vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi + vpmsumd $Xl1,$IN1,$H3l + vpmsumd $Xm1,$IN1,$H3 + vpmsumd $Xh1,$IN1,$H3h + + vxor $Xl,$Xl,$Xl3 + vxor $Xm,$Xm,$Xm3 + vxor $Xh,$Xh,$Xh3 + vperm $t0,$IN2,$IN3,$loperm + vperm $t1,$IN2,$IN3,$hiperm + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo + vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi + vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi + vpmsumd $Xl,$Xl,$xC2 + + vxor $Xl3,$Xl3,$Xl1 + vxor $Xh3,$Xh3,$Xh1 + vxor $Xh,$Xh,$IN0 + vxor $Xm2,$Xm2,$Xm1 + vxor $Xh,$Xh,$t1 + vxor $Xm3,$Xm3,$Xm2 + vxor $Xh,$Xh,$Xl + bge Loop_4x + +Ltail_4x: + vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo + vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi + vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi + + vxor $Xl,$Xl,$Xl3 + vxor $Xm,$Xm,$Xm3 + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xh,$Xh,$Xh3 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $Xl,$Xl,$t1 + + addic. $len,$len,4 + beq Ldone_4x + + lvx_u $IN0,0,$inp + ${UCMP}i $len,2 + li $len,-4 + blt Lone + lvx_u $IN1,r8,$inp + beq Ltwo + +Lthree: + lvx_u $IN2,r9,$inp + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + + vxor $Xh,$IN0,$Xl + vmr $H4l,$H3l + vmr $H4, $H3 + vmr $H4h,$H3h + + vperm $t0,$IN1,$IN2,$loperm + vperm $t1,$IN1,$IN2,$hiperm + vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo + vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi + vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo + vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi + + vxor $Xm3,$Xm3,$Xm2 + b Ltail_4x + +.align 4 +Ltwo: + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + + vxor $Xh,$IN0,$Xl + vperm $t0,$zero,$IN1,$loperm + vperm $t1,$zero,$IN1,$hiperm + + vsldoi $H4l,$zero,$H2,8 + vmr $H4, $H2 + vsldoi $H4h,$H2,$zero,8 + + vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo + vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi + vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi + + b Ltail_4x + +.align 4 +Lone: + le?vperm $IN0,$IN0,$IN0,$lemask + + vsldoi $H4l,$zero,$H,8 + vmr $H4, $H + vsldoi $H4h,$H,$zero,8 + + vxor $Xh,$IN0,$Xl + vxor $Xl3,$Xl3,$Xl3 + vxor $Xm3,$Xm3,$Xm3 + vxor $Xh3,$Xh3,$Xh3 + + b Ltail_4x + +Ldone_4x: + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mtspr 256,$vrsave + lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,0x04,0,0x80,0,4,0 + .long 0 +___ 
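+# The 4x code path above relies on the table layout produced by
+# gcm_init_p8: at 16-byte offsets from Htable it finds the 0xc2.. reduction
+# constant (0x00), then the lo/full/hi pieces of twisted H (0x10-0x30),
+# of H^2 (0x40-0x60), of H^3 (0x70-0x90) and of H^4 (0xa0-0xc0). H^3 and
+# H^4 are loaded directly, while H and H^2 are repacked into H21l/H21h
+# with vperm.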
+} +$code.=<<___; +.size .gcm_ghash_p8,.-.gcm_ghash_p8 + +.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by " +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + if ($flavour =~ /le$/o) { # little-endian + s/le\?//o or + s/be\?/#be#/o; + } else { + s/le\?/#le#/o or + s/be\?//o; + } + print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/openssl-1.1.0h/crypto/modes/asm/ghashv8-armx.pl b/openssl-1.1.0h/crypto/modes/asm/ghashv8-armx.pl new file mode 100644 index 0000000..dcd5f59 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/ghashv8-armx.pl @@ -0,0 +1,430 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. +# +# June 2014 +# +# Initial version was developed in tight cooperation with Ard +# Biesheuvel from bits-n-pieces from +# other assembly modules. Just like aesv8-armx.pl this module +# supports both AArch32 and AArch64 execution modes. +# +# July 2014 +# +# Implement 2x aggregated reduction [see ghash-x86.pl for background +# information]. +# +# Current performance in cycles per processed byte: +# +# PMULL[2] 32-bit NEON(*) +# Apple A7 0.92 5.62 +# Cortex-A53 1.01 8.39 +# Cortex-A57 1.17 7.61 +# Denver 0.71 6.02 +# Mongoose 1.10 8.06 +# +# (*) presented for reference/comparison purposes; + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$Xi="x0"; # argument block +$Htbl="x1"; +$inp="x2"; +$len="x3"; + +$inc="x12"; + +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14)); + +$code=<<___; +#include "arm_arch.h" + +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=<<___ if ($flavour !~ /64/); +.fpu neon +.code 32 +#undef __thumb2__ +___ + +################################################################################ +# void gcm_init_v8(u128 Htable[16],const u64 H[2]); +# +# input: 128-bit H - secret parameter E(K,0^128) +# output: precomputed table filled with degrees of twisted H; +# H is twisted to handle reverse bitness of GHASH; +# only few of 16 slots of Htable[16] are used; +# data is opaque to outside world (which allows to +# optimize the code independently); +# +$code.=<<___; +.global gcm_init_v8 +.type gcm_init_v8,%function +.align 4 +gcm_init_v8: + vld1.64 {$t1},[x1] @ load input H + vmov.i8 $xC2,#0xe1 + vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 + vext.8 $IN,$t1,$t1,#8 + vshr.u64 $t2,$xC2,#63 + vdup.32 $t1,${t1}[1] + vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01 + vshr.u64 $t2,$IN,#63 + vshr.s32 $t1,$t1,#31 @ broadcast carry bit + vand $t2,$t2,$t0 + vshl.i64 $IN,$IN,#1 + vext.8 $t2,$t2,$t2,#8 + vand 
$t0,$t0,$t1 + vorr $IN,$IN,$t2 @ H<<<=1 + veor $H,$IN,$t0 @ twisted H + vst1.64 {$H},[x0],#16 @ store Htable[0] + + @ calculate H^2 + vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing + vpmull.p64 $Xl,$H,$H + veor $t0,$t0,$H + vpmull2.p64 $Xh,$H,$H + vpmull.p64 $Xm,$t0,$t0 + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $H2,$Xl,$t2 + + vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing + veor $t1,$t1,$H2 + vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed + vst1.64 {$Hhl-$H2},[x0] @ store Htable[1..2] + + ret +.size gcm_init_v8,.-gcm_init_v8 +___ +################################################################################ +# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]); +# +# input: Xi - current hash value; +# Htable - table precomputed in gcm_init_v8; +# output: Xi - next hash value Xi; +# +$code.=<<___; +.global gcm_gmult_v8 +.type gcm_gmult_v8,%function +.align 4 +gcm_gmult_v8: + vld1.64 {$t1},[$Xi] @ load Xi + vmov.i8 $xC2,#0xe1 + vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... + vshl.u64 $xC2,$xC2,#57 +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + vext.8 $IN,$t1,$t1,#8 + + vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo + veor $t1,$t1,$IN @ Karatsuba pre-processing + vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi + vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl +#endif + vext.8 $Xl,$Xl,$Xl,#8 + vst1.64 {$Xl},[$Xi] @ write out Xi + + ret +.size gcm_gmult_v8,.-gcm_gmult_v8 +___ +################################################################################ +# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +# +# input: table precomputed in gcm_init_v8; +# current hash value Xi; +# pointer to input data; +# length of input data in bytes, but divisible by block size; +# output: next hash value Xi; +# +$code.=<<___; +.global gcm_ghash_v8 +.type gcm_ghash_v8,%function +.align 4 +gcm_ghash_v8: +___ +$code.=<<___ if ($flavour !~ /64/); + vstmdb sp!,{d8-d15} @ 32-bit ABI says so +___ +$code.=<<___; + vld1.64 {$Xl},[$Xi] @ load [rotated] Xi + @ "[rotated]" means that + @ loaded value would have + @ to be rotated in order to + @ make it appear as in + @ alorithm specification + subs $len,$len,#32 @ see if $len is 32 or larger + mov $inc,#16 @ $inc is used as post- + @ increment for input pointer; + @ as loop is modulo-scheduled + @ $inc is zeroed just in time + @ to preclude oversteping + @ inp[len], which means that + @ last block[s] are actually + @ loaded twice, but last + @ copy is not processed + vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2 + vmov.i8 $xC2,#0xe1 + vld1.64 {$H2},[$Htbl] + cclr $inc,eq @ is it time to zero $inc? 
+ vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi + vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0] + vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant +#ifndef __ARMEB__ + vrev64.8 $t0,$t0 + vrev64.8 $Xl,$Xl +#endif + vext.8 $IN,$t0,$t0,#8 @ rotate I[0] + b.lo .Lodd_tail_v8 @ $len was less than 32 +___ +{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7)); + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # +$code.=<<___; + vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1] +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + vext.8 $In,$t1,$t1,#8 + veor $IN,$IN,$Xl @ I[i]^=Xi + vpmull.p64 $Xln,$H,$In @ H·Ii+1 + veor $t1,$t1,$In @ Karatsuba pre-processing + vpmull2.p64 $Xhn,$H,$In + b .Loop_mod2x_v8 + +.align 4 +.Loop_mod2x_v8: + vext.8 $t2,$IN,$IN,#8 + subs $len,$len,#32 @ is there more data? + vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo + cclr $inc,lo @ is it time to zero $inc? + + vpmull.p64 $Xmn,$Hhl,$t1 + veor $t2,$t2,$IN @ Karatsuba pre-processing + vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi + veor $Xl,$Xl,$Xln @ accumulate + vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2] + + veor $Xh,$Xh,$Xhn + cclr $inc,eq @ is it time to zero $inc? + veor $Xm,$Xm,$Xmn + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3] +#ifndef __ARMEB__ + vrev64.8 $t0,$t0 +#endif + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vext.8 $In,$t1,$t1,#8 + vext.8 $IN,$t0,$t0,#8 + veor $Xl,$Xm,$t2 + vpmull.p64 $Xln,$H,$In @ H·Ii+1 + veor $IN,$IN,$Xh @ accumulate $IN early + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $IN,$IN,$t2 + veor $t1,$t1,$In @ Karatsuba pre-processing + veor $IN,$IN,$Xl + vpmull2.p64 $Xhn,$H,$In + b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes + + veor $Xh,$Xh,$t2 + vext.8 $IN,$t0,$t0,#8 @ re-construct $IN + adds $len,$len,#32 @ re-construct $len + veor $Xl,$Xl,$Xh @ re-construct $Xl + b.eq .Ldone_v8 @ is $len zero? 
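+	@ fall through: exactly one 16-byte block remains; it is handled
+	@ by the single-block .Lodd_tail_v8 code below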
+___ +} +$code.=<<___; +.Lodd_tail_v8: + vext.8 $t2,$Xl,$Xl,#8 + veor $IN,$IN,$Xl @ inp^=Xi + veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi + + vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo + veor $t1,$t1,$IN @ Karatsuba pre-processing + vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi + vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + +.Ldone_v8: +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl +#endif + vext.8 $Xl,$Xl,$Xl,#8 + vst1.64 {$Xl},[$Xi] @ write out Xi + +___ +$code.=<<___ if ($flavour !~ /64/); + vldmia sp!,{d8-d15} @ 32-bit ABI says so +___ +$code.=<<___; + ret +.size gcm_ghash_v8,.-gcm_ghash_v8 +___ +} +$code.=<<___; +.asciz "GHASH for ARMv8, CRYPTOGAMS by " +.align 2 +___ + +if ($flavour =~ /64/) { ######## 64-bit code + sub unvmov { + my $arg=shift; + + $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && + sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1; + } + foreach(split("\n",$code)) { + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vmov\s+(.*)/unvmov($1)/geo or + s/vext\.8/ext/o or + s/vshr\.s/sshr\.s/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/@\s/\/\//o; # old->new style commentary + + # fix up remainig legacy suffixes + s/\.[ui]?8(\s)/$1/o; + s/\.[uis]?32//o and s/\.16b/\.4s/go; + m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument + m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments + s/\.[uisp]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + print $_,"\n"; + } +} else { ######## 32-bit code + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + sub unvpmullp64 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { + my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + $word |= 0x00010001 if ($mnemonic =~ "2"); + # since ARMv7 instructions are always encoded little-endian. 
+ # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } + + foreach(split("\n",$code)) { + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remainig new-style suffixes + s/\],#[0-9]+/]!/o; + + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT; # enforce flush diff --git a/openssl-1.1.0h/crypto/modes/build.info b/openssl-1.1.0h/crypto/modes/build.info new file mode 100644 index 0000000..38195c4 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/build.info @@ -0,0 +1,27 @@ +LIBS=../../libcrypto +SOURCE[../../libcrypto]=\ + cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \ + ccm128.c xts128.c wrap128.c ocb128.c \ + {- $target{modes_asm_src} -} + +INCLUDE[gcm128.o]=.. + +GENERATE[ghash-ia64.s]=asm/ghash-ia64.pl $(CFLAGS) $(LIB_CFLAGS) +GENERATE[ghash-x86.s]=asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR) +GENERATE[ghash-x86_64.s]=asm/ghash-x86_64.pl $(PERLASM_SCHEME) +GENERATE[aesni-gcm-x86_64.s]=asm/aesni-gcm-x86_64.pl $(PERLASM_SCHEME) +GENERATE[ghash-sparcv9.S]=asm/ghash-sparcv9.pl $(PERLASM_SCHEME) +INCLUDE[ghash-sparcv9.o]=.. +GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl $(PERLASM_SCHEME) +GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl $(PERLASM_SCHEME) +GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl $(PERLASM_SCHEME) +GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl $(PERLASM_SCHEME) +INCLUDE[ghash-armv4.o]=.. +GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl $(PERLASM_SCHEME) +INCLUDE[ghashv8-armx.o]=.. + +BEGINRAW[Makefile] +# GNU make "catch all" +{- $builddir -}/ghash-%.S: {- $sourcedir -}/asm/ghash-%.pl + CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ +ENDRAW[Makefile] diff --git a/openssl-1.1.0h/crypto/modes/cbc128.c b/openssl-1.1.0h/crypto/modes/cbc128.c new file mode 100644 index 0000000..4ce5eb2 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/cbc128.c @@ -0,0 +1,161 @@ +/* + * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include "modes_lcl.h" +#include + +#if !defined(STRICT_ALIGNMENT) && !defined(PEDANTIC) +# define STRICT_ALIGNMENT 0 +#endif + +void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], block128_f block) +{ + size_t n; + const unsigned char *iv = ivec; + + if (len == 0) + return; + +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (STRICT_ALIGNMENT && + ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { + while (len >= 16) { + for (n = 0; n < 16; ++n) + out[n] = in[n] ^ iv[n]; + (*block) (out, out, key); + iv = out; + len -= 16; + in += 16; + out += 16; + } + } else { + while (len >= 16) { + for (n = 0; n < 16; n += sizeof(size_t)) + *(size_t *)(out + n) = + *(size_t *)(in + n) ^ *(size_t *)(iv + n); + (*block) (out, out, key); + iv = out; + len -= 16; + in += 16; + out += 16; + } + } +#endif + while (len) { + for (n = 0; n < 16 && n < len; ++n) + out[n] = in[n] ^ iv[n]; + for (; n < 16; ++n) + out[n] = iv[n]; + (*block) (out, out, key); + iv = out; + if (len <= 16) + break; + len -= 16; + in += 16; + out += 16; + } + memcpy(ivec, iv, 16); +} + +void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], block128_f block) +{ + size_t n; + union { + size_t t[16 / sizeof(size_t)]; + unsigned char c[16]; + } tmp; + + if (len == 0) + return; + +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (in != out) { + const unsigned char *iv = ivec; + + if (STRICT_ALIGNMENT && + ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { + while (len >= 16) { + (*block) (in, out, key); + for (n = 0; n < 16; ++n) + out[n] ^= iv[n]; + iv = in; + len -= 16; + in += 16; + out += 16; + } + } else if (16 % sizeof(size_t) == 0) { /* always true */ + while (len >= 16) { + size_t *out_t = (size_t *)out, *iv_t = (size_t *)iv; + + (*block) (in, out, key); + for (n = 0; n < 16 / sizeof(size_t); n++) + out_t[n] ^= iv_t[n]; + iv = in; + len -= 16; + in += 16; + out += 16; + } + } + memcpy(ivec, iv, 16); + } else { + if (STRICT_ALIGNMENT && + ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { + unsigned char c; + while (len >= 16) { + (*block) (in, tmp.c, key); + for (n = 0; n < 16; ++n) { + c = in[n]; + out[n] = tmp.c[n] ^ ivec[n]; + ivec[n] = c; + } + len -= 16; + in += 16; + out += 16; + } + } else if (16 % sizeof(size_t) == 0) { /* always true */ + while (len >= 16) { + size_t c, *out_t = (size_t *)out, *ivec_t = (size_t *)ivec; + const size_t *in_t = (const size_t *)in; + + (*block) (in, tmp.c, key); + for (n = 0; n < 16 / sizeof(size_t); n++) { + c = in_t[n]; + out_t[n] = tmp.t[n] ^ ivec_t[n]; + ivec_t[n] = c; + } + len -= 16; + in += 16; + out += 16; + } + } + } +#endif + while (len) { + unsigned char c; + (*block) (in, tmp.c, key); + for (n = 0; n < 16 && n < len; ++n) { + c = in[n]; + out[n] = tmp.c[n] ^ ivec[n]; + ivec[n] = c; + } + if (len <= 16) { + for (; n < 16; ++n) + ivec[n] = in[n]; + break; + } + len -= 16; + in += 16; + out += 16; + } +} diff --git a/openssl-1.1.0h/crypto/modes/ccm128.c b/openssl-1.1.0h/crypto/modes/ccm128.c new file mode 100644 index 0000000..85ce84f --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/ccm128.c @@ -0,0 +1,432 @@ +/* + * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). 
You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include "modes_lcl.h" +#include + +/* + * First you setup M and L parameters and pass the key schedule. This is + * called once per session setup... + */ +void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, + unsigned int M, unsigned int L, void *key, + block128_f block) +{ + memset(ctx->nonce.c, 0, sizeof(ctx->nonce.c)); + ctx->nonce.c[0] = ((u8)(L - 1) & 7) | (u8)(((M - 2) / 2) & 7) << 3; + ctx->blocks = 0; + ctx->block = block; + ctx->key = key; +} + +/* !!! Following interfaces are to be called *once* per packet !!! */ + +/* Then you setup per-message nonce and pass the length of the message */ +int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, + const unsigned char *nonce, size_t nlen, size_t mlen) +{ + unsigned int L = ctx->nonce.c[0] & 7; /* the L parameter */ + + if (nlen < (14 - L)) + return -1; /* nonce is too short */ + + if (sizeof(mlen) == 8 && L >= 3) { + ctx->nonce.c[8] = (u8)(mlen >> (56 % (sizeof(mlen) * 8))); + ctx->nonce.c[9] = (u8)(mlen >> (48 % (sizeof(mlen) * 8))); + ctx->nonce.c[10] = (u8)(mlen >> (40 % (sizeof(mlen) * 8))); + ctx->nonce.c[11] = (u8)(mlen >> (32 % (sizeof(mlen) * 8))); + } else + ctx->nonce.u[1] = 0; + + ctx->nonce.c[12] = (u8)(mlen >> 24); + ctx->nonce.c[13] = (u8)(mlen >> 16); + ctx->nonce.c[14] = (u8)(mlen >> 8); + ctx->nonce.c[15] = (u8)mlen; + + ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */ + memcpy(&ctx->nonce.c[1], nonce, 14 - L); + + return 0; +} + +/* Then you pass additional authentication data, this is optional */ +void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, + const unsigned char *aad, size_t alen) +{ + unsigned int i; + block128_f block = ctx->block; + + if (alen == 0) + return; + + ctx->nonce.c[0] |= 0x40; /* set Adata flag */ + (*block) (ctx->nonce.c, ctx->cmac.c, ctx->key), ctx->blocks++; + + if (alen < (0x10000 - 0x100)) { + ctx->cmac.c[0] ^= (u8)(alen >> 8); + ctx->cmac.c[1] ^= (u8)alen; + i = 2; + } else if (sizeof(alen) == 8 + && alen >= (size_t)1 << (32 % (sizeof(alen) * 8))) { + ctx->cmac.c[0] ^= 0xFF; + ctx->cmac.c[1] ^= 0xFF; + ctx->cmac.c[2] ^= (u8)(alen >> (56 % (sizeof(alen) * 8))); + ctx->cmac.c[3] ^= (u8)(alen >> (48 % (sizeof(alen) * 8))); + ctx->cmac.c[4] ^= (u8)(alen >> (40 % (sizeof(alen) * 8))); + ctx->cmac.c[5] ^= (u8)(alen >> (32 % (sizeof(alen) * 8))); + ctx->cmac.c[6] ^= (u8)(alen >> 24); + ctx->cmac.c[7] ^= (u8)(alen >> 16); + ctx->cmac.c[8] ^= (u8)(alen >> 8); + ctx->cmac.c[9] ^= (u8)alen; + i = 10; + } else { + ctx->cmac.c[0] ^= 0xFF; + ctx->cmac.c[1] ^= 0xFE; + ctx->cmac.c[2] ^= (u8)(alen >> 24); + ctx->cmac.c[3] ^= (u8)(alen >> 16); + ctx->cmac.c[4] ^= (u8)(alen >> 8); + ctx->cmac.c[5] ^= (u8)alen; + i = 6; + } + + do { + for (; i < 16 && alen; ++i, ++aad, --alen) + ctx->cmac.c[i] ^= *aad; + (*block) (ctx->cmac.c, ctx->cmac.c, ctx->key), ctx->blocks++; + i = 0; + } while (alen); +} + +/* Finally you encrypt or decrypt the message */ + +/* + * counter part of nonce may not be larger than L*8 bits, L is not larger + * than 8, therefore 64-bit counter... 
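+ * (With L <= 8 the counter field always lies within the last eight bytes
+ * of ctx->nonce.c, which is exactly the region ctr64_inc() below bumps as
+ * one big-endian integer; because the reconstructed length must fit in L
+ * bytes, the increment can never carry into the nonce bytes.)
+ *
+ * As a usage illustration only -- a sketch that is not part of the
+ * original sources, assuming an AES key schedule ks prepared with
+ * AES_set_encrypt_key() and caller-supplied nonce (11 bytes), aad, msg,
+ * out and tag buffers -- the once-per-packet sequence described above
+ * looks roughly like:
+ *
+ *      CCM128_CONTEXT c;
+ *      CRYPTO_ccm128_init(&c, 16, 4, &ks, (block128_f)AES_encrypt);
+ *      if (CRYPTO_ccm128_setiv(&c, nonce, 11, msg_len) == 0) {
+ *          CRYPTO_ccm128_aad(&c, aad, aad_len);
+ *          if (CRYPTO_ccm128_encrypt(&c, msg, out, msg_len) == 0)
+ *              CRYPTO_ccm128_tag(&c, tag, 16);     /* M == 16 above */
+ *      }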
+ */ +static void ctr64_inc(unsigned char *counter) +{ + unsigned int n = 8; + u8 c; + + counter += 8; + do { + --n; + c = counter[n]; + ++c; + counter[n] = c; + if (c) + return; + } while (n); +} + +int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len) +{ + size_t n; + unsigned int i, L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void *key = ctx->key; + union { + u64 u[2]; + u8 c[16]; + } scratch; + + if (!(flags0 & 0x40)) + (*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++; + + ctx->nonce.c[0] = L = flags0 & 7; + for (n = 0, i = 15 - L; i < 15; ++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i] = 0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15] = 1; + + if (n != len) + return -1; /* length mismatch */ + + ctx->blocks += ((len + 15) >> 3) | 1; + if (ctx->blocks > (U64(1) << 61)) + return -2; /* too much data */ + + while (len >= 16) { +#if defined(STRICT_ALIGNMENT) + union { + u64 u[2]; + u8 c[16]; + } temp; + + memcpy(temp.c, inp, 16); + ctx->cmac.u[0] ^= temp.u[0]; + ctx->cmac.u[1] ^= temp.u[1]; +#else + ctx->cmac.u[0] ^= ((u64 *)inp)[0]; + ctx->cmac.u[1] ^= ((u64 *)inp)[1]; +#endif + (*block) (ctx->cmac.c, ctx->cmac.c, key); + (*block) (ctx->nonce.c, scratch.c, key); + ctr64_inc(ctx->nonce.c); +#if defined(STRICT_ALIGNMENT) + temp.u[0] ^= scratch.u[0]; + temp.u[1] ^= scratch.u[1]; + memcpy(out, temp.c, 16); +#else + ((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0]; + ((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1]; +#endif + inp += 16; + out += 16; + len -= 16; + } + + if (len) { + for (i = 0; i < len; ++i) + ctx->cmac.c[i] ^= inp[i]; + (*block) (ctx->cmac.c, ctx->cmac.c, key); + (*block) (ctx->nonce.c, scratch.c, key); + for (i = 0; i < len; ++i) + out[i] = scratch.c[i] ^ inp[i]; + } + + for (i = 15 - L; i < 16; ++i) + ctx->nonce.c[i] = 0; + + (*block) (ctx->nonce.c, scratch.c, key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len) +{ + size_t n; + unsigned int i, L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void *key = ctx->key; + union { + u64 u[2]; + u8 c[16]; + } scratch; + + if (!(flags0 & 0x40)) + (*block) (ctx->nonce.c, ctx->cmac.c, key); + + ctx->nonce.c[0] = L = flags0 & 7; + for (n = 0, i = 15 - L; i < 15; ++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i] = 0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15] = 1; + + if (n != len) + return -1; + + while (len >= 16) { +#if defined(STRICT_ALIGNMENT) + union { + u64 u[2]; + u8 c[16]; + } temp; +#endif + (*block) (ctx->nonce.c, scratch.c, key); + ctr64_inc(ctx->nonce.c); +#if defined(STRICT_ALIGNMENT) + memcpy(temp.c, inp, 16); + ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]); + ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]); + memcpy(out, scratch.c, 16); +#else + ctx->cmac.u[0] ^= (((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0]); + ctx->cmac.u[1] ^= (((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1]); +#endif + (*block) (ctx->cmac.c, ctx->cmac.c, key); + + inp += 16; + out += 16; + len -= 16; + } + + if (len) { + (*block) (ctx->nonce.c, scratch.c, key); + for (i = 0; i < len; ++i) + ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]); + (*block) (ctx->cmac.c, ctx->cmac.c, key); + } + + for (i = 15 - L; i < 16; ++i) + ctx->nonce.c[i] = 0; + + (*block) 
(ctx->nonce.c, scratch.c, key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +static void ctr64_add(unsigned char *counter, size_t inc) +{ + size_t n = 8, val = 0; + + counter += 8; + do { + --n; + val += counter[n] + (inc & 0xff); + counter[n] = (unsigned char)val; + val >>= 8; /* carry bit */ + inc >>= 8; + } while (n && (inc || val)); +} + +int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len, ccm128_f stream) +{ + size_t n; + unsigned int i, L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void *key = ctx->key; + union { + u64 u[2]; + u8 c[16]; + } scratch; + + if (!(flags0 & 0x40)) + (*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++; + + ctx->nonce.c[0] = L = flags0 & 7; + for (n = 0, i = 15 - L; i < 15; ++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i] = 0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15] = 1; + + if (n != len) + return -1; /* length mismatch */ + + ctx->blocks += ((len + 15) >> 3) | 1; + if (ctx->blocks > (U64(1) << 61)) + return -2; /* too much data */ + + if ((n = len / 16)) { + (*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c); + n *= 16; + inp += n; + out += n; + len -= n; + if (len) + ctr64_add(ctx->nonce.c, n / 16); + } + + if (len) { + for (i = 0; i < len; ++i) + ctx->cmac.c[i] ^= inp[i]; + (*block) (ctx->cmac.c, ctx->cmac.c, key); + (*block) (ctx->nonce.c, scratch.c, key); + for (i = 0; i < len; ++i) + out[i] = scratch.c[i] ^ inp[i]; + } + + for (i = 15 - L; i < 16; ++i) + ctx->nonce.c[i] = 0; + + (*block) (ctx->nonce.c, scratch.c, key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len, ccm128_f stream) +{ + size_t n; + unsigned int i, L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void *key = ctx->key; + union { + u64 u[2]; + u8 c[16]; + } scratch; + + if (!(flags0 & 0x40)) + (*block) (ctx->nonce.c, ctx->cmac.c, key); + + ctx->nonce.c[0] = L = flags0 & 7; + for (n = 0, i = 15 - L; i < 15; ++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i] = 0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15] = 1; + + if (n != len) + return -1; + + if ((n = len / 16)) { + (*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c); + n *= 16; + inp += n; + out += n; + len -= n; + if (len) + ctr64_add(ctx->nonce.c, n / 16); + } + + if (len) { + (*block) (ctx->nonce.c, scratch.c, key); + for (i = 0; i < len; ++i) + ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]); + (*block) (ctx->cmac.c, ctx->cmac.c, key); + } + + for (i = 15 - L; i < 16; ++i) + ctx->nonce.c[i] = 0; + + (*block) (ctx->nonce.c, scratch.c, key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len) +{ + unsigned int M = (ctx->nonce.c[0] >> 3) & 7; /* the M parameter */ + + M *= 2; + M += 2; + if (len < M) + return 0; + memcpy(tag, ctx->cmac.c, M); + return M; +} diff --git a/openssl-1.1.0h/crypto/modes/cfb128.c b/openssl-1.1.0h/crypto/modes/cfb128.c new file mode 100644 index 0000000..e439567 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/cfb128.c @@ -0,0 +1,198 @@ +/* + * Copyright 2008-2016 The OpenSSL Project 
Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include "modes_lcl.h" +#include + +/* + * The input and output encrypted as though 128bit cfb mode is being used. + * The extra state information to record how much of the 128bit block we have + * used is contained in *num; + */ +void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], int *num, + int enc, block128_f block) +{ + unsigned int n; + size_t l = 0; + + n = *num; + + if (enc) { +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (16 % sizeof(size_t) == 0) { /* always true actually */ + do { + while (n && len) { + *(out++) = ivec[n] ^= *(in++); + --len; + n = (n + 1) % 16; + } +# if defined(STRICT_ALIGNMENT) + if (((size_t)in | (size_t)out | (size_t)ivec) % + sizeof(size_t) != 0) + break; +# endif + while (len >= 16) { + (*block) (ivec, ivec, key); + for (; n < 16; n += sizeof(size_t)) { + *(size_t *)(out + n) = + *(size_t *)(ivec + n) ^= *(size_t *)(in + n); + } + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) { + (*block) (ivec, ivec, key); + while (len--) { + out[n] = ivec[n] ^= in[n]; + ++n; + } + } + *num = n; + return; + } while (0); + } + /* the rest would be commonly eliminated by x86* compiler */ +#endif + while (l < len) { + if (n == 0) { + (*block) (ivec, ivec, key); + } + out[l] = ivec[n] ^= in[l]; + ++l; + n = (n + 1) % 16; + } + *num = n; + } else { +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (16 % sizeof(size_t) == 0) { /* always true actually */ + do { + while (n && len) { + unsigned char c; + *(out++) = ivec[n] ^ (c = *(in++)); + ivec[n] = c; + --len; + n = (n + 1) % 16; + } +# if defined(STRICT_ALIGNMENT) + if (((size_t)in | (size_t)out | (size_t)ivec) % + sizeof(size_t) != 0) + break; +# endif + while (len >= 16) { + (*block) (ivec, ivec, key); + for (; n < 16; n += sizeof(size_t)) { + size_t t = *(size_t *)(in + n); + *(size_t *)(out + n) = *(size_t *)(ivec + n) ^ t; + *(size_t *)(ivec + n) = t; + } + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) { + (*block) (ivec, ivec, key); + while (len--) { + unsigned char c; + out[n] = ivec[n] ^ (c = in[n]); + ivec[n] = c; + ++n; + } + } + *num = n; + return; + } while (0); + } + /* the rest would be commonly eliminated by x86* compiler */ +#endif + while (l < len) { + unsigned char c; + if (n == 0) { + (*block) (ivec, ivec, key); + } + out[l] = ivec[n] ^ (c = in[l]); + ivec[n] = c; + ++l; + n = (n + 1) % 16; + } + *num = n; + } +} + +/* + * This expects a single block of size nbits for both in and out. 
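+ * (CRYPTO_cfb128_1_encrypt() and CRYPTO_cfb128_8_encrypt() below feed it
+ * one bit and one byte at a time, i.e. with nbits of 1 and 8
+ * respectively.)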
Note that + * it corrupts any extra bits in the last byte of out + */ +static void cfbr_encrypt_block(const unsigned char *in, unsigned char *out, + int nbits, const void *key, + unsigned char ivec[16], int enc, + block128_f block) +{ + int n, rem, num; + unsigned char ovec[16 * 2 + 1]; /* +1 because we dereference (but don't + * use) one byte off the end */ + + if (nbits <= 0 || nbits > 128) + return; + + /* fill in the first half of the new IV with the current IV */ + memcpy(ovec, ivec, 16); + /* construct the new IV */ + (*block) (ivec, ivec, key); + num = (nbits + 7) / 8; + if (enc) /* encrypt the input */ + for (n = 0; n < num; ++n) + out[n] = (ovec[16 + n] = in[n] ^ ivec[n]); + else /* decrypt the input */ + for (n = 0; n < num; ++n) + out[n] = (ovec[16 + n] = in[n]) ^ ivec[n]; + /* shift ovec left... */ + rem = nbits % 8; + num = nbits / 8; + if (rem == 0) + memcpy(ivec, ovec + num, 16); + else + for (n = 0; n < 16; ++n) + ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem); + + /* it is not necessary to cleanse ovec, since the IV is not secret */ +} + +/* N.B. This expects the input to be packed, MS bit first */ +void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out, + size_t bits, const void *key, + unsigned char ivec[16], int *num, + int enc, block128_f block) +{ + size_t n; + unsigned char c[1], d[1]; + + for (n = 0; n < bits; ++n) { + c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0; + cfbr_encrypt_block(c, d, 1, key, ivec, enc, block); + out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) | + ((d[0] & 0x80) >> (unsigned int)(n % 8)); + } +} + +void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const void *key, + unsigned char ivec[16], int *num, + int enc, block128_f block) +{ + size_t n; + + for (n = 0; n < length; ++n) + cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block); +} diff --git a/openssl-1.1.0h/crypto/modes/ctr128.c b/openssl-1.1.0h/crypto/modes/ctr128.c new file mode 100644 index 0000000..03920b4 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/ctr128.c @@ -0,0 +1,209 @@ +/* + * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include "modes_lcl.h" +#include + +/* + * NOTE: the IV/counter CTR mode is big-endian. The code itself is + * endian-neutral. + */ + +/* increment counter (128-bit int) by 1 */ +static void ctr128_inc(unsigned char *counter) +{ + u32 n = 16, c = 1; + + do { + --n; + c += counter[n]; + counter[n] = (u8)c; + c >>= 8; + } while (n); +} + +#if !defined(OPENSSL_SMALL_FOOTPRINT) +static void ctr128_inc_aligned(unsigned char *counter) +{ + size_t *data, c, d, n; + const union { + long one; + char little; + } is_endian = { + 1 + }; + + if (is_endian.little || ((size_t)counter % sizeof(size_t)) != 0) { + ctr128_inc(counter); + return; + } + + data = (size_t *)counter; + c = 1; + n = 16 / sizeof(size_t); + do { + --n; + d = data[n] += c; + /* did addition carry? */ + c = ((d - c) & ~d) >> (sizeof(size_t) * 8 - 1); + } while (n); +} +#endif + +/* + * The input encrypted as though 128bit counter mode is being used. 
The + * extra state information to record how much of the 128bit block we have + * used is contained in *num, and the encrypted counter is kept in + * ecount_buf. Both *num and ecount_buf must be initialised with zeros + * before the first call to CRYPTO_ctr128_encrypt(). This algorithm assumes + * that the counter is in the x lower bits of the IV (ivec), and that the + * application has full control over overflow and the rest of the IV. This + * implementation takes NO responsibility for checking that the counter + * doesn't overflow into the rest of the IV when incremented. + */ +void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], + unsigned char ecount_buf[16], unsigned int *num, + block128_f block) +{ + unsigned int n; + size_t l = 0; + + n = *num; + +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (16 % sizeof(size_t) == 0) { /* always true actually */ + do { + while (n && len) { + *(out++) = *(in++) ^ ecount_buf[n]; + --len; + n = (n + 1) % 16; + } + +# if defined(STRICT_ALIGNMENT) + if (((size_t)in | (size_t)out | (size_t)ecount_buf) + % sizeof(size_t) != 0) + break; +# endif + while (len >= 16) { + (*block) (ivec, ecount_buf, key); + ctr128_inc_aligned(ivec); + for (n = 0; n < 16; n += sizeof(size_t)) + *(size_t *)(out + n) = + *(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n); + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) { + (*block) (ivec, ecount_buf, key); + ctr128_inc_aligned(ivec); + while (len--) { + out[n] = in[n] ^ ecount_buf[n]; + ++n; + } + } + *num = n; + return; + } while (0); + } + /* the rest would be commonly eliminated by x86* compiler */ +#endif + while (l < len) { + if (n == 0) { + (*block) (ivec, ecount_buf, key); + ctr128_inc(ivec); + } + out[l] = in[l] ^ ecount_buf[n]; + ++l; + n = (n + 1) % 16; + } + + *num = n; +} + +/* increment upper 96 bits of 128-bit counter by 1 */ +static void ctr96_inc(unsigned char *counter) +{ + u32 n = 12, c = 1; + + do { + --n; + c += counter[n]; + counter[n] = (u8)c; + c >>= 8; + } while (n); +} + +void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], + unsigned char ecount_buf[16], + unsigned int *num, ctr128_f func) +{ + unsigned int n, ctr32; + + n = *num; + + while (n && len) { + *(out++) = *(in++) ^ ecount_buf[n]; + --len; + n = (n + 1) % 16; + } + + ctr32 = GETU32(ivec + 12); + while (len >= 16) { + size_t blocks = len / 16; + /* + * 1<<28 is just a not-so-small yet not-so-large number... + * Below condition is practically never met, but it has to + * be checked for code correctness. + */ + if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28)) + blocks = (1U << 28); + /* + * As (*func) operates on 32-bit counter, caller + * has to handle overflow. 'if' below detects the + * overflow, which is then handled by limiting the + * amount of blocks to the exact overflow point... + */ + ctr32 += (u32)blocks; + if (ctr32 < blocks) { + blocks -= ctr32; + ctr32 = 0; + } + (*func) (in, out, blocks, key, ivec); + /* (*ctr) does not update ivec, caller does: */ + PUTU32(ivec + 12, ctr32); + /* ... overflow was detected, propagate carry. 
*/ + if (ctr32 == 0) + ctr96_inc(ivec); + blocks *= 16; + len -= blocks; + out += blocks; + in += blocks; + } + if (len) { + memset(ecount_buf, 0, 16); + (*func) (ecount_buf, ecount_buf, 1, key, ivec); + ++ctr32; + PUTU32(ivec + 12, ctr32); + if (ctr32 == 0) + ctr96_inc(ivec); + while (len--) { + out[n] = in[n] ^ ecount_buf[n]; + ++n; + } + } + + *num = n; +} diff --git a/openssl-1.1.0h/crypto/modes/cts128.c b/openssl-1.1.0h/crypto/modes/cts128.c new file mode 100644 index 0000000..77ec994 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/cts128.c @@ -0,0 +1,523 @@ +/* + * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include "modes_lcl.h" +#include + +/* + * Trouble with Ciphertext Stealing, CTS, mode is that there is no + * common official specification, but couple of cipher/application + * specific ones: RFC2040 and RFC3962. Then there is 'Proposal to + * Extend CBC Mode By "Ciphertext Stealing"' at NIST site, which + * deviates from mentioned RFCs. Most notably it allows input to be + * of block length and it doesn't flip the order of the last two + * blocks. CTS is being discussed even in ECB context, but it's not + * adopted for any known application. This implementation provides + * two interfaces: one compliant with above mentioned RFCs and one + * compliant with the NIST proposal, both extending CBC mode. + */ + +size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, + unsigned char *out, size_t len, + const void *key, unsigned char ivec[16], + block128_f block) +{ + size_t residue, n; + + if (len <= 16) + return 0; + + if ((residue = len % 16) == 0) + residue = 16; + + len -= residue; + + CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block); + + in += len; + out += len; + + for (n = 0; n < residue; ++n) + ivec[n] ^= in[n]; + (*block) (ivec, ivec, key); + memcpy(out, out - 16, residue); + memcpy(out - 16, ivec, 16); + + return len + residue; +} + +size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, + unsigned char *out, size_t len, + const void *key, + unsigned char ivec[16], + block128_f block) +{ + size_t residue, n; + + if (len < 16) + return 0; + + residue = len % 16; + + len -= residue; + + CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block); + + if (residue == 0) + return len; + + in += len; + out += len; + + for (n = 0; n < residue; ++n) + ivec[n] ^= in[n]; + (*block) (ivec, ivec, key); + memcpy(out - 16 + residue, ivec, 16); + + return len + residue; +} + +size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], cbc128_f cbc) +{ + size_t residue; + union { + size_t align; + unsigned char c[16]; + } tmp; + + if (len <= 16) + return 0; + + if ((residue = len % 16) == 0) + residue = 16; + + len -= residue; + + (*cbc) (in, out, len, key, ivec, 1); + + in += len; + out += len; + +#if defined(CBC_HANDLES_TRUNCATED_IO) + memcpy(tmp.c, out - 16, 16); + (*cbc) (in, out - 16, residue, key, ivec, 1); + memcpy(out, tmp.c, residue); +#else + memset(tmp.c, 0, sizeof(tmp)); + memcpy(tmp.c, in, residue); + memcpy(out, out - 16, residue); + (*cbc) (tmp.c, out - 16, 16, key, ivec, 1); +#endif + return len + residue; +} + +size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, + size_t 
len, const void *key, + unsigned char ivec[16], cbc128_f cbc) +{ + size_t residue; + union { + size_t align; + unsigned char c[16]; + } tmp; + + if (len < 16) + return 0; + + residue = len % 16; + + len -= residue; + + (*cbc) (in, out, len, key, ivec, 1); + + if (residue == 0) + return len; + + in += len; + out += len; + +#if defined(CBC_HANDLES_TRUNCATED_IO) + (*cbc) (in, out - 16 + residue, residue, key, ivec, 1); +#else + memset(tmp.c, 0, sizeof(tmp)); + memcpy(tmp.c, in, residue); + (*cbc) (tmp.c, out - 16 + residue, 16, key, ivec, 1); +#endif + return len + residue; +} + +size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, + unsigned char *out, size_t len, + const void *key, unsigned char ivec[16], + block128_f block) +{ + size_t residue, n; + union { + size_t align; + unsigned char c[32]; + } tmp; + + if (len <= 16) + return 0; + + if ((residue = len % 16) == 0) + residue = 16; + + len -= 16 + residue; + + if (len) { + CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block); + in += len; + out += len; + } + + (*block) (in, tmp.c + 16, key); + + memcpy(tmp.c, tmp.c + 16, 16); + memcpy(tmp.c, in + 16, residue); + (*block) (tmp.c, tmp.c, key); + + for (n = 0; n < 16; ++n) { + unsigned char c = in[n]; + out[n] = tmp.c[n] ^ ivec[n]; + ivec[n] = c; + } + for (residue += 16; n < residue; ++n) + out[n] = tmp.c[n] ^ in[n]; + + return 16 + len + residue; +} + +size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, + unsigned char *out, size_t len, + const void *key, + unsigned char ivec[16], + block128_f block) +{ + size_t residue, n; + union { + size_t align; + unsigned char c[32]; + } tmp; + + if (len < 16) + return 0; + + residue = len % 16; + + if (residue == 0) { + CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block); + return len; + } + + len -= 16 + residue; + + if (len) { + CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block); + in += len; + out += len; + } + + (*block) (in + residue, tmp.c + 16, key); + + memcpy(tmp.c, tmp.c + 16, 16); + memcpy(tmp.c, in, residue); + (*block) (tmp.c, tmp.c, key); + + for (n = 0; n < 16; ++n) { + unsigned char c = in[n]; + out[n] = tmp.c[n] ^ ivec[n]; + ivec[n] = in[n + residue]; + tmp.c[n] = c; + } + for (residue += 16; n < residue; ++n) + out[n] = tmp.c[n] ^ tmp.c[n - 16]; + + return 16 + len + residue; +} + +size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], cbc128_f cbc) +{ + size_t residue; + union { + size_t align; + unsigned char c[32]; + } tmp; + + if (len <= 16) + return 0; + + if ((residue = len % 16) == 0) + residue = 16; + + len -= 16 + residue; + + if (len) { + (*cbc) (in, out, len, key, ivec, 0); + in += len; + out += len; + } + + memset(tmp.c, 0, sizeof(tmp)); + /* + * this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] + */ + (*cbc) (in, tmp.c, 16, key, tmp.c + 16, 0); + + memcpy(tmp.c, in + 16, residue); +#if defined(CBC_HANDLES_TRUNCATED_IO) + (*cbc) (tmp.c, out, 16 + residue, key, ivec, 0); +#else + (*cbc) (tmp.c, tmp.c, 32, key, ivec, 0); + memcpy(out, tmp.c, 16 + residue); +#endif + return 16 + len + residue; +} + +size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], cbc128_f cbc) +{ + size_t residue; + union { + size_t align; + unsigned char c[32]; + } tmp; + + if (len < 16) + return 0; + + residue = len % 16; + + if (residue == 0) { + (*cbc) (in, out, len, key, ivec, 0); + return len; + } + + len -= 16 + residue; + + if (len) { + (*cbc) 
(in, out, len, key, ivec, 0); + in += len; + out += len; + } + + memset(tmp.c, 0, sizeof(tmp)); + /* + * this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] + */ + (*cbc) (in + residue, tmp.c, 16, key, tmp.c + 16, 0); + + memcpy(tmp.c, in, residue); +#if defined(CBC_HANDLES_TRUNCATED_IO) + (*cbc) (tmp.c, out, 16 + residue, key, ivec, 0); +#else + (*cbc) (tmp.c, tmp.c, 32, key, ivec, 0); + memcpy(out, tmp.c, 16 + residue); +#endif + return 16 + len + residue; +} + +#if defined(SELFTEST) +# include +# include + +/* test vectors from RFC 3962 */ +static const unsigned char test_key[16] = "chicken teriyaki"; +static const unsigned char test_input[64] = + "I would like the" " General Gau's C" + "hicken, please, " "and wonton soup."; +static const unsigned char test_iv[16] = + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + +static const unsigned char vector_17[17] = { + 0xc6, 0x35, 0x35, 0x68, 0xf2, 0xbf, 0x8c, 0xb4, + 0xd8, 0xa5, 0x80, 0x36, 0x2d, 0xa7, 0xff, 0x7f, + 0x97 +}; + +static const unsigned char vector_31[31] = { + 0xfc, 0x00, 0x78, 0x3e, 0x0e, 0xfd, 0xb2, 0xc1, + 0xd4, 0x45, 0xd4, 0xc8, 0xef, 0xf7, 0xed, 0x22, + 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, + 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5 +}; + +static const unsigned char vector_32[32] = { + 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, + 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8, + 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, + 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84 +}; + +static const unsigned char vector_47[47] = { + 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, + 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84, + 0xb3, 0xff, 0xfd, 0x94, 0x0c, 0x16, 0xa1, 0x8c, + 0x1b, 0x55, 0x49, 0xd2, 0xf8, 0x38, 0x02, 0x9e, + 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, + 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5 +}; + +static const unsigned char vector_48[48] = { + 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, + 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84, + 0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0, + 0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8, + 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, + 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8 +}; + +static const unsigned char vector_64[64] = { + 0x97, 0x68, 0x72, 0x68, 0xd6, 0xec, 0xcc, 0xc0, + 0xc0, 0x7b, 0x25, 0xe2, 0x5e, 0xcf, 0xe5, 0x84, + 0x39, 0x31, 0x25, 0x23, 0xa7, 0x86, 0x62, 0xd5, + 0xbe, 0x7f, 0xcb, 0xcc, 0x98, 0xeb, 0xf5, 0xa8, + 0x48, 0x07, 0xef, 0xe8, 0x36, 0xee, 0x89, 0xa5, + 0x26, 0x73, 0x0d, 0xbc, 0x2f, 0x7b, 0xc8, 0x40, + 0x9d, 0xad, 0x8b, 0xbb, 0x96, 0xc4, 0xcd, 0xc0, + 0x3b, 0xc1, 0x03, 0xe1, 0xa1, 0x94, 0xbb, 0xd8 +}; + +static AES_KEY encks, decks; + +void test_vector(const unsigned char *vector, size_t len) +{ + unsigned char iv[sizeof(test_iv)]; + unsigned char cleartext[64], ciphertext[64]; + size_t tail; + + printf("vector_%d\n", len); + fflush(stdout); + + if ((tail = len % 16) == 0) + tail = 16; + tail += 16; + + /* test block-based encryption */ + memcpy(iv, test_iv, sizeof(test_iv)); + CRYPTO_cts128_encrypt_block(test_input, ciphertext, len, &encks, iv, + (block128_f) AES_encrypt); + if (memcmp(ciphertext, vector, len)) + fprintf(stderr, "output_%d mismatch\n", len), exit(1); + if (memcmp(iv, vector + len - tail, sizeof(iv))) + fprintf(stderr, "iv_%d mismatch\n", len), exit(1); + + /* test block-based decryption */ + memcpy(iv, test_iv, sizeof(test_iv)); + CRYPTO_cts128_decrypt_block(ciphertext, cleartext, len, &decks, iv, + (block128_f) AES_decrypt); + if (memcmp(cleartext, test_input, len)) + 
fprintf(stderr, "input_%d mismatch\n", len), exit(2); + if (memcmp(iv, vector + len - tail, sizeof(iv))) + fprintf(stderr, "iv_%d mismatch\n", len), exit(2); + + /* test streamed encryption */ + memcpy(iv, test_iv, sizeof(test_iv)); + CRYPTO_cts128_encrypt(test_input, ciphertext, len, &encks, iv, + (cbc128_f) AES_cbc_encrypt); + if (memcmp(ciphertext, vector, len)) + fprintf(stderr, "output_%d mismatch\n", len), exit(3); + if (memcmp(iv, vector + len - tail, sizeof(iv))) + fprintf(stderr, "iv_%d mismatch\n", len), exit(3); + + /* test streamed decryption */ + memcpy(iv, test_iv, sizeof(test_iv)); + CRYPTO_cts128_decrypt(ciphertext, cleartext, len, &decks, iv, + (cbc128_f) AES_cbc_encrypt); + if (memcmp(cleartext, test_input, len)) + fprintf(stderr, "input_%d mismatch\n", len), exit(4); + if (memcmp(iv, vector + len - tail, sizeof(iv))) + fprintf(stderr, "iv_%d mismatch\n", len), exit(4); +} + +void test_nistvector(const unsigned char *vector, size_t len) +{ + unsigned char iv[sizeof(test_iv)]; + unsigned char cleartext[64], ciphertext[64], nistvector[64]; + size_t tail; + + printf("nistvector_%d\n", len); + fflush(stdout); + + if ((tail = len % 16) == 0) + tail = 16; + + len -= 16 + tail; + memcpy(nistvector, vector, len); + /* flip two last blocks */ + memcpy(nistvector + len, vector + len + 16, tail); + memcpy(nistvector + len + tail, vector + len, 16); + len += 16 + tail; + tail = 16; + + /* test block-based encryption */ + memcpy(iv, test_iv, sizeof(test_iv)); + CRYPTO_nistcts128_encrypt_block(test_input, ciphertext, len, &encks, iv, + (block128_f) AES_encrypt); + if (memcmp(ciphertext, nistvector, len)) + fprintf(stderr, "output_%d mismatch\n", len), exit(1); + if (memcmp(iv, nistvector + len - tail, sizeof(iv))) + fprintf(stderr, "iv_%d mismatch\n", len), exit(1); + + /* test block-based decryption */ + memcpy(iv, test_iv, sizeof(test_iv)); + CRYPTO_nistcts128_decrypt_block(ciphertext, cleartext, len, &decks, iv, + (block128_f) AES_decrypt); + if (memcmp(cleartext, test_input, len)) + fprintf(stderr, "input_%d mismatch\n", len), exit(2); + if (memcmp(iv, nistvector + len - tail, sizeof(iv))) + fprintf(stderr, "iv_%d mismatch\n", len), exit(2); + + /* test streamed encryption */ + memcpy(iv, test_iv, sizeof(test_iv)); + CRYPTO_nistcts128_encrypt(test_input, ciphertext, len, &encks, iv, + (cbc128_f) AES_cbc_encrypt); + if (memcmp(ciphertext, nistvector, len)) + fprintf(stderr, "output_%d mismatch\n", len), exit(3); + if (memcmp(iv, nistvector + len - tail, sizeof(iv))) + fprintf(stderr, "iv_%d mismatch\n", len), exit(3); + + /* test streamed decryption */ + memcpy(iv, test_iv, sizeof(test_iv)); + CRYPTO_nistcts128_decrypt(ciphertext, cleartext, len, &decks, iv, + (cbc128_f) AES_cbc_encrypt); + if (memcmp(cleartext, test_input, len)) + fprintf(stderr, "input_%d mismatch\n", len), exit(4); + if (memcmp(iv, nistvector + len - tail, sizeof(iv))) + fprintf(stderr, "iv_%d mismatch\n", len), exit(4); +} + +int main() +{ + AES_set_encrypt_key(test_key, 128, &encks); + AES_set_decrypt_key(test_key, 128, &decks); + + test_vector(vector_17, sizeof(vector_17)); + test_vector(vector_31, sizeof(vector_31)); + test_vector(vector_32, sizeof(vector_32)); + test_vector(vector_47, sizeof(vector_47)); + test_vector(vector_48, sizeof(vector_48)); + test_vector(vector_64, sizeof(vector_64)); + + test_nistvector(vector_17, sizeof(vector_17)); + test_nistvector(vector_31, sizeof(vector_31)); + test_nistvector(vector_32, sizeof(vector_32)); + test_nistvector(vector_47, sizeof(vector_47)); + 
test_nistvector(vector_48, sizeof(vector_48)); + test_nistvector(vector_64, sizeof(vector_64)); + + return 0; +} +#endif diff --git a/openssl-1.1.0h/crypto/modes/gcm128.c b/openssl-1.1.0h/crypto/modes/gcm128.c new file mode 100644 index 0000000..a2b05c4 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/gcm128.c @@ -0,0 +1,2301 @@ +/* + * Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include "modes_lcl.h" +#include + +#if defined(BSWAP4) && defined(STRICT_ALIGNMENT) +/* redefine, because alignment is ensured */ +# undef GETU32 +# define GETU32(p) BSWAP4(*(const u32 *)(p)) +# undef PUTU32 +# define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) +#endif + +#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16)) +#define REDUCE1BIT(V) do { \ + if (sizeof(size_t)==8) { \ + u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \ + V.lo = (V.hi<<63)|(V.lo>>1); \ + V.hi = (V.hi>>1 )^T; \ + } \ + else { \ + u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \ + V.lo = (V.hi<<63)|(V.lo>>1); \ + V.hi = (V.hi>>1 )^((u64)T<<32); \ + } \ +} while(0) + +/*- + * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should + * never be set to 8. 8 is effectively reserved for testing purposes. + * TABLE_BITS>1 are lookup-table-driven implementations referred to as + * "Shoup's" in GCM specification. In other words OpenSSL does not cover + * whole spectrum of possible table driven implementations. Why? In + * non-"Shoup's" case memory access pattern is segmented in such manner, + * that it's trivial to see that cache timing information can reveal + * fair portion of intermediate hash value. Given that ciphertext is + * always available to attacker, it's possible for him to attempt to + * deduce secret parameter H and if successful, tamper with messages + * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's + * not as trivial, but there is no reason to believe that it's resistant + * to cache-timing attack. And the thing about "8-bit" implementation is + * that it consumes 16 (sixteen) times more memory, 4KB per individual + * key + 1KB shared. Well, on pros side it should be twice as fast as + * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version + * was observed to run ~75% faster, closer to 100% for commercial + * compilers... Yet "4-bit" procedure is preferred, because it's + * believed to provide better security-performance balance and adequate + * all-round performance. "All-round" refers to things like: + * + * - shorter setup time effectively improves overall timing for + * handling short messages; + * - larger table allocation can become unbearable because of VM + * subsystem penalties (for example on Windows large enough free + * results in VM working set trimming, meaning that consequent + * malloc would immediately incur working set expansion); + * - larger table has larger cache footprint, which can affect + * performance of other code paths (not necessarily even from same + * thread in Hyper-Threading world); + * + * Value of 1 is not appropriate for performance reasons. 
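+ *
+ * To put rough numbers on the footprint comparison above: a "4-bit"
+ * Htable holds 16 entries of 16 bytes, i.e. 256 bytes per key, whereas
+ * the "8-bit" table holds 256 such entries, i.e. 4KB per key -- the
+ * sixteen-fold difference mentioned earlier -- before counting the
+ * shared remainder tables.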
+ */ +#if TABLE_BITS==8 + +static void gcm_init_8bit(u128 Htable[256], u64 H[2]) +{ + int i, j; + u128 V; + + Htable[0].hi = 0; + Htable[0].lo = 0; + V.hi = H[0]; + V.lo = H[1]; + + for (Htable[128] = V, i = 64; i > 0; i >>= 1) { + REDUCE1BIT(V); + Htable[i] = V; + } + + for (i = 2; i < 256; i <<= 1) { + u128 *Hi = Htable + i, H0 = *Hi; + for (j = 1; j < i; ++j) { + Hi[j].hi = H0.hi ^ Htable[j].hi; + Hi[j].lo = H0.lo ^ Htable[j].lo; + } + } +} + +static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256]) +{ + u128 Z = { 0, 0 }; + const u8 *xi = (const u8 *)Xi + 15; + size_t rem, n = *xi; + const union { + long one; + char little; + } is_endian = { 1 }; + static const size_t rem_8bit[256] = { + PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246), + PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E), + PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56), + PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E), + PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66), + PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E), + PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076), + PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E), + PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06), + PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E), + PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416), + PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E), + PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626), + PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E), + PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836), + PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E), + PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6), + PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE), + PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6), + PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE), + PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6), + PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE), + PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6), + PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE), + PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86), + PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E), + PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496), + PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E), + PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6), + PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE), + PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6), + PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE), + PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346), + PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E), + PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56), + PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E), + PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66), + PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E), + PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176), + PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E), + PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06), + PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E), + PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516), + PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E), + PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726), + PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E), + PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936), + PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E), + PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6), + 
PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE), + PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6), + PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE), + PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6), + PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE), + PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6), + PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE), + PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86), + PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E), + PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596), + PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E), + PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6), + PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE), + PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6), + PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) + }; + + while (1) { + Z.hi ^= Htable[n].hi; + Z.lo ^= Htable[n].lo; + + if ((u8 *)Xi == xi) + break; + + n = *(--xi); + + rem = (size_t)Z.lo & 0xff; + Z.lo = (Z.hi << 56) | (Z.lo >> 8); + Z.hi = (Z.hi >> 8); + if (sizeof(size_t) == 8) + Z.hi ^= rem_8bit[rem]; + else + Z.hi ^= (u64)rem_8bit[rem] << 32; + } + + if (is_endian.little) { +# ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +# else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi >> 32); + PUTU32(p, v); + v = (u32)(Z.hi); + PUTU32(p + 4, v); + v = (u32)(Z.lo >> 32); + PUTU32(p + 8, v); + v = (u32)(Z.lo); + PUTU32(p + 12, v); +# endif + } else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } +} + +# define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable) + +#elif TABLE_BITS==4 + +static void gcm_init_4bit(u128 Htable[16], u64 H[2]) +{ + u128 V; +# if defined(OPENSSL_SMALL_FOOTPRINT) + int i; +# endif + + Htable[0].hi = 0; + Htable[0].lo = 0; + V.hi = H[0]; + V.lo = H[1]; + +# if defined(OPENSSL_SMALL_FOOTPRINT) + for (Htable[8] = V, i = 4; i > 0; i >>= 1) { + REDUCE1BIT(V); + Htable[i] = V; + } + + for (i = 2; i < 16; i <<= 1) { + u128 *Hi = Htable + i; + int j; + for (V = *Hi, j = 1; j < i; ++j) { + Hi[j].hi = V.hi ^ Htable[j].hi; + Hi[j].lo = V.lo ^ Htable[j].lo; + } + } +# else + Htable[8] = V; + REDUCE1BIT(V); + Htable[4] = V; + REDUCE1BIT(V); + Htable[2] = V; + REDUCE1BIT(V); + Htable[1] = V; + Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo; + V = Htable[4]; + Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo; + Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo; + Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo; + V = Htable[8]; + Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo; + Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo; + Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo; + Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo; + Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo; + Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo; + Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo; +# endif +# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm)) + /* + * ARM assembler expects specific dword order in Htable. 
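+ * (On little-endian hosts the fixup below simply swaps the .hi and .lo
+ * halves of every Htable entry; on big-endian hosts it additionally
+ * rotates each 64-bit half by 32 bits.)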
+ */ + { + int j; + const union { + long one; + char little; + } is_endian = { 1 }; + + if (is_endian.little) + for (j = 0; j < 16; ++j) { + V = Htable[j]; + Htable[j].hi = V.lo; + Htable[j].lo = V.hi; + } else + for (j = 0; j < 16; ++j) { + V = Htable[j]; + Htable[j].hi = V.lo << 32 | V.lo >> 32; + Htable[j].lo = V.hi << 32 | V.hi >> 32; + } + } +# endif +} + +# ifndef GHASH_ASM +static const size_t rem_4bit[16] = { + PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460), + PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0), + PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560), + PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) +}; + +static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]) +{ + u128 Z; + int cnt = 15; + size_t rem, nlo, nhi; + const union { + long one; + char little; + } is_endian = { 1 }; + + nlo = ((const u8 *)Xi)[15]; + nhi = nlo >> 4; + nlo &= 0xf; + + Z.hi = Htable[nlo].hi; + Z.lo = Htable[nlo].lo; + + while (1) { + rem = (size_t)Z.lo & 0xf; + Z.lo = (Z.hi << 60) | (Z.lo >> 4); + Z.hi = (Z.hi >> 4); + if (sizeof(size_t) == 8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem] << 32; + + Z.hi ^= Htable[nhi].hi; + Z.lo ^= Htable[nhi].lo; + + if (--cnt < 0) + break; + + nlo = ((const u8 *)Xi)[cnt]; + nhi = nlo >> 4; + nlo &= 0xf; + + rem = (size_t)Z.lo & 0xf; + Z.lo = (Z.hi << 60) | (Z.lo >> 4); + Z.hi = (Z.hi >> 4); + if (sizeof(size_t) == 8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem] << 32; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + } + + if (is_endian.little) { +# ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +# else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi >> 32); + PUTU32(p, v); + v = (u32)(Z.hi); + PUTU32(p + 4, v); + v = (u32)(Z.lo >> 32); + PUTU32(p + 8, v); + v = (u32)(Z.lo); + PUTU32(p + 12, v); +# endif + } else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } +} + +# if !defined(OPENSSL_SMALL_FOOTPRINT) +/* + * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for + * details... Compiler-generated code doesn't seem to give any + * performance improvement, at least not on x86[_64]. It's here + * mostly as reference and a placeholder for possible future + * non-trivial optimization[s]... + */ +static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], + const u8 *inp, size_t len) +{ + u128 Z; + int cnt; + size_t rem, nlo, nhi; + const union { + long one; + char little; + } is_endian = { 1 }; + +# if 1 + do { + cnt = 15; + nlo = ((const u8 *)Xi)[15]; + nlo ^= inp[15]; + nhi = nlo >> 4; + nlo &= 0xf; + + Z.hi = Htable[nlo].hi; + Z.lo = Htable[nlo].lo; + + while (1) { + rem = (size_t)Z.lo & 0xf; + Z.lo = (Z.hi << 60) | (Z.lo >> 4); + Z.hi = (Z.hi >> 4); + if (sizeof(size_t) == 8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem] << 32; + + Z.hi ^= Htable[nhi].hi; + Z.lo ^= Htable[nhi].lo; + + if (--cnt < 0) + break; + + nlo = ((const u8 *)Xi)[cnt]; + nlo ^= inp[cnt]; + nhi = nlo >> 4; + nlo &= 0xf; + + rem = (size_t)Z.lo & 0xf; + Z.lo = (Z.hi << 60) | (Z.lo >> 4); + Z.hi = (Z.hi >> 4); + if (sizeof(size_t) == 8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem] << 32; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + } +# else + /* + * Extra 256+16 bytes per-key plus 512 bytes shared tables + * [should] give ~50% improvement... One could have PACK()-ed + * the rem_8bit even here, but the priority is to minimize + * cache footprint... 
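+ * (The arithmetic: Hshr4[] is 16 x 16 = 256 bytes, Hshl4[] adds 16
+ * bytes, and the shared rem_8bit[] below is 256 two-byte entries,
+ * i.e. 512 bytes.)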
+ */ + u128 Hshr4[16]; /* Htable shifted right by 4 bits */ + u8 Hshl4[16]; /* Htable shifted left by 4 bits */ + static const unsigned short rem_8bit[256] = { + 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E, + 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E, + 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E, + 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E, + 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E, + 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E, + 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E, + 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E, + 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE, + 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE, + 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE, + 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE, + 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E, + 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E, + 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE, + 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE, + 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E, + 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E, + 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E, + 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E, + 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E, + 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E, + 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E, + 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E, + 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE, + 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE, + 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE, + 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE, + 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E, + 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E, + 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE, + 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE + }; + /* + * This pre-processing phase slows down procedure by approximately + * same time as it makes each loop spin faster. In other words + * single block performance is approximately same as straightforward + * "4-bit" implementation, and then it goes only faster... 
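+ * (Concretely, the loop below derives Hshr4[] and Hshl4[] from Htable[]
+ * once per call, so that the main loop can consume a whole byte -- two
+ * nibbles -- per iteration with a single rem_8bit[] lookup.)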
+ */ + for (cnt = 0; cnt < 16; ++cnt) { + Z.hi = Htable[cnt].hi; + Z.lo = Htable[cnt].lo; + Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4); + Hshr4[cnt].hi = (Z.hi >> 4); + Hshl4[cnt] = (u8)(Z.lo << 4); + } + + do { + for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) { + nlo = ((const u8 *)Xi)[cnt]; + nlo ^= inp[cnt]; + nhi = nlo >> 4; + nlo &= 0xf; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + + rem = (size_t)Z.lo & 0xff; + + Z.lo = (Z.hi << 56) | (Z.lo >> 8); + Z.hi = (Z.hi >> 8); + + Z.hi ^= Hshr4[nhi].hi; + Z.lo ^= Hshr4[nhi].lo; + Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48; + } + + nlo = ((const u8 *)Xi)[0]; + nlo ^= inp[0]; + nhi = nlo >> 4; + nlo &= 0xf; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + + rem = (size_t)Z.lo & 0xf; + + Z.lo = (Z.hi << 60) | (Z.lo >> 4); + Z.hi = (Z.hi >> 4); + + Z.hi ^= Htable[nhi].hi; + Z.lo ^= Htable[nhi].lo; + Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48; +# endif + + if (is_endian.little) { +# ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +# else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi >> 32); + PUTU32(p, v); + v = (u32)(Z.hi); + PUTU32(p + 4, v); + v = (u32)(Z.lo >> 32); + PUTU32(p + 8, v); + v = (u32)(Z.lo); + PUTU32(p + 12, v); +# endif + } else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } + } while (inp += 16, len -= 16); +} +# endif +# else +void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp, + size_t len); +# endif + +# define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable) +# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT) +# define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len) +/* + * GHASH_CHUNK is "stride parameter" missioned to mitigate cache trashing + * effect. In other words idea is to hash data while it's still in L1 cache + * after encryption pass... 
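+ * The 3KB value chosen below appears to be a heuristic: small enough
+ * that a freshly encrypted chunk is likely still resident in a typical
+ * L1 data cache when it is hashed, yet large enough to amortize the
+ * per-call overhead of the GHASH routine.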
+ */ +# define GHASH_CHUNK (3*1024) +# endif + +#else /* TABLE_BITS */ + +static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2]) +{ + u128 V, Z = { 0, 0 }; + long X; + int i, j; + const long *xi = (const long *)Xi; + const union { + long one; + char little; + } is_endian = { 1 }; + + V.hi = H[0]; /* H is in host byte order, no byte swapping */ + V.lo = H[1]; + + for (j = 0; j < 16 / sizeof(long); ++j) { + if (is_endian.little) { + if (sizeof(long) == 8) { +# ifdef BSWAP8 + X = (long)(BSWAP8(xi[j])); +# else + const u8 *p = (const u8 *)(xi + j); + X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4)); +# endif + } else { + const u8 *p = (const u8 *)(xi + j); + X = (long)GETU32(p); + } + } else + X = xi[j]; + + for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) { + u64 M = (u64)(X >> (8 * sizeof(long) - 1)); + Z.hi ^= V.hi & M; + Z.lo ^= V.lo & M; + + REDUCE1BIT(V); + } + } + + if (is_endian.little) { +# ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +# else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi >> 32); + PUTU32(p, v); + v = (u32)(Z.hi); + PUTU32(p + 4, v); + v = (u32)(Z.lo >> 32); + PUTU32(p + 8, v); + v = (u32)(Z.lo); + PUTU32(p + 12, v); +# endif + } else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } +} + +# define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u) + +#endif + +#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ)) +# if !defined(I386_ONLY) && \ + (defined(__i386) || defined(__i386__) || \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64)) +# define GHASH_ASM_X86_OR_64 +# define GCM_FUNCREF_4BIT +extern unsigned int OPENSSL_ia32cap_P[]; + +void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp, + size_t len); + +# if defined(__i386) || defined(__i386__) || defined(_M_IX86) +# define gcm_init_avx gcm_init_clmul +# define gcm_gmult_avx gcm_gmult_clmul +# define gcm_ghash_avx gcm_ghash_clmul +# else +void gcm_init_avx(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp, + size_t len); +# endif + +# if defined(__i386) || defined(__i386__) || defined(_M_IX86) +# define GHASH_ASM_X86 +void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp, + size_t len); + +void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp, + size_t len); +# endif +# elif defined(__arm__) || defined(__arm) || defined(__aarch64__) +# include "arm_arch.h" +# if __ARM_MAX_ARCH__>=7 +# define GHASH_ASM_ARM +# define GCM_FUNCREF_4BIT +# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL) +# if defined(__arm__) || defined(__arm) +# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON) +# endif +void gcm_init_neon(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp, + size_t len); +void gcm_init_v8(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp, + size_t len); +# endif +# elif defined(__sparc__) || defined(__sparc) +# include "sparc_arch.h" +# define GHASH_ASM_SPARC +# define GCM_FUNCREF_4BIT +extern unsigned int OPENSSL_sparcv9cap_P[]; +void 
gcm_init_vis3(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp, + size_t len); +# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) +# include "ppc_arch.h" +# define GHASH_ASM_PPC +# define GCM_FUNCREF_4BIT +void gcm_init_p8(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp, + size_t len); +# endif +#endif + +#ifdef GCM_FUNCREF_4BIT +# undef GCM_MUL +# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable) +# ifdef GHASH +# undef GHASH +# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len) +# endif +#endif + +void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + memset(ctx, 0, sizeof(*ctx)); + ctx->block = block; + ctx->key = key; + + (*block) (ctx->H.c, ctx->H.c, key); + + if (is_endian.little) { + /* H is stored in host byte order */ +#ifdef BSWAP8 + ctx->H.u[0] = BSWAP8(ctx->H.u[0]); + ctx->H.u[1] = BSWAP8(ctx->H.u[1]); +#else + u8 *p = ctx->H.c; + u64 hi, lo; + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); + ctx->H.u[0] = hi; + ctx->H.u[1] = lo; +#endif + } +#if TABLE_BITS==8 + gcm_init_8bit(ctx->Htable, ctx->H.u); +#elif TABLE_BITS==4 +# if defined(GHASH) +# define CTX__GHASH(f) (ctx->ghash = (f)) +# else +# define CTX__GHASH(f) (ctx->ghash = NULL) +# endif +# if defined(GHASH_ASM_X86_OR_64) +# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2) + if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */ + if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */ + gcm_init_avx(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_avx; + CTX__GHASH(gcm_ghash_avx); + } else { + gcm_init_clmul(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_clmul; + CTX__GHASH(gcm_ghash_clmul); + } + return; + } +# endif + gcm_init_4bit(ctx->Htable, ctx->H.u); +# if defined(GHASH_ASM_X86) /* x86 only */ +# if defined(OPENSSL_IA32_SSE2) + if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */ +# else + if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */ +# endif + ctx->gmult = gcm_gmult_4bit_mmx; + CTX__GHASH(gcm_ghash_4bit_mmx); + } else { + ctx->gmult = gcm_gmult_4bit_x86; + CTX__GHASH(gcm_ghash_4bit_x86); + } +# else + ctx->gmult = gcm_gmult_4bit; + CTX__GHASH(gcm_ghash_4bit); +# endif +# elif defined(GHASH_ASM_ARM) +# ifdef PMULL_CAPABLE + if (PMULL_CAPABLE) { + gcm_init_v8(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_v8; + CTX__GHASH(gcm_ghash_v8); + } else +# endif +# ifdef NEON_CAPABLE + if (NEON_CAPABLE) { + gcm_init_neon(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_neon; + CTX__GHASH(gcm_ghash_neon); + } else +# endif + { + gcm_init_4bit(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_4bit; + CTX__GHASH(gcm_ghash_4bit); + } +# elif defined(GHASH_ASM_SPARC) + if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) { + gcm_init_vis3(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_vis3; + CTX__GHASH(gcm_ghash_vis3); + } else { + gcm_init_4bit(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_4bit; + CTX__GHASH(gcm_ghash_4bit); + } +# elif defined(GHASH_ASM_PPC) + if (OPENSSL_ppccap_P & PPC_CRYPTO207) { + gcm_init_p8(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_p8; + CTX__GHASH(gcm_ghash_p8); + } else { + gcm_init_4bit(ctx->Htable, ctx->H.u); + 
ctx->gmult = gcm_gmult_4bit; + CTX__GHASH(gcm_ghash_4bit); + } +# else + gcm_init_4bit(ctx->Htable, ctx->H.u); +# endif +# undef CTX__GHASH +#endif +} + +void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, + size_t len) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + unsigned int ctr; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult; +#endif + + ctx->Yi.u[0] = 0; + ctx->Yi.u[1] = 0; + ctx->Xi.u[0] = 0; + ctx->Xi.u[1] = 0; + ctx->len.u[0] = 0; /* AAD length */ + ctx->len.u[1] = 0; /* message length */ + ctx->ares = 0; + ctx->mres = 0; + + if (len == 12) { + memcpy(ctx->Yi.c, iv, 12); + ctx->Yi.c[15] = 1; + ctr = 1; + } else { + size_t i; + u64 len0 = len; + + while (len >= 16) { + for (i = 0; i < 16; ++i) + ctx->Yi.c[i] ^= iv[i]; + GCM_MUL(ctx, Yi); + iv += 16; + len -= 16; + } + if (len) { + for (i = 0; i < len; ++i) + ctx->Yi.c[i] ^= iv[i]; + GCM_MUL(ctx, Yi); + } + len0 <<= 3; + if (is_endian.little) { +#ifdef BSWAP8 + ctx->Yi.u[1] ^= BSWAP8(len0); +#else + ctx->Yi.c[8] ^= (u8)(len0 >> 56); + ctx->Yi.c[9] ^= (u8)(len0 >> 48); + ctx->Yi.c[10] ^= (u8)(len0 >> 40); + ctx->Yi.c[11] ^= (u8)(len0 >> 32); + ctx->Yi.c[12] ^= (u8)(len0 >> 24); + ctx->Yi.c[13] ^= (u8)(len0 >> 16); + ctx->Yi.c[14] ^= (u8)(len0 >> 8); + ctx->Yi.c[15] ^= (u8)(len0); +#endif + } else + ctx->Yi.u[1] ^= len0; + + GCM_MUL(ctx, Yi); + + if (is_endian.little) +#ifdef BSWAP4 + ctr = BSWAP4(ctx->Yi.d[3]); +#else + ctr = GETU32(ctx->Yi.c + 12); +#endif + else + ctr = ctx->Yi.d[3]; + } + + (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key); + ++ctr; + if (is_endian.little) +#ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +#else + PUTU32(ctx->Yi.c + 12, ctr); +#endif + else + ctx->Yi.d[3] = ctr; +} + +int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, + size_t len) +{ + size_t i; + unsigned int n; + u64 alen = ctx->len.u[0]; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], + const u8 *inp, size_t len) = ctx->ghash; +# endif +#endif + + if (ctx->len.u[1]) + return -2; + + alen += len; + if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len)) + return -1; + ctx->len.u[0] = alen; + + n = ctx->ares; + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(aad++); + --len; + n = (n + 1) % 16; + } + if (n == 0) + GCM_MUL(ctx, Xi); + else { + ctx->ares = n; + return 0; + } + } +#ifdef GHASH + if ((i = (len & (size_t)-16))) { + GHASH(ctx, aad, i); + aad += i; + len -= i; + } +#else + while (len >= 16) { + for (i = 0; i < 16; ++i) + ctx->Xi.c[i] ^= aad[i]; + GCM_MUL(ctx, Xi); + aad += 16; + len -= 16; + } +#endif + if (len) { + n = (unsigned int)len; + for (i = 0; i < len; ++i) + ctx->Xi.c[i] ^= aad[i]; + } + + ctx->ares = n; + return 0; +} + +int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + unsigned int n, ctr; + size_t i; + u64 mlen = ctx->len.u[1]; + block128_f block = ctx->block; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult; +# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT) + void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], + const u8 *inp, size_t len) = ctx->ghash; +# endif +#endif + + mlen += len; + if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len)) + return -1; + ctx->len.u[1] = 
mlen; + + if (ctx->ares) { + /* First call to encrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx, Xi); + ctx->ares = 0; + } + + if (is_endian.little) +#ifdef BSWAP4 + ctr = BSWAP4(ctx->Yi.d[3]); +#else + ctr = GETU32(ctx->Yi.c + 12); +#endif + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (16 % sizeof(size_t) == 0) { /* always true actually */ + do { + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n]; + --len; + n = (n + 1) % 16; + } + if (n == 0) + GCM_MUL(ctx, Xi); + else { + ctx->mres = n; + return 0; + } + } +# if defined(STRICT_ALIGNMENT) + if (((size_t)in | (size_t)out) % sizeof(size_t) != 0) + break; +# endif +# if defined(GHASH) +# if defined(GHASH_CHUNK) + while (len >= GHASH_CHUNK) { + size_t j = GHASH_CHUNK; + + while (j) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + for (i = 0; i < 16 / sizeof(size_t); ++i) + out_t[i] = in_t[i] ^ ctx->EKi.t[i]; + out += 16; + in += 16; + j -= 16; + } + GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK); + len -= GHASH_CHUNK; + } +# endif + if ((i = (len & (size_t)-16))) { + size_t j = i; + + while (len >= 16) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + for (i = 0; i < 16 / sizeof(size_t); ++i) + out_t[i] = in_t[i] ^ ctx->EKi.t[i]; + out += 16; + in += 16; + len -= 16; + } + GHASH(ctx, out - j, j); + } +# else + while (len >= 16) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + for (i = 0; i < 16 / sizeof(size_t); ++i) + ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i]; + GCM_MUL(ctx, Xi); + out += 16; + in += 16; + len -= 16; + } +# endif + if (len) { + (*block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + while (len--) { + ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; + } while (0); + } +#endif + for (i = 0; i < len; ++i) { + if (n == 0) { + (*block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +#ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +#else + PUTU32(ctx->Yi.c + 12, ctr); +#endif + else + ctx->Yi.d[3] = ctr; + } + ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n]; + n = (n + 1) % 16; + if (n == 0) + GCM_MUL(ctx, Xi); + } + + ctx->mres = n; + return 0; +} + +int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + unsigned int n, ctr; + size_t i; + u64 mlen = ctx->len.u[1]; + block128_f block = ctx->block; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult; +# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT) + void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], + const u8 *inp, size_t len) = 
ctx->ghash; +# endif +#endif + + mlen += len; + if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len)) + return -1; + ctx->len.u[1] = mlen; + + if (ctx->ares) { + /* First call to decrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx, Xi); + ctx->ares = 0; + } + + if (is_endian.little) +#ifdef BSWAP4 + ctr = BSWAP4(ctx->Yi.d[3]); +#else + ctr = GETU32(ctx->Yi.c + 12); +#endif + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (16 % sizeof(size_t) == 0) { /* always true actually */ + do { + if (n) { + while (n && len) { + u8 c = *(in++); + *(out++) = c ^ ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + --len; + n = (n + 1) % 16; + } + if (n == 0) + GCM_MUL(ctx, Xi); + else { + ctx->mres = n; + return 0; + } + } +# if defined(STRICT_ALIGNMENT) + if (((size_t)in | (size_t)out) % sizeof(size_t) != 0) + break; +# endif +# if defined(GHASH) +# if defined(GHASH_CHUNK) + while (len >= GHASH_CHUNK) { + size_t j = GHASH_CHUNK; + + GHASH(ctx, in, GHASH_CHUNK); + while (j) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + for (i = 0; i < 16 / sizeof(size_t); ++i) + out_t[i] = in_t[i] ^ ctx->EKi.t[i]; + out += 16; + in += 16; + j -= 16; + } + len -= GHASH_CHUNK; + } +# endif + if ((i = (len & (size_t)-16))) { + GHASH(ctx, in, i); + while (len >= 16) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + for (i = 0; i < 16 / sizeof(size_t); ++i) + out_t[i] = in_t[i] ^ ctx->EKi.t[i]; + out += 16; + in += 16; + len -= 16; + } + } +# else + while (len >= 16) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + for (i = 0; i < 16 / sizeof(size_t); ++i) { + size_t c = in[i]; + out[i] = c ^ ctx->EKi.t[i]; + ctx->Xi.t[i] ^= c; + } + GCM_MUL(ctx, Xi); + out += 16; + in += 16; + len -= 16; + } +# endif + if (len) { + (*block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + while (len--) { + u8 c = in[n]; + ctx->Xi.c[n] ^= c; + out[n] = c ^ ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; + } while (0); + } +#endif + for (i = 0; i < len; ++i) { + u8 c; + if (n == 0) { + (*block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +#ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +#else + PUTU32(ctx->Yi.c + 12, ctr); +#endif + else + ctx->Yi.d[3] = ctr; + } + c = in[i]; + out[i] = c ^ ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + n = (n + 1) % 16; + if (n == 0) + GCM_MUL(ctx, Xi); + } + + ctx->mres = n; + return 0; +} + +int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len, ctr128_f stream) +{ +#if defined(OPENSSL_SMALL_FOOTPRINT) + return CRYPTO_gcm128_encrypt(ctx, in, out, len); +#else + const union { + long one; + char little; + } is_endian = { 1 }; + unsigned int n, ctr; + size_t i; + u64 mlen = 
ctx->len.u[1]; + void *key = ctx->key; +# ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], + const u8 *inp, size_t len) = ctx->ghash; +# endif +# endif + + mlen += len; + if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len)) + return -1; + ctx->len.u[1] = mlen; + + if (ctx->ares) { + /* First call to encrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx, Xi); + ctx->ares = 0; + } + + if (is_endian.little) +# ifdef BSWAP4 + ctr = BSWAP4(ctx->Yi.d[3]); +# else + ctr = GETU32(ctx->Yi.c + 12); +# endif + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n]; + --len; + n = (n + 1) % 16; + } + if (n == 0) + GCM_MUL(ctx, Xi); + else { + ctx->mres = n; + return 0; + } + } +# if defined(GHASH) && defined(GHASH_CHUNK) + while (len >= GHASH_CHUNK) { + (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c); + ctr += GHASH_CHUNK / 16; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + GHASH(ctx, out, GHASH_CHUNK); + out += GHASH_CHUNK; + in += GHASH_CHUNK; + len -= GHASH_CHUNK; + } +# endif + if ((i = (len & (size_t)-16))) { + size_t j = i / 16; + + (*stream) (in, out, j, key, ctx->Yi.c); + ctr += (unsigned int)j; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + in += i; + len -= i; +# if defined(GHASH) + GHASH(ctx, out, i); + out += i; +# else + while (j--) { + for (i = 0; i < 16; ++i) + ctx->Xi.c[i] ^= out[i]; + GCM_MUL(ctx, Xi); + out += 16; + } +# endif + } + if (len) { + (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + while (len--) { + ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; +#endif +} + +int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len, ctr128_f stream) +{ +#if defined(OPENSSL_SMALL_FOOTPRINT) + return CRYPTO_gcm128_decrypt(ctx, in, out, len); +#else + const union { + long one; + char little; + } is_endian = { 1 }; + unsigned int n, ctr; + size_t i; + u64 mlen = ctx->len.u[1]; + void *key = ctx->key; +# ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], + const u8 *inp, size_t len) = ctx->ghash; +# endif +# endif + + mlen += len; + if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len)) + return -1; + ctx->len.u[1] = mlen; + + if (ctx->ares) { + /* First call to decrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx, Xi); + ctx->ares = 0; + } + + if (is_endian.little) +# ifdef BSWAP4 + ctr = BSWAP4(ctx->Yi.d[3]); +# else + ctr = GETU32(ctx->Yi.c + 12); +# endif + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; + if (n) { + while (n && len) { + u8 c = *(in++); + *(out++) = c ^ ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + --len; + n = (n + 1) % 16; + } + if (n == 0) + GCM_MUL(ctx, Xi); + else { + ctx->mres = n; + return 0; + } + } +# if defined(GHASH) && defined(GHASH_CHUNK) + while (len >= GHASH_CHUNK) { + GHASH(ctx, in, GHASH_CHUNK); + (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c); + ctr += GHASH_CHUNK / 16; + if 
(is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + out += GHASH_CHUNK; + in += GHASH_CHUNK; + len -= GHASH_CHUNK; + } +# endif + if ((i = (len & (size_t)-16))) { + size_t j = i / 16; + +# if defined(GHASH) + GHASH(ctx, in, i); +# else + while (j--) { + size_t k; + for (k = 0; k < 16; ++k) + ctx->Xi.c[k] ^= in[k]; + GCM_MUL(ctx, Xi); + in += 16; + } + j = i / 16; + in -= i; +# endif + (*stream) (in, out, j, key, ctx->Yi.c); + ctr += (unsigned int)j; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + out += i; + in += i; + len -= i; + } + if (len) { + (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) +# ifdef BSWAP4 + ctx->Yi.d[3] = BSWAP4(ctr); +# else + PUTU32(ctx->Yi.c + 12, ctr); +# endif + else + ctx->Yi.d[3] = ctr; + while (len--) { + u8 c = in[n]; + ctx->Xi.c[n] ^= c; + out[n] = c ^ ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; +#endif +} + +int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag, + size_t len) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + u64 alen = ctx->len.u[0] << 3; + u64 clen = ctx->len.u[1] << 3; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult; +#endif + + if (ctx->mres || ctx->ares) + GCM_MUL(ctx, Xi); + + if (is_endian.little) { +#ifdef BSWAP8 + alen = BSWAP8(alen); + clen = BSWAP8(clen); +#else + u8 *p = ctx->len.c; + + ctx->len.u[0] = alen; + ctx->len.u[1] = clen; + + alen = (u64)GETU32(p) << 32 | GETU32(p + 4); + clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + } + + ctx->Xi.u[0] ^= alen; + ctx->Xi.u[1] ^= clen; + GCM_MUL(ctx, Xi); + + ctx->Xi.u[0] ^= ctx->EK0.u[0]; + ctx->Xi.u[1] ^= ctx->EK0.u[1]; + + if (tag && len <= sizeof(ctx->Xi)) + return CRYPTO_memcmp(ctx->Xi.c, tag, len); + else + return -1; +} + +void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) +{ + CRYPTO_gcm128_finish(ctx, NULL, 0); + memcpy(tag, ctx->Xi.c, + len <= sizeof(ctx->Xi.c) ? 
len : sizeof(ctx->Xi.c)); +} + +GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block) +{ + GCM128_CONTEXT *ret; + + if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL) + CRYPTO_gcm128_init(ret, key, block); + + return ret; +} + +void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx) +{ + OPENSSL_clear_free(ctx, sizeof(*ctx)); +} + +#if defined(SELFTEST) +# include +# include + +/* Test Case 1 */ +static const u8 K1[16], *P1 = NULL, *A1 = NULL, IV1[12], *C1 = NULL; +static const u8 T1[] = { + 0x58, 0xe2, 0xfc, 0xce, 0xfa, 0x7e, 0x30, 0x61, + 0x36, 0x7f, 0x1d, 0x57, 0xa4, 0xe7, 0x45, 0x5a +}; + +/* Test Case 2 */ +# define K2 K1 +# define A2 A1 +# define IV2 IV1 +static const u8 P2[16]; +static const u8 C2[] = { + 0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, + 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78 +}; + +static const u8 T2[] = { + 0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd, + 0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf +}; + +/* Test Case 3 */ +# define A3 A2 +static const u8 K3[] = { + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, + 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08 +}; + +static const u8 P3[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, + 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, + 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, + 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, + 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, + 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, + 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, + 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55 +}; + +static const u8 IV3[] = { + 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, + 0xde, 0xca, 0xf8, 0x88 +}; + +static const u8 C3[] = { + 0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, + 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c, + 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, + 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e, + 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, + 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05, + 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, + 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85 +}; + +static const u8 T3[] = { + 0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6, + 0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4 +}; + +/* Test Case 4 */ +# define K4 K3 +# define IV4 IV3 +static const u8 P4[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, + 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, + 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, + 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, + 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, + 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, + 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, + 0xba, 0x63, 0x7b, 0x39 +}; + +static const u8 A4[] = { + 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, + 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, + 0xab, 0xad, 0xda, 0xd2 +}; + +static const u8 C4[] = { + 0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, + 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c, + 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, + 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e, + 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, + 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05, + 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, + 0x3d, 0x58, 0xe0, 0x91 +}; + +static const u8 T4[] = { + 0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb, + 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47 +}; + +/* Test Case 5 */ +# define K5 K4 +# define P5 P4 +# define A5 A4 +static const u8 IV5[] = { + 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad +}; + +static const u8 C5[] = { + 
0x61, 0x35, 0x3b, 0x4c, 0x28, 0x06, 0x93, 0x4a, + 0x77, 0x7f, 0xf5, 0x1f, 0xa2, 0x2a, 0x47, 0x55, + 0x69, 0x9b, 0x2a, 0x71, 0x4f, 0xcd, 0xc6, 0xf8, + 0x37, 0x66, 0xe5, 0xf9, 0x7b, 0x6c, 0x74, 0x23, + 0x73, 0x80, 0x69, 0x00, 0xe4, 0x9f, 0x24, 0xb2, + 0x2b, 0x09, 0x75, 0x44, 0xd4, 0x89, 0x6b, 0x42, + 0x49, 0x89, 0xb5, 0xe1, 0xeb, 0xac, 0x0f, 0x07, + 0xc2, 0x3f, 0x45, 0x98 +}; + +static const u8 T5[] = { + 0x36, 0x12, 0xd2, 0xe7, 0x9e, 0x3b, 0x07, 0x85, + 0x56, 0x1b, 0xe1, 0x4a, 0xac, 0xa2, 0xfc, 0xcb +}; + +/* Test Case 6 */ +# define K6 K5 +# define P6 P5 +# define A6 A5 +static const u8 IV6[] = { + 0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5, + 0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa, + 0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1, + 0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28, + 0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39, + 0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54, + 0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57, + 0xa6, 0x37, 0xb3, 0x9b +}; + +static const u8 C6[] = { + 0x8c, 0xe2, 0x49, 0x98, 0x62, 0x56, 0x15, 0xb6, + 0x03, 0xa0, 0x33, 0xac, 0xa1, 0x3f, 0xb8, 0x94, + 0xbe, 0x91, 0x12, 0xa5, 0xc3, 0xa2, 0x11, 0xa8, + 0xba, 0x26, 0x2a, 0x3c, 0xca, 0x7e, 0x2c, 0xa7, + 0x01, 0xe4, 0xa9, 0xa4, 0xfb, 0xa4, 0x3c, 0x90, + 0xcc, 0xdc, 0xb2, 0x81, 0xd4, 0x8c, 0x7c, 0x6f, + 0xd6, 0x28, 0x75, 0xd2, 0xac, 0xa4, 0x17, 0x03, + 0x4c, 0x34, 0xae, 0xe5 +}; + +static const u8 T6[] = { + 0x61, 0x9c, 0xc5, 0xae, 0xff, 0xfe, 0x0b, 0xfa, + 0x46, 0x2a, 0xf4, 0x3c, 0x16, 0x99, 0xd0, 0x50 +}; + +/* Test Case 7 */ +static const u8 K7[24], *P7 = NULL, *A7 = NULL, IV7[12], *C7 = NULL; +static const u8 T7[] = { + 0xcd, 0x33, 0xb2, 0x8a, 0xc7, 0x73, 0xf7, 0x4b, + 0xa0, 0x0e, 0xd1, 0xf3, 0x12, 0x57, 0x24, 0x35 +}; + +/* Test Case 8 */ +# define K8 K7 +# define IV8 IV7 +# define A8 A7 +static const u8 P8[16]; +static const u8 C8[] = { + 0x98, 0xe7, 0x24, 0x7c, 0x07, 0xf0, 0xfe, 0x41, + 0x1c, 0x26, 0x7e, 0x43, 0x84, 0xb0, 0xf6, 0x00 +}; + +static const u8 T8[] = { + 0x2f, 0xf5, 0x8d, 0x80, 0x03, 0x39, 0x27, 0xab, + 0x8e, 0xf4, 0xd4, 0x58, 0x75, 0x14, 0xf0, 0xfb +}; + +/* Test Case 9 */ +# define A9 A8 +static const u8 K9[] = { + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, + 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c +}; + +static const u8 P9[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, + 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, + 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, + 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, + 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, + 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, + 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, + 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55 +}; + +static const u8 IV9[] = { + 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, + 0xde, 0xca, 0xf8, 0x88 +}; + +static const u8 C9[] = { + 0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41, + 0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57, + 0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84, + 0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c, + 0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25, + 0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47, + 0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9, + 0xcc, 0xda, 0x27, 0x10, 0xac, 0xad, 0xe2, 0x56 +}; + +static const u8 T9[] = { + 0x99, 0x24, 0xa7, 0xc8, 0x58, 0x73, 0x36, 0xbf, + 0xb1, 0x18, 0x02, 0x4d, 0xb8, 0x67, 0x4a, 0x14 +}; + +/* Test Case 10 */ +# define K10 K9 +# define IV10 IV9 +static const u8 P10[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, + 
0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, + 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, + 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, + 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, + 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, + 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, + 0xba, 0x63, 0x7b, 0x39 +}; + +static const u8 A10[] = { + 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, + 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, + 0xab, 0xad, 0xda, 0xd2 +}; + +static const u8 C10[] = { + 0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41, + 0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57, + 0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84, + 0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c, + 0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25, + 0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47, + 0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9, + 0xcc, 0xda, 0x27, 0x10 +}; + +static const u8 T10[] = { + 0x25, 0x19, 0x49, 0x8e, 0x80, 0xf1, 0x47, 0x8f, + 0x37, 0xba, 0x55, 0xbd, 0x6d, 0x27, 0x61, 0x8c +}; + +/* Test Case 11 */ +# define K11 K10 +# define P11 P10 +# define A11 A10 +static const u8 IV11[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad }; + +static const u8 C11[] = { + 0x0f, 0x10, 0xf5, 0x99, 0xae, 0x14, 0xa1, 0x54, + 0xed, 0x24, 0xb3, 0x6e, 0x25, 0x32, 0x4d, 0xb8, + 0xc5, 0x66, 0x63, 0x2e, 0xf2, 0xbb, 0xb3, 0x4f, + 0x83, 0x47, 0x28, 0x0f, 0xc4, 0x50, 0x70, 0x57, + 0xfd, 0xdc, 0x29, 0xdf, 0x9a, 0x47, 0x1f, 0x75, + 0xc6, 0x65, 0x41, 0xd4, 0xd4, 0xda, 0xd1, 0xc9, + 0xe9, 0x3a, 0x19, 0xa5, 0x8e, 0x8b, 0x47, 0x3f, + 0xa0, 0xf0, 0x62, 0xf7 +}; + +static const u8 T11[] = { + 0x65, 0xdc, 0xc5, 0x7f, 0xcf, 0x62, 0x3a, 0x24, + 0x09, 0x4f, 0xcc, 0xa4, 0x0d, 0x35, 0x33, 0xf8 +}; + +/* Test Case 12 */ +# define K12 K11 +# define P12 P11 +# define A12 A11 +static const u8 IV12[] = { + 0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5, + 0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa, + 0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1, + 0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28, + 0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39, + 0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54, + 0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57, + 0xa6, 0x37, 0xb3, 0x9b +}; + +static const u8 C12[] = { + 0xd2, 0x7e, 0x88, 0x68, 0x1c, 0xe3, 0x24, 0x3c, + 0x48, 0x30, 0x16, 0x5a, 0x8f, 0xdc, 0xf9, 0xff, + 0x1d, 0xe9, 0xa1, 0xd8, 0xe6, 0xb4, 0x47, 0xef, + 0x6e, 0xf7, 0xb7, 0x98, 0x28, 0x66, 0x6e, 0x45, + 0x81, 0xe7, 0x90, 0x12, 0xaf, 0x34, 0xdd, 0xd9, + 0xe2, 0xf0, 0x37, 0x58, 0x9b, 0x29, 0x2d, 0xb3, + 0xe6, 0x7c, 0x03, 0x67, 0x45, 0xfa, 0x22, 0xe7, + 0xe9, 0xb7, 0x37, 0x3b +}; + +static const u8 T12[] = { + 0xdc, 0xf5, 0x66, 0xff, 0x29, 0x1c, 0x25, 0xbb, + 0xb8, 0x56, 0x8f, 0xc3, 0xd3, 0x76, 0xa6, 0xd9 +}; + +/* Test Case 13 */ +static const u8 K13[32], *P13 = NULL, *A13 = NULL, IV13[12], *C13 = NULL; +static const u8 T13[] = { + 0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9, + 0xa9, 0x63, 0xb4, 0xf1, 0xc4, 0xcb, 0x73, 0x8b +}; + +/* Test Case 14 */ +# define K14 K13 +# define A14 A13 +static const u8 P14[16], IV14[12]; +static const u8 C14[] = { + 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e, + 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18 +}; + +static const u8 T14[] = { + 0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0, + 0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19 +}; + +/* Test Case 15 */ +# define A15 A14 +static const u8 K15[] = { + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, + 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, + 0x6d, 
0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08 +}; + +static const u8 P15[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, + 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, + 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, + 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, + 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, + 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, + 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, + 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55 +}; + +static const u8 IV15[] = { + 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, + 0xde, 0xca, 0xf8, 0x88 +}; + +static const u8 C15[] = { + 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, + 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d, + 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, + 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, + 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, + 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, + 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, + 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad +}; + +static const u8 T15[] = { + 0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd, + 0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c +}; + +/* Test Case 16 */ +# define K16 K15 +# define IV16 IV15 +static const u8 P16[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, + 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, + 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, + 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, + 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, + 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, + 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, + 0xba, 0x63, 0x7b, 0x39 +}; + +static const u8 A16[] = { + 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, + 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, + 0xab, 0xad, 0xda, 0xd2 +}; + +static const u8 C16[] = { + 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, + 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d, + 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, + 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, + 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, + 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, + 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, + 0xbc, 0xc9, 0xf6, 0x62 +}; + +static const u8 T16[] = { + 0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68, + 0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b +}; + +/* Test Case 17 */ +# define K17 K16 +# define P17 P16 +# define A17 A16 +static const u8 IV17[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad }; + +static const u8 C17[] = { + 0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32, + 0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb, + 0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa, + 0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0, + 0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0, + 0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78, + 0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99, + 0xf4, 0x7c, 0x9b, 0x1f +}; + +static const u8 T17[] = { + 0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4, + 0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2 +}; + +/* Test Case 18 */ +# define K18 K17 +# define P18 P17 +# define A18 A17 +static const u8 IV18[] = { + 0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5, + 0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa, + 0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1, + 0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28, + 0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39, + 0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54, + 0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57, + 0xa6, 0x37, 0xb3, 0x9b +}; + +static const u8 C18[] = { + 0x5a, 0x8d, 0xef, 0x2f, 0x0c, 
0x9e, 0x53, 0xf1, + 0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20, + 0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19, + 0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4, + 0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45, + 0x2d, 0xa3, 0xeb, 0xf1, 0xc5, 0xd8, 0x2c, 0xde, + 0xa2, 0x41, 0x89, 0x97, 0x20, 0x0e, 0xf8, 0x2e, + 0x44, 0xae, 0x7e, 0x3f +}; + +static const u8 T18[] = { + 0xa4, 0x4a, 0x82, 0x66, 0xee, 0x1c, 0x8e, 0xb0, + 0xc8, 0xb5, 0xd4, 0xcf, 0x5a, 0xe9, 0xf1, 0x9a +}; + +/* Test Case 19 */ +# define K19 K1 +# define P19 P1 +# define IV19 IV1 +# define C19 C1 +static const u8 A19[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, + 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, + 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, + 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, + 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, + 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, + 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, + 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55, + 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, + 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d, + 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, + 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, + 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, + 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, + 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, + 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad +}; + +static const u8 T19[] = { + 0x5f, 0xea, 0x79, 0x3a, 0x2d, 0x6f, 0x97, 0x4d, + 0x37, 0xe6, 0x8e, 0x0c, 0xb8, 0xff, 0x94, 0x92 +}; + +/* Test Case 20 */ +# define K20 K1 +# define A20 A1 +/* this results in 0xff in counter LSB */ +static const u8 IV20[64] = { 0xff, 0xff, 0xff, 0xff }; + +static const u8 P20[288]; +static const u8 C20[] = { + 0x56, 0xb3, 0x37, 0x3c, 0xa9, 0xef, 0x6e, 0x4a, + 0x2b, 0x64, 0xfe, 0x1e, 0x9a, 0x17, 0xb6, 0x14, + 0x25, 0xf1, 0x0d, 0x47, 0xa7, 0x5a, 0x5f, 0xce, + 0x13, 0xef, 0xc6, 0xbc, 0x78, 0x4a, 0xf2, 0x4f, + 0x41, 0x41, 0xbd, 0xd4, 0x8c, 0xf7, 0xc7, 0x70, + 0x88, 0x7a, 0xfd, 0x57, 0x3c, 0xca, 0x54, 0x18, + 0xa9, 0xae, 0xff, 0xcd, 0x7c, 0x5c, 0xed, 0xdf, + 0xc6, 0xa7, 0x83, 0x97, 0xb9, 0xa8, 0x5b, 0x49, + 0x9d, 0xa5, 0x58, 0x25, 0x72, 0x67, 0xca, 0xab, + 0x2a, 0xd0, 0xb2, 0x3c, 0xa4, 0x76, 0xa5, 0x3c, + 0xb1, 0x7f, 0xb4, 0x1c, 0x4b, 0x8b, 0x47, 0x5c, + 0xb4, 0xf3, 0xf7, 0x16, 0x50, 0x94, 0xc2, 0x29, + 0xc9, 0xe8, 0xc4, 0xdc, 0x0a, 0x2a, 0x5f, 0xf1, + 0x90, 0x3e, 0x50, 0x15, 0x11, 0x22, 0x13, 0x76, + 0xa1, 0xcd, 0xb8, 0x36, 0x4c, 0x50, 0x61, 0xa2, + 0x0c, 0xae, 0x74, 0xbc, 0x4a, 0xcd, 0x76, 0xce, + 0xb0, 0xab, 0xc9, 0xfd, 0x32, 0x17, 0xef, 0x9f, + 0x8c, 0x90, 0xbe, 0x40, 0x2d, 0xdf, 0x6d, 0x86, + 0x97, 0xf4, 0xf8, 0x80, 0xdf, 0xf1, 0x5b, 0xfb, + 0x7a, 0x6b, 0x28, 0x24, 0x1e, 0xc8, 0xfe, 0x18, + 0x3c, 0x2d, 0x59, 0xe3, 0xf9, 0xdf, 0xff, 0x65, + 0x3c, 0x71, 0x26, 0xf0, 0xac, 0xb9, 0xe6, 0x42, + 0x11, 0xf4, 0x2b, 0xae, 0x12, 0xaf, 0x46, 0x2b, + 0x10, 0x70, 0xbe, 0xf1, 0xab, 0x5e, 0x36, 0x06, + 0x87, 0x2c, 0xa1, 0x0d, 0xee, 0x15, 0xb3, 0x24, + 0x9b, 0x1a, 0x1b, 0x95, 0x8f, 0x23, 0x13, 0x4c, + 0x4b, 0xcc, 0xb7, 0xd0, 0x32, 0x00, 0xbc, 0xe4, + 0x20, 0xa2, 0xf8, 0xeb, 0x66, 0xdc, 0xf3, 0x64, + 0x4d, 0x14, 0x23, 0xc1, 0xb5, 0x69, 0x90, 0x03, + 0xc1, 0x3e, 0xce, 0xf4, 0xbf, 0x38, 0xa3, 0xb6, + 0x0e, 0xed, 0xc3, 0x40, 0x33, 0xba, 0xc1, 0x90, + 0x27, 0x83, 0xdc, 0x6d, 0x89, 0xe2, 0xe7, 0x74, + 0x18, 0x8a, 0x43, 0x9c, 0x7e, 0xbc, 0xc0, 0x67, + 0x2d, 0xbd, 0xa4, 0xdd, 0xcf, 0xb2, 0x79, 0x46, + 0x13, 0xb0, 0xbe, 0x41, 0x31, 0x5e, 0xf7, 0x78, + 0x70, 0x8a, 0x70, 0xee, 0x7d, 0x75, 0x16, 0x5c +}; + 
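The vectors above are driven through the CRYPTO_gcm128_* calls collected in the TEST_CASE() macro further below. Spelled out for a single one-shot message, and assuming the public <openssl/aes.h> and <openssl/modes.h> interfaces that this selftest also relies on, the flow is roughly the following (gcm_seal() is a made-up wrapper name, not part of the API):

    #include <stddef.h>
    #include <openssl/aes.h>
    #include <openssl/modes.h>

    /* One-shot AES-128-GCM encryption through the low-level gcm128 helpers.
     * Returns 1 on success, 0 on failure; the tag is always 16 bytes here. */
    static int gcm_seal(const unsigned char key[16],
                        const unsigned char *iv, size_t ivlen,
                        const unsigned char *aad, size_t aadlen,
                        const unsigned char *pt, unsigned char *ct, size_t len,
                        unsigned char tag[16])
    {
        AES_KEY aes;
        GCM128_CONTEXT *gctx;
        int ok = 0;

        if (AES_set_encrypt_key(key, 128, &aes) != 0)
            return 0;
        if ((gctx = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt)) == NULL)
            return 0;
        CRYPTO_gcm128_setiv(gctx, iv, ivlen);
        if ((aadlen == 0 || CRYPTO_gcm128_aad(gctx, aad, aadlen) == 0)
            && (len == 0 || CRYPTO_gcm128_encrypt(gctx, pt, ct, len) == 0)) {
            CRYPTO_gcm128_tag(gctx, tag, 16);   /* finalizes and copies tag */
            ok = 1;
        }
        CRYPTO_gcm128_release(gctx);            /* clears and frees the context */
        return ok;
    }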
+static const u8 T20[] = { + 0x8b, 0x30, 0x7f, 0x6b, 0x33, 0x28, 0x6d, 0x0a, + 0xb0, 0x26, 0xa9, 0xed, 0x3f, 0xe1, 0xe8, 0x5f +}; + +# define TEST_CASE(n) do { \ + u8 out[sizeof(P##n)]; \ + AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \ + CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \ + CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ + memset(out,0,sizeof(out)); \ + if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ + if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \ + if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ + (C##n && memcmp(out,C##n,sizeof(out)))) \ + ret++, printf ("encrypt test#%d failed.\n",n); \ + CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ + memset(out,0,sizeof(out)); \ + if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ + if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \ + if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ + (P##n && memcmp(out,P##n,sizeof(out)))) \ + ret++, printf ("decrypt test#%d failed.\n",n); \ + } while(0) + +int main() +{ + GCM128_CONTEXT ctx; + AES_KEY key; + int ret = 0; + + TEST_CASE(1); + TEST_CASE(2); + TEST_CASE(3); + TEST_CASE(4); + TEST_CASE(5); + TEST_CASE(6); + TEST_CASE(7); + TEST_CASE(8); + TEST_CASE(9); + TEST_CASE(10); + TEST_CASE(11); + TEST_CASE(12); + TEST_CASE(13); + TEST_CASE(14); + TEST_CASE(15); + TEST_CASE(16); + TEST_CASE(17); + TEST_CASE(18); + TEST_CASE(19); + TEST_CASE(20); + +# ifdef OPENSSL_CPUID_OBJ + { + size_t start, stop, gcm_t, ctr_t, OPENSSL_rdtsc(); + union { + u64 u; + u8 c[1024]; + } buf; + int i; + + AES_set_encrypt_key(K1, sizeof(K1) * 8, &key); + CRYPTO_gcm128_init(&ctx, &key, (block128_f) AES_encrypt); + CRYPTO_gcm128_setiv(&ctx, IV1, sizeof(IV1)); + + CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf)); + start = OPENSSL_rdtsc(); + CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf)); + gcm_t = OPENSSL_rdtsc() - start; + + CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf), + &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres, + (block128_f) AES_encrypt); + start = OPENSSL_rdtsc(); + CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf), + &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres, + (block128_f) AES_encrypt); + ctr_t = OPENSSL_rdtsc() - start; + + printf("%.2f-%.2f=%.2f\n", + gcm_t / (double)sizeof(buf), + ctr_t / (double)sizeof(buf), + (gcm_t - ctr_t) / (double)sizeof(buf)); +# ifdef GHASH + { + void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16], + const u8 *inp, size_t len) = ctx.ghash; + + GHASH((&ctx), buf.c, sizeof(buf)); + start = OPENSSL_rdtsc(); + for (i = 0; i < 100; ++i) + GHASH((&ctx), buf.c, sizeof(buf)); + gcm_t = OPENSSL_rdtsc() - start; + printf("%.2f\n", gcm_t / (double)sizeof(buf) / (double)i); + } +# endif + } +# endif + + return ret; +} +#endif diff --git a/openssl-1.1.0h/crypto/modes/modes_lcl.h b/openssl-1.1.0h/crypto/modes/modes_lcl.h new file mode 100644 index 0000000..7a1603b --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/modes_lcl.h @@ -0,0 +1,185 @@ +/* + * Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include + +#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) +typedef __int64 i64; +typedef unsigned __int64 u64; +# define U64(C) C##UI64 +#elif defined(__arch64__) +typedef long i64; +typedef unsigned long u64; +# define U64(C) C##UL +#else +typedef long long i64; +typedef unsigned long long u64; +# define U64(C) C##ULL +#endif + +typedef unsigned int u32; +typedef unsigned char u8; + +#define STRICT_ALIGNMENT 1 +#ifndef PEDANTIC +# if defined(__i386) || defined(__i386__) || \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ + defined(__aarch64__) || \ + defined(__s390__) || defined(__s390x__) +# undef STRICT_ALIGNMENT +# endif +#endif + +#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) +# if defined(__GNUC__) && __GNUC__>=2 +# if defined(__x86_64) || defined(__x86_64__) +# define BSWAP8(x) ({ u64 ret_=(x); \ + asm ("bswapq %0" \ + : "+r"(ret_)); ret_; }) +# define BSWAP4(x) ({ u32 ret_=(x); \ + asm ("bswapl %0" \ + : "+r"(ret_)); ret_; }) +# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY) +# define BSWAP8(x) ({ u32 lo_=(u64)(x)>>32,hi_=(x); \ + asm ("bswapl %0; bswapl %1" \ + : "+r"(hi_),"+r"(lo_)); \ + (u64)hi_<<32|lo_; }) +# define BSWAP4(x) ({ u32 ret_=(x); \ + asm ("bswapl %0" \ + : "+r"(ret_)); ret_; }) +# elif defined(__aarch64__) +# define BSWAP8(x) ({ u64 ret_; \ + asm ("rev %0,%1" \ + : "=r"(ret_) : "r"(x)); ret_; }) +# define BSWAP4(x) ({ u32 ret_; \ + asm ("rev %w0,%w1" \ + : "=r"(ret_) : "r"(x)); ret_; }) +# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT) +# define BSWAP8(x) ({ u32 lo_=(u64)(x)>>32,hi_=(x); \ + asm ("rev %0,%0; rev %1,%1" \ + : "+r"(hi_),"+r"(lo_)); \ + (u64)hi_<<32|lo_; }) +# define BSWAP4(x) ({ u32 ret_; \ + asm ("rev %0,%1" \ + : "=r"(ret_) : "r"((u32)(x))); \ + ret_; }) +# endif +# elif defined(_MSC_VER) +# if _MSC_VER>=1300 +# pragma intrinsic(_byteswap_uint64,_byteswap_ulong) +# define BSWAP8(x) _byteswap_uint64((u64)(x)) +# define BSWAP4(x) _byteswap_ulong((u32)(x)) +# elif defined(_M_IX86) +__inline u32 _bswap4(u32 val) +{ +_asm mov eax, val _asm bswap eax} +# define BSWAP4(x) _bswap4(x) +# endif +# endif +#endif +#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT) +# define GETU32(p) BSWAP4(*(const u32 *)(p)) +# define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) +#else +# define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) +# define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) +#endif +/*- GCM definitions */ typedef struct { + u64 hi, lo; +} u128; + +#ifdef TABLE_BITS +# undef TABLE_BITS +#endif +/* + * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should + * never be set to 8 [or 1]. For further information see gcm128.c. + */ +#define TABLE_BITS 4 + +struct gcm128_context { + /* Following 6 names follow names in GCM specification */ + union { + u64 u[2]; + u32 d[4]; + u8 c[16]; + size_t t[16 / sizeof(size_t)]; + } Yi, EKi, EK0, len, Xi, H; + /* + * Relative position of Xi, H and pre-computed Htable is used in some + * assembler modules, i.e. don't change the order! 
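The GETU32/PUTU32 fallbacks defined above do 32-bit big-endian loads and stores one byte at a time for builds without BSWAP4 or with STRICT_ALIGNMENT in effect. A minimal, self-contained round-trip check (the two macros are repeated verbatim so the snippet compiles on its own):

    #include <stdio.h>

    typedef unsigned int u32;
    typedef unsigned char u8;

    #define GETU32(p)   ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
    #define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))

    int main(void)
    {
        u8 buf[4];

        PUTU32(buf, 0x12345678u);       /* big-endian, byte-by-byte store */
        printf("%02x %02x %02x %02x -> %08x\n",
               buf[0], buf[1], buf[2], buf[3], GETU32(buf));
        return 0;                       /* prints: 12 34 56 78 -> 12345678 */
    }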
+ */ +#if TABLE_BITS==8 + u128 Htable[256]; +#else + u128 Htable[16]; + void (*gmult) (u64 Xi[2], const u128 Htable[16]); + void (*ghash) (u64 Xi[2], const u128 Htable[16], const u8 *inp, + size_t len); +#endif + unsigned int mres, ares; + block128_f block; + void *key; +}; + +struct xts128_context { + void *key1, *key2; + block128_f block1, block2; +}; + +struct ccm128_context { + union { + u64 u[2]; + u8 c[16]; + } nonce, cmac; + u64 blocks; + block128_f block; + void *key; +}; + +#ifndef OPENSSL_NO_OCB + +typedef union { + u64 a[2]; + unsigned char c[16]; +} OCB_BLOCK; +# define ocb_block16_xor(in1,in2,out) \ + ( (out)->a[0]=(in1)->a[0]^(in2)->a[0], \ + (out)->a[1]=(in1)->a[1]^(in2)->a[1] ) +# if STRICT_ALIGNMENT +# define ocb_block16_xor_misaligned(in1,in2,out) \ + ocb_block_xor((in1)->c,(in2)->c,16,(out)->c) +# else +# define ocb_block16_xor_misaligned ocb_block16_xor +# endif + +struct ocb128_context { + /* Need both encrypt and decrypt key schedules for decryption */ + block128_f encrypt; + block128_f decrypt; + void *keyenc; + void *keydec; + ocb128_f stream; /* direction dependent */ + /* Key dependent variables. Can be reused if key remains the same */ + size_t l_index; + size_t max_l_index; + OCB_BLOCK l_star; + OCB_BLOCK l_dollar; + OCB_BLOCK *l; + /* Must be reset for each session */ + u64 blocks_hashed; + u64 blocks_processed; + OCB_BLOCK tag; + OCB_BLOCK offset_aad; + OCB_BLOCK sum; + OCB_BLOCK offset; + OCB_BLOCK checksum; +}; +#endif /* OPENSSL_NO_OCB */ diff --git a/openssl-1.1.0h/crypto/modes/ocb128.c b/openssl-1.1.0h/crypto/modes/ocb128.c new file mode 100644 index 0000000..db794d0 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/ocb128.c @@ -0,0 +1,568 @@ +/* + * Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include +#include "modes_lcl.h" + +#ifndef OPENSSL_NO_OCB + +/* + * Calculate the number of binary trailing zero's in any given number + */ +static u32 ocb_ntz(u64 n) +{ + u32 cnt = 0; + + /* + * We do a right-to-left simple sequential search. This is surprisingly + * efficient as the distribution of trailing zeros is not uniform, + * e.g. the number of possible inputs with no trailing zeros is equal to + * the number with 1 or more; the number with exactly 1 is equal to the + * number with 2 or more, etc. Checking the last two bits covers 75% of + * all numbers. Checking the last three covers 87.5% + */ + while (!(n & 1)) { + n >>= 1; + cnt++; + } + return cnt; +} + +/* + * Shift a block of 16 bytes left by shift bits + */ +static void ocb_block_lshift(const unsigned char *in, size_t shift, + unsigned char *out) +{ + unsigned char shift_mask; + int i; + unsigned char mask[15]; + + shift_mask = 0xff; + shift_mask <<= (8 - shift); + for (i = 15; i >= 0; i--) { + if (i > 0) { + mask[i - 1] = in[i] & shift_mask; + mask[i - 1] >>= 8 - shift; + } + out[i] = in[i] << shift; + + if (i != 15) { + out[i] ^= mask[i]; + } + } +} + +/* + * Perform a "double" operation as per OCB spec + */ +static void ocb_double(OCB_BLOCK *in, OCB_BLOCK *out) +{ + unsigned char mask; + + /* + * Calculate the mask based on the most significant bit. 
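ocb_ntz() above counts trailing zero bits with a simple right-to-left loop, relying on the skewed distribution of ntz over block indices. As a cross-check only, and assuming a GCC/Clang compiler (since __builtin_ctzll is not ISO C), the loop can be compared against the builtin over the first few thousand indices:

    #include <stdio.h>

    /* Local copy of the ocb_ntz() loop, compared against __builtin_ctzll().
     * Both are undefined for n == 0, which OCB never passes (block indices
     * start at 1). */
    static unsigned int ntz_loop(unsigned long long n)
    {
        unsigned int cnt = 0;

        while (!(n & 1)) {
            n >>= 1;
            cnt++;
        }
        return cnt;
    }

    int main(void)
    {
        unsigned long long i;

        for (i = 1; i <= 4096; i++)
            if (ntz_loop(i) != (unsigned int)__builtin_ctzll(i))
                printf("mismatch at %llu\n", i);
        return 0;
    }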
There are more + * efficient ways to do this - but this way is constant time + */ + mask = in->c[0] & 0x80; + mask >>= 7; + mask *= 135; + + ocb_block_lshift(in->c, 1, out->c); + + out->c[15] ^= mask; +} + +/* + * Perform an xor on in1 and in2 - each of len bytes. Store result in out + */ +static void ocb_block_xor(const unsigned char *in1, + const unsigned char *in2, size_t len, + unsigned char *out) +{ + size_t i; + for (i = 0; i < len; i++) { + out[i] = in1[i] ^ in2[i]; + } +} + +/* + * Lookup L_index in our lookup table. If we haven't already got it we need to + * calculate it + */ +static OCB_BLOCK *ocb_lookup_l(OCB128_CONTEXT *ctx, size_t idx) +{ + size_t l_index = ctx->l_index; + + if (idx <= l_index) { + return ctx->l + idx; + } + + /* We don't have it - so calculate it */ + if (idx >= ctx->max_l_index) { + void *tmp_ptr; + /* + * Each additional entry allows to process almost double as + * much data, so that in linear world the table will need to + * be expanded with smaller and smaller increments. Originally + * it was doubling in size, which was a waste. Growing it + * linearly is not formally optimal, but is simpler to implement. + * We grow table by minimally required 4*n that would accommodate + * the index. + */ + ctx->max_l_index += (idx - ctx->max_l_index + 4) & ~3; + tmp_ptr = + OPENSSL_realloc(ctx->l, ctx->max_l_index * sizeof(OCB_BLOCK)); + if (tmp_ptr == NULL) /* prevent ctx->l from being clobbered */ + return NULL; + ctx->l = tmp_ptr; + } + while (l_index < idx) { + ocb_double(ctx->l + l_index, ctx->l + l_index + 1); + l_index++; + } + ctx->l_index = l_index; + + return ctx->l + idx; +} + +/* + * Create a new OCB128_CONTEXT + */ +OCB128_CONTEXT *CRYPTO_ocb128_new(void *keyenc, void *keydec, + block128_f encrypt, block128_f decrypt, + ocb128_f stream) +{ + OCB128_CONTEXT *octx; + int ret; + + if ((octx = OPENSSL_malloc(sizeof(*octx))) != NULL) { + ret = CRYPTO_ocb128_init(octx, keyenc, keydec, encrypt, decrypt, + stream); + if (ret) + return octx; + OPENSSL_free(octx); + } + + return NULL; +} + +/* + * Initialise an existing OCB128_CONTEXT + */ +int CRYPTO_ocb128_init(OCB128_CONTEXT *ctx, void *keyenc, void *keydec, + block128_f encrypt, block128_f decrypt, + ocb128_f stream) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->l_index = 0; + ctx->max_l_index = 5; + ctx->l = OPENSSL_malloc(ctx->max_l_index * 16); + if (ctx->l == NULL) + return 0; + + /* + * We set both the encryption and decryption key schedules - decryption + * needs both. 
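The doubling in ocb_double() above avoids a branch on secret data: rather than "if the top bit was set, XOR in 0x87", it derives a 0x00-or-0x87 byte arithmetically from the most significant bit (135 == 0x87, the low byte of the reduction polynomial x^128 + x^7 + x^2 + x + 1 used by OCB's doubling). Isolated, with a two-value check, the idiom is:

    #include <stdio.h>

    /* The branch-free reduction byte used by ocb_double(): 0x87 if the
     * block's top bit is set (the left shift carried out of 128 bits),
     * 0x00 otherwise. */
    static unsigned char reduction_byte(unsigned char msb_byte)
    {
        unsigned char mask = msb_byte & 0x80;

        mask >>= 7;     /* 0 or 1 */
        mask *= 135;    /* 0x00 or 0x87, without branching on the secret bit */
        return mask;
    }

    int main(void)
    {
        printf("%02x %02x\n", reduction_byte(0x7f), reduction_byte(0x80));
        return 0;       /* prints: 00 87 */
    }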
Don't really need decryption schedule if only doing + * encryption - but it simplifies things to take it anyway + */ + ctx->encrypt = encrypt; + ctx->decrypt = decrypt; + ctx->stream = stream; + ctx->keyenc = keyenc; + ctx->keydec = keydec; + + /* L_* = ENCIPHER(K, zeros(128)) */ + ctx->encrypt(ctx->l_star.c, ctx->l_star.c, ctx->keyenc); + + /* L_$ = double(L_*) */ + ocb_double(&ctx->l_star, &ctx->l_dollar); + + /* L_0 = double(L_$) */ + ocb_double(&ctx->l_dollar, ctx->l); + + /* L_{i} = double(L_{i-1}) */ + ocb_double(ctx->l, ctx->l+1); + ocb_double(ctx->l+1, ctx->l+2); + ocb_double(ctx->l+2, ctx->l+3); + ocb_double(ctx->l+3, ctx->l+4); + ctx->l_index = 4; /* enough to process up to 496 bytes */ + + return 1; +} + +/* + * Copy an OCB128_CONTEXT object + */ +int CRYPTO_ocb128_copy_ctx(OCB128_CONTEXT *dest, OCB128_CONTEXT *src, + void *keyenc, void *keydec) +{ + memcpy(dest, src, sizeof(OCB128_CONTEXT)); + if (keyenc) + dest->keyenc = keyenc; + if (keydec) + dest->keydec = keydec; + if (src->l) { + dest->l = OPENSSL_malloc(src->max_l_index * 16); + if (dest->l == NULL) + return 0; + memcpy(dest->l, src->l, (src->l_index + 1) * 16); + } + return 1; +} + +/* + * Set the IV to be used for this operation. Must be 1 - 15 bytes. + */ +int CRYPTO_ocb128_setiv(OCB128_CONTEXT *ctx, const unsigned char *iv, + size_t len, size_t taglen) +{ + unsigned char ktop[16], tmp[16], mask; + unsigned char stretch[24], nonce[16]; + size_t bottom, shift; + + /* + * Spec says IV is 120 bits or fewer - it allows non byte aligned lengths. + * We don't support this at this stage + */ + if ((len > 15) || (len < 1) || (taglen > 16) || (taglen < 1)) { + return -1; + } + + /* Nonce = num2str(TAGLEN mod 128,7) || zeros(120-bitlen(N)) || 1 || N */ + nonce[0] = ((taglen * 8) % 128) << 1; + memset(nonce + 1, 0, 15); + memcpy(nonce + 16 - len, iv, len); + nonce[15 - len] |= 1; + + /* Ktop = ENCIPHER(K, Nonce[1..122] || zeros(6)) */ + memcpy(tmp, nonce, 16); + tmp[15] &= 0xc0; + ctx->encrypt(tmp, ktop, ctx->keyenc); + + /* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */ + memcpy(stretch, ktop, 16); + ocb_block_xor(ktop, ktop + 1, 8, stretch + 16); + + /* bottom = str2num(Nonce[123..128]) */ + bottom = nonce[15] & 0x3f; + + /* Offset_0 = Stretch[1+bottom..128+bottom] */ + shift = bottom % 8; + ocb_block_lshift(stretch + (bottom / 8), shift, ctx->offset.c); + mask = 0xff; + mask <<= 8 - shift; + ctx->offset.c[15] |= + (*(stretch + (bottom / 8) + 16) & mask) >> (8 - shift); + + return 1; +} + +/* + * Provide any AAD. This can be called multiple times. Only the final time can + * have a partial block + */ +int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad, + size_t len) +{ + u64 i, all_num_blocks; + size_t num_blocks, last_len; + OCB_BLOCK tmp; + + /* Calculate the number of blocks of AAD provided now, and so far */ + num_blocks = len / 16; + all_num_blocks = num_blocks + ctx->blocks_hashed; + + /* Loop through all full blocks of AAD */ + for (i = ctx->blocks_hashed + 1; i <= all_num_blocks; i++) { + OCB_BLOCK *lookup; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + lookup = ocb_lookup_l(ctx, ocb_ntz(i)); + if (lookup == NULL) + return 0; + ocb_block16_xor(&ctx->offset_aad, lookup, &ctx->offset_aad); + + memcpy(tmp.c, aad, 16); + aad += 16; + + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + ocb_block16_xor(&ctx->offset_aad, &tmp, &tmp); + ctx->encrypt(tmp.c, tmp.c, ctx->keyenc); + ocb_block16_xor(&tmp, &ctx->sum, &ctx->sum); + } + + /* + * Check if we have any partial blocks left over. 
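CRYPTO_ocb128_init() above pre-computes L_0 through L_4 and notes that this is "enough to process up to 496 bytes". That follows because every block index from 1 to 31 has ntz(i) <= 4, so the first 31 blocks (31 * 16 = 496 bytes) never reach past L_4; index 32 is the first to need L_5. A few lines of C confirm the arithmetic:

    #include <stdio.h>

    static unsigned int ntz(unsigned long long n)
    {
        unsigned int cnt = 0;

        while (!(n & 1)) {
            n >>= 1;
            cnt++;
        }
        return cnt;
    }

    int main(void)
    {
        unsigned long long i;

        for (i = 1; ntz(i) <= 4; i++)
            ;                           /* walk block indices served by L_0..L_4 */
        printf("first index needing L_5: %llu (%llu bytes fit before that)\n",
               i, (i - 1) * 16);
        return 0;                       /* prints: 32 (496 bytes fit before that) */
    }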
This is only valid in the + * last call to this function + */ + last_len = len % 16; + + if (last_len > 0) { + /* Offset_* = Offset_m xor L_* */ + ocb_block16_xor(&ctx->offset_aad, &ctx->l_star, &ctx->offset_aad); + + /* CipherInput = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */ + memset(tmp.c, 0, 16); + memcpy(tmp.c, aad, last_len); + tmp.c[last_len] = 0x80; + ocb_block16_xor(&ctx->offset_aad, &tmp, &tmp); + + /* Sum = Sum_m xor ENCIPHER(K, CipherInput) */ + ctx->encrypt(tmp.c, tmp.c, ctx->keyenc); + ocb_block16_xor(&tmp, &ctx->sum, &ctx->sum); + } + + ctx->blocks_hashed = all_num_blocks; + + return 1; +} + +/* + * Provide any data to be encrypted. This can be called multiple times. Only + * the final time can have a partial block + */ +int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len) +{ + u64 i, all_num_blocks; + size_t num_blocks, last_len; + + /* + * Calculate the number of blocks of data to be encrypted provided now, and + * so far + */ + num_blocks = len / 16; + all_num_blocks = num_blocks + ctx->blocks_processed; + + if (num_blocks && all_num_blocks == (size_t)all_num_blocks + && ctx->stream != NULL) { + size_t max_idx = 0, top = (size_t)all_num_blocks; + + /* + * See how many L_{i} entries we need to process data at hand + * and pre-compute missing entries in the table [if any]... + */ + while (top >>= 1) + max_idx++; + if (ocb_lookup_l(ctx, max_idx) == NULL) + return 0; + + ctx->stream(in, out, num_blocks, ctx->keyenc, + (size_t)ctx->blocks_processed + 1, ctx->offset.c, + (const unsigned char (*)[16])ctx->l, ctx->checksum.c); + } else { + /* Loop through all full blocks to be encrypted */ + for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) { + OCB_BLOCK *lookup; + OCB_BLOCK tmp; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + lookup = ocb_lookup_l(ctx, ocb_ntz(i)); + if (lookup == NULL) + return 0; + ocb_block16_xor(&ctx->offset, lookup, &ctx->offset); + + memcpy(tmp.c, in, 16); + in += 16; + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ocb_block16_xor(&tmp, &ctx->checksum, &ctx->checksum); + + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + ocb_block16_xor(&ctx->offset, &tmp, &tmp); + ctx->encrypt(tmp.c, tmp.c, ctx->keyenc); + ocb_block16_xor(&ctx->offset, &tmp, &tmp); + + memcpy(out, tmp.c, 16); + out += 16; + } + } + + /* + * Check if we have any partial blocks left over. This is only valid in the + * last call to this function + */ + last_len = len % 16; + + if (last_len > 0) { + OCB_BLOCK pad; + + /* Offset_* = Offset_m xor L_* */ + ocb_block16_xor(&ctx->offset, &ctx->l_star, &ctx->offset); + + /* Pad = ENCIPHER(K, Offset_*) */ + ctx->encrypt(ctx->offset.c, pad.c, ctx->keyenc); + + /* C_* = P_* xor Pad[1..bitlen(P_*)] */ + ocb_block_xor(in, pad.c, last_len, out); + + /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */ + memset(pad.c, 0, 16); /* borrow pad */ + memcpy(pad.c, in, last_len); + pad.c[last_len] = 0x80; + ocb_block16_xor(&pad, &ctx->checksum, &ctx->checksum); + } + + ctx->blocks_processed = all_num_blocks; + + return 1; +} + +/* + * Provide any data to be decrypted. This can be called multiple times. 
Only + * the final time can have a partial block + */ +int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len) +{ + u64 i, all_num_blocks; + size_t num_blocks, last_len; + + /* + * Calculate the number of blocks of data to be decrypted provided now, and + * so far + */ + num_blocks = len / 16; + all_num_blocks = num_blocks + ctx->blocks_processed; + + if (num_blocks && all_num_blocks == (size_t)all_num_blocks + && ctx->stream != NULL) { + size_t max_idx = 0, top = (size_t)all_num_blocks; + + /* + * See how many L_{i} entries we need to process data at hand + * and pre-compute missing entries in the table [if any]... + */ + while (top >>= 1) + max_idx++; + if (ocb_lookup_l(ctx, max_idx) == NULL) + return 0; + + ctx->stream(in, out, num_blocks, ctx->keydec, + (size_t)ctx->blocks_processed + 1, ctx->offset.c, + (const unsigned char (*)[16])ctx->l, ctx->checksum.c); + } else { + OCB_BLOCK tmp; + + /* Loop through all full blocks to be decrypted */ + for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) { + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + OCB_BLOCK *lookup = ocb_lookup_l(ctx, ocb_ntz(i)); + if (lookup == NULL) + return 0; + ocb_block16_xor(&ctx->offset, lookup, &ctx->offset); + + memcpy(tmp.c, in, 16); + in += 16; + + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ + ocb_block16_xor(&ctx->offset, &tmp, &tmp); + ctx->decrypt(tmp.c, tmp.c, ctx->keydec); + ocb_block16_xor(&ctx->offset, &tmp, &tmp); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ocb_block16_xor(&tmp, &ctx->checksum, &ctx->checksum); + + memcpy(out, tmp.c, 16); + out += 16; + } + } + + /* + * Check if we have any partial blocks left over. This is only valid in the + * last call to this function + */ + last_len = len % 16; + + if (last_len > 0) { + OCB_BLOCK pad; + + /* Offset_* = Offset_m xor L_* */ + ocb_block16_xor(&ctx->offset, &ctx->l_star, &ctx->offset); + + /* Pad = ENCIPHER(K, Offset_*) */ + ctx->encrypt(ctx->offset.c, pad.c, ctx->keyenc); + + /* P_* = C_* xor Pad[1..bitlen(C_*)] */ + ocb_block_xor(in, pad.c, last_len, out); + + /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */ + memset(pad.c, 0, 16); /* borrow pad */ + memcpy(pad.c, out, last_len); + pad.c[last_len] = 0x80; + ocb_block16_xor(&pad, &ctx->checksum, &ctx->checksum); + } + + ctx->blocks_processed = all_num_blocks; + + return 1; +} + +/* + * Calculate the tag and verify it against the supplied tag + */ +int CRYPTO_ocb128_finish(OCB128_CONTEXT *ctx, const unsigned char *tag, + size_t len) +{ + OCB_BLOCK tmp; + + /* + * Tag = ENCIPHER(K, Checksum_* xor Offset_* xor L_$) xor HASH(K,A) + */ + ocb_block16_xor(&ctx->checksum, &ctx->offset, &tmp); + ocb_block16_xor(&ctx->l_dollar, &tmp, &tmp); + ctx->encrypt(tmp.c, tmp.c, ctx->keyenc); + ocb_block16_xor(&tmp, &ctx->sum, &ctx->tag); + + if (len > 16 || len < 1) { + return -1; + } + + /* Compare the tag if we've been given one */ + if (tag) + return CRYPTO_memcmp(&ctx->tag, tag, len); + else + return -1; +} + +/* + * Retrieve the calculated tag + */ +int CRYPTO_ocb128_tag(OCB128_CONTEXT *ctx, unsigned char *tag, size_t len) +{ + if (len > 16 || len < 1) { + return -1; + } + + /* Calculate the tag */ + CRYPTO_ocb128_finish(ctx, NULL, 0); + + /* Copy the tag into the supplied buffer */ + memcpy(tag, ctx->tag.c, len); + + return 1; +} + +/* + * Release all resources + */ +void CRYPTO_ocb128_cleanup(OCB128_CONTEXT *ctx) +{ + if (ctx) { + OPENSSL_clear_free(ctx->l, ctx->max_l_index * 16); + OPENSSL_cleanse(ctx, 
sizeof(*ctx)); + } +} + +#endif /* OPENSSL_NO_OCB */ diff --git a/openssl-1.1.0h/crypto/modes/ofb128.c b/openssl-1.1.0h/crypto/modes/ofb128.c new file mode 100644 index 0000000..8309256 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/ofb128.c @@ -0,0 +1,74 @@ +/* + * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include "modes_lcl.h" +#include + +/* + * The input and output encrypted as though 128bit ofb mode is being used. + * The extra state information to record how much of the 128bit block we have + * used is contained in *num; + */ +void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], int *num, block128_f block) +{ + unsigned int n; + size_t l = 0; + + n = *num; + +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (16 % sizeof(size_t) == 0) { /* always true actually */ + do { + while (n && len) { + *(out++) = *(in++) ^ ivec[n]; + --len; + n = (n + 1) % 16; + } +# if defined(STRICT_ALIGNMENT) + if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != + 0) + break; +# endif + while (len >= 16) { + (*block) (ivec, ivec, key); + for (; n < 16; n += sizeof(size_t)) + *(size_t *)(out + n) = + *(size_t *)(in + n) ^ *(size_t *)(ivec + n); + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) { + (*block) (ivec, ivec, key); + while (len--) { + out[n] = in[n] ^ ivec[n]; + ++n; + } + } + *num = n; + return; + } while (0); + } + /* the rest would be commonly eliminated by x86* compiler */ +#endif + while (l < len) { + if (n == 0) { + (*block) (ivec, ivec, key); + } + out[l] = in[l] ^ ivec[n]; + ++l; + n = (n + 1) % 16; + } + + *num = n; +} diff --git a/openssl-1.1.0h/crypto/modes/wrap128.c b/openssl-1.1.0h/crypto/modes/wrap128.c new file mode 100644 index 0000000..46809a0 --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/wrap128.c @@ -0,0 +1,329 @@ +/* + * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/** Beware! + * + * Following wrapping modes were designed for AES but this implementation + * allows you to use them for any 128 bit block cipher. + */ + +#include "internal/cryptlib.h" +#include + +/** RFC 3394 section 2.2.3.1 Default Initial Value */ +static const unsigned char default_iv[] = { + 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, +}; + +/** RFC 5649 section 3 Alternative Initial Value 32-bit constant */ +static const unsigned char default_aiv[] = { + 0xA6, 0x59, 0x59, 0xA6 +}; + +/** Input size limit: lower than maximum of standards but far larger than + * anything that will be used in practice. + */ +#define CRYPTO128_WRAP_MAX (1UL << 31) + +/** Wrapping according to RFC 3394 section 2.2.1. + * + * @param[in] key Key value. + * @param[in] iv IV value. Length = 8 bytes. NULL = use default_iv. + * @param[in] in Plaintext as n 64-bit blocks, n >= 2. + * @param[in] inlen Length of in. + * @param[out] out Ciphertext. Minimal buffer length = (inlen + 8) bytes. 
+ * Input and output buffers can overlap if block function + * supports that. + * @param[in] block Block processing function. + * @return 0 if inlen does not consist of n 64-bit blocks, n >= 2. + * or if inlen > CRYPTO128_WRAP_MAX. + * Output length if wrapping succeeded. + */ +size_t CRYPTO_128_wrap(void *key, const unsigned char *iv, + unsigned char *out, + const unsigned char *in, size_t inlen, + block128_f block) +{ + unsigned char *A, B[16], *R; + size_t i, j, t; + if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX)) + return 0; + A = B; + t = 1; + memmove(out + 8, in, inlen); + if (!iv) + iv = default_iv; + + memcpy(A, iv, 8); + + for (j = 0; j < 6; j++) { + R = out + 8; + for (i = 0; i < inlen; i += 8, t++, R += 8) { + memcpy(B + 8, R, 8); + block(B, B, key); + A[7] ^= (unsigned char)(t & 0xff); + if (t > 0xff) { + A[6] ^= (unsigned char)((t >> 8) & 0xff); + A[5] ^= (unsigned char)((t >> 16) & 0xff); + A[4] ^= (unsigned char)((t >> 24) & 0xff); + } + memcpy(R, B + 8, 8); + } + } + memcpy(out, A, 8); + return inlen + 8; +} + +/** Unwrapping according to RFC 3394 section 2.2.2 steps 1-2. + * The IV check (step 3) is responsibility of the caller. + * + * @param[in] key Key value. + * @param[out] iv Unchecked IV value. Minimal buffer length = 8 bytes. + * @param[out] out Plaintext without IV. + * Minimal buffer length = (inlen - 8) bytes. + * Input and output buffers can overlap if block function + * supports that. + * @param[in] in Ciphertext as n 64-bit blocks. + * @param[in] inlen Length of in. + * @param[in] block Block processing function. + * @return 0 if inlen is out of range [24, CRYPTO128_WRAP_MAX] + * or if inlen is not a multiple of 8. + * Output length otherwise. + */ +static size_t crypto_128_unwrap_raw(void *key, unsigned char *iv, + unsigned char *out, + const unsigned char *in, size_t inlen, + block128_f block) +{ + unsigned char *A, B[16], *R; + size_t i, j, t; + inlen -= 8; + if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX)) + return 0; + A = B; + t = 6 * (inlen >> 3); + memcpy(A, in, 8); + memmove(out, in + 8, inlen); + for (j = 0; j < 6; j++) { + R = out + inlen - 8; + for (i = 0; i < inlen; i += 8, t--, R -= 8) { + A[7] ^= (unsigned char)(t & 0xff); + if (t > 0xff) { + A[6] ^= (unsigned char)((t >> 8) & 0xff); + A[5] ^= (unsigned char)((t >> 16) & 0xff); + A[4] ^= (unsigned char)((t >> 24) & 0xff); + } + memcpy(B + 8, R, 8); + block(B, B, key); + memcpy(R, B + 8, 8); + } + } + memcpy(iv, A, 8); + return inlen; +} + +/** Unwrapping according to RFC 3394 section 2.2.2, including the IV check. + * The first block of plaintext has to match the supplied IV, otherwise an + * error is returned. + * + * @param[in] key Key value. + * @param[out] iv IV value to match against. Length = 8 bytes. + * NULL = use default_iv. + * @param[out] out Plaintext without IV. + * Minimal buffer length = (inlen - 8) bytes. + * Input and output buffers can overlap if block function + * supports that. + * @param[in] in Ciphertext as n 64-bit blocks. + * @param[in] inlen Length of in. + * @param[in] block Block processing function. + * @return 0 if inlen is out of range [24, CRYPTO128_WRAP_MAX] + * or if inlen is not a multiple of 8 + * or if IV doesn't match expected value. + * Output length otherwise. 
+ */ +size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv, + unsigned char *out, const unsigned char *in, + size_t inlen, block128_f block) +{ + size_t ret; + unsigned char got_iv[8]; + + ret = crypto_128_unwrap_raw(key, got_iv, out, in, inlen, block); + if (ret == 0) + return 0; + + if (!iv) + iv = default_iv; + if (CRYPTO_memcmp(got_iv, iv, 8)) { + OPENSSL_cleanse(out, ret); + return 0; + } + return ret; +} + +/** Wrapping according to RFC 5649 section 4.1. + * + * @param[in] key Key value. + * @param[in] icv (Non-standard) IV, 4 bytes. NULL = use default_aiv. + * @param[out] out Ciphertext. Minimal buffer length = (inlen + 15) bytes. + * Input and output buffers can overlap if block function + * supports that. + * @param[in] in Plaintext as n 64-bit blocks, n >= 2. + * @param[in] inlen Length of in. + * @param[in] block Block processing function. + * @return 0 if inlen is out of range [1, CRYPTO128_WRAP_MAX]. + * Output length if wrapping succeeded. + */ +size_t CRYPTO_128_wrap_pad(void *key, const unsigned char *icv, + unsigned char *out, + const unsigned char *in, size_t inlen, + block128_f block) +{ + /* n: number of 64-bit blocks in the padded key data + * + * If length of plain text is not a multiple of 8, pad the plain text octet + * string on the right with octets of zeros, where final length is the + * smallest multiple of 8 that is greater than length of plain text. + * If length of plain text is a multiple of 8, then there is no padding. */ + const size_t blocks_padded = (inlen + 7) / 8; /* CEILING(m/8) */ + const size_t padded_len = blocks_padded * 8; + const size_t padding_len = padded_len - inlen; + /* RFC 5649 section 3: Alternative Initial Value */ + unsigned char aiv[8]; + int ret; + + /* Section 1: use 32-bit fixed field for plaintext octet length */ + if (inlen == 0 || inlen >= CRYPTO128_WRAP_MAX) + return 0; + + /* Section 3: Alternative Initial Value */ + if (!icv) + memcpy(aiv, default_aiv, 4); + else + memcpy(aiv, icv, 4); /* Standard doesn't mention this. */ + + aiv[4] = (inlen >> 24) & 0xFF; + aiv[5] = (inlen >> 16) & 0xFF; + aiv[6] = (inlen >> 8) & 0xFF; + aiv[7] = inlen & 0xFF; + + if (padded_len == 8) { + /* + * Section 4.1 - special case in step 2: If the padded plaintext + * contains exactly eight octets, then prepend the AIV and encrypt + * the resulting 128-bit block using AES in ECB mode. + */ + memmove(out + 8, in, inlen); + memcpy(out, aiv, 8); + memset(out + 8 + inlen, 0, padding_len); + block(out, out, key); + ret = 16; /* AIV + padded input */ + } else { + memmove(out, in, inlen); + memset(out + inlen, 0, padding_len); /* Section 4.1 step 1 */ + ret = CRYPTO_128_wrap(key, aiv, out, out, padded_len, block); + } + + return ret; +} + +/** Unwrapping according to RFC 5649 section 4.2. + * + * @param[in] key Key value. + * @param[in] icv (Non-standard) IV, 4 bytes. NULL = use default_aiv. + * @param[out] out Plaintext. Minimal buffer length = inlen bytes. + * Input and output buffers can overlap if block function + * supports that. + * @param[in] in Ciphertext as n 64-bit blocks. + * @param[in] inlen Length of in. + * @param[in] block Block processing function. + * @return 0 if inlen is out of range [16, CRYPTO128_WRAP_MAX], + * or if inlen is not a multiple of 8 + * or if IV and message length indicator doesn't match. + * Output length if unwrapping succeeded and IV matches. 
+ */ +size_t CRYPTO_128_unwrap_pad(void *key, const unsigned char *icv, + unsigned char *out, + const unsigned char *in, size_t inlen, + block128_f block) +{ + /* n: number of 64-bit blocks in the padded key data */ + size_t n = inlen / 8 - 1; + size_t padded_len; + size_t padding_len; + size_t ptext_len; + /* RFC 5649 section 3: Alternative Initial Value */ + unsigned char aiv[8]; + static unsigned char zeros[8] = { 0x0 }; + size_t ret; + + /* Section 4.2: Ciphertext length has to be (n+1) 64-bit blocks. */ + if ((inlen & 0x7) != 0 || inlen < 16 || inlen >= CRYPTO128_WRAP_MAX) + return 0; + + memmove(out, in, inlen); + if (inlen == 16) { + /* + * Section 4.2 - special case in step 1: When n=1, the ciphertext + * contains exactly two 64-bit blocks and they are decrypted as a + * single AES block using AES in ECB mode: AIV | P[1] = DEC(K, C[0] | + * C[1]) + */ + block(out, out, key); + memcpy(aiv, out, 8); + /* Remove AIV */ + memmove(out, out + 8, 8); + padded_len = 8; + } else { + padded_len = inlen - 8; + ret = crypto_128_unwrap_raw(key, aiv, out, out, inlen, block); + if (padded_len != ret) { + OPENSSL_cleanse(out, inlen); + return 0; + } + } + + /* + * Section 3: AIV checks: Check that MSB(32,A) = A65959A6. Optionally a + * user-supplied value can be used (even if standard doesn't mention + * this). + */ + if ((!icv && CRYPTO_memcmp(aiv, default_aiv, 4)) + || (icv && CRYPTO_memcmp(aiv, icv, 4))) { + OPENSSL_cleanse(out, inlen); + return 0; + } + + /* + * Check that 8*(n-1) < LSB(32,AIV) <= 8*n. If so, let ptext_len = + * LSB(32,AIV). + */ + + ptext_len = ((unsigned int)aiv[4] << 24) + | ((unsigned int)aiv[5] << 16) + | ((unsigned int)aiv[6] << 8) + | (unsigned int)aiv[7]; + if (8 * (n - 1) >= ptext_len || ptext_len > 8 * n) { + OPENSSL_cleanse(out, inlen); + return 0; + } + + /* + * Check that the rightmost padding_len octets of the output data are + * zero. + */ + padding_len = padded_len - ptext_len; + if (CRYPTO_memcmp(out + ptext_len, zeros, padding_len) != 0) { + OPENSSL_cleanse(out, inlen); + return 0; + } + + /* Section 4.2 step 3: Remove padding */ + return ptext_len; +} diff --git a/openssl-1.1.0h/crypto/modes/xts128.c b/openssl-1.1.0h/crypto/modes/xts128.c new file mode 100644 index 0000000..81b1eac --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/xts128.c @@ -0,0 +1,157 @@ +/* + * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include "modes_lcl.h" +#include + +int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, + const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, + size_t len, int enc) +{ + const union { + long one; + char little; + } is_endian = { + 1 + }; + union { + u64 u[2]; + u32 d[4]; + u8 c[16]; + } tweak, scratch; + unsigned int i; + + if (len < 16) + return -1; + + memcpy(tweak.c, iv, 16); + + (*ctx->block2) (tweak.c, tweak.c, ctx->key2); + + if (!enc && (len % 16)) + len -= 16; + + while (len >= 16) { +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c, inp, 16); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; +#else + scratch.u[0] = ((u64 *)inp)[0] ^ tweak.u[0]; + scratch.u[1] = ((u64 *)inp)[1] ^ tweak.u[1]; +#endif + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out, scratch.c, 16); +#else + ((u64 *)out)[0] = scratch.u[0] ^= tweak.u[0]; + ((u64 *)out)[1] = scratch.u[1] ^= tweak.u[1]; +#endif + inp += 16; + out += 16; + len -= 16; + + if (len == 0) + return 0; + + if (is_endian.little) { + unsigned int carry, res; + + res = 0x87 & (((int)tweak.d[3]) >> 31); + carry = (unsigned int)(tweak.u[0] >> 63); + tweak.u[0] = (tweak.u[0] << 1) ^ res; + tweak.u[1] = (tweak.u[1] << 1) | carry; + } else { + size_t c; + + for (c = 0, i = 0; i < 16; ++i) { + /* + * + substitutes for |, because c is 1 bit + */ + c += ((size_t)tweak.c[i]) << 1; + tweak.c[i] = (u8)c; + c = c >> 8; + } + tweak.c[0] ^= (u8)(0x87 & (0 - c)); + } + } + if (enc) { + for (i = 0; i < len; ++i) { + u8 c = inp[i]; + out[i] = scratch.c[i]; + scratch.c[i] = c; + } + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out - 16, scratch.c, 16); + } else { + union { + u64 u[2]; + u8 c[16]; + } tweak1; + + if (is_endian.little) { + unsigned int carry, res; + + res = 0x87 & (((int)tweak.d[3]) >> 31); + carry = (unsigned int)(tweak.u[0] >> 63); + tweak1.u[0] = (tweak.u[0] << 1) ^ res; + tweak1.u[1] = (tweak.u[1] << 1) | carry; + } else { + size_t c; + + for (c = 0, i = 0; i < 16; ++i) { + /* + * + substitutes for |, because c is 1 bit + */ + c += ((size_t)tweak.c[i]) << 1; + tweak1.c[i] = (u8)c; + c = c >> 8; + } + tweak1.c[0] ^= (u8)(0x87 & (0 - c)); + } +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c, inp, 16); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; +#else + scratch.u[0] = ((u64 *)inp)[0] ^ tweak1.u[0]; + scratch.u[1] = ((u64 *)inp)[1] ^ tweak1.u[1]; +#endif + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; + + for (i = 0; i < len; ++i) { + u8 c = inp[16 + i]; + out[16 + i] = scratch.c[i]; + scratch.c[i] = c; + } + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out, scratch.c, 16); +#else + ((u64 *)out)[0] = scratch.u[0] ^ tweak.u[0]; + ((u64 *)out)[1] = scratch.u[1] ^ tweak.u[1]; +#endif + } + + return 0; +} -- cgit v1.2.3
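
The CRYPTO_ocb128_* entry points introduced in ocb128.c above compose into a one-shot AEAD seal roughly as sketched below. This is an illustration, not part of the commit: the helper name aes_ocb_seal and the choice to release a CRYPTO_ocb128_new() context with OPENSSL_free() after CRYPTO_ocb128_cleanup() are assumptions, and the casts of AES_encrypt/AES_decrypt to block128_f mirror what OpenSSL's own EVP glue does rather than anything mandated here.

/*
 * Hypothetical one-shot OCB seal built on the CRYPTO_ocb128_* API added by
 * this patch.  Assumes the OpenSSL 1.1.0 public headers; AES_encrypt and
 * AES_decrypt are cast to block128_f the way the EVP aes-ocb glue does.
 */
#include <openssl/aes.h>
#include <openssl/crypto.h>
#include <openssl/modes.h>

static int aes_ocb_seal(const unsigned char key[16],
                        const unsigned char *iv, size_t ivlen,  /* 1..15 bytes */
                        const unsigned char *aad, size_t aadlen,
                        const unsigned char *pt, size_t ptlen,
                        unsigned char *ct, unsigned char tag[16])
{
    AES_KEY enc, dec;
    OCB128_CONTEXT *octx;
    int ok = 0;

    if (AES_set_encrypt_key(key, 128, &enc) != 0
        || AES_set_decrypt_key(key, 128, &dec) != 0)
        return 0;

    /* The context keeps both schedules; no stream function here (NULL). */
    octx = CRYPTO_ocb128_new(&enc, &dec,
                             (block128_f)AES_encrypt,
                             (block128_f)AES_decrypt, NULL);
    if (octx == NULL)
        return 0;

    if (CRYPTO_ocb128_setiv(octx, iv, ivlen, 16) != 1)   /* 16-byte tag */
        goto done;
    if (aadlen > 0 && CRYPTO_ocb128_aad(octx, aad, aadlen) != 1)
        goto done;
    if (CRYPTO_ocb128_encrypt(octx, pt, ct, ptlen) != 1)
        goto done;
    if (CRYPTO_ocb128_tag(octx, tag, 16) != 1)
        goto done;
    ok = 1;

 done:
    /* cleanup() cleanses the L table and the context; freeing the heap
     * allocation from CRYPTO_ocb128_new() is left to the caller (assumed). */
    CRYPTO_ocb128_cleanup(octx);
    OPENSSL_free(octx);
    return ok;
}

Decryption is symmetrical: CRYPTO_ocb128_decrypt() followed by CRYPTO_ocb128_finish() with the received tag, which returns 0 only when the constant-time comparison succeeds.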
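
For ofb128.c, a minimal wrapper looks like the sketch below (aes_ofb128 is a hypothetical name). Because OFB only ever runs the block cipher forward to generate keystream, the same routine both encrypts and decrypts, and only an encrypt key schedule is needed; ivec and num carry the feedback state between successive calls.

/*
 * Minimal AES-128-OFB sketch around CRYPTO_ofb128_encrypt from ofb128.c.
 * The same call with the same starting IV and num decrypts.
 */
#include <string.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

static void aes_ofb128(const unsigned char key[16],
                       const unsigned char iv[16],
                       const unsigned char *in, unsigned char *out,
                       size_t len)
{
    AES_KEY ks;
    unsigned char ivec[16];
    int num = 0;          /* bytes of the current keystream block already used */

    if (AES_set_encrypt_key(key, 128, &ks) != 0)
        return;           /* 128 is a valid key length, so this should not fail */

    memcpy(ivec, iv, sizeof(ivec));   /* ivec is updated in place across calls */
    CRYPTO_ofb128_encrypt(in, out, len, &ks, ivec, &num,
                          (block128_f)AES_encrypt);
}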
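
The RFC 3394 primitives in wrap128.c can be exercised with a round-trip such as the following sketch; wrap_roundtrip, the all-zero KEK and the sample CEK are illustrative only. Passing NULL for the IV selects the default 0xA6A6A6A6A6A6A6A6 integrity value, wrapping adds 8 bytes to the input, and unwrapping runs the cipher in the inverse direction, hence the AES decrypt schedule.

/*
 * Illustrative RFC 3394 key-wrap round trip using CRYPTO_128_wrap and
 * CRYPTO_128_unwrap with AES-128 as the underlying block cipher.  Buffer
 * sizes follow the doc comments above: wrapped output is inlen + 8 bytes.
 */
#include <string.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

static int wrap_roundtrip(void)
{
    static const unsigned char kek[16] = { 0 };     /* demo KEK, all zeros */
    static const unsigned char cek[16] = {          /* 16-byte key to wrap */
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
    };
    unsigned char wrapped[16 + 8], unwrapped[16];
    AES_KEY wkey, ukey;
    size_t wlen, ulen;

    if (AES_set_encrypt_key(kek, 128, &wkey) != 0
        || AES_set_decrypt_key(kek, 128, &ukey) != 0)
        return 0;

    /* NULL IV selects the default integrity check value from RFC 3394. */
    wlen = CRYPTO_128_wrap(&wkey, NULL, wrapped, cek, sizeof(cek),
                           (block128_f)AES_encrypt);
    if (wlen != sizeof(cek) + 8)
        return 0;

    /* Unwrapping drives the block function backwards: pass the decryptor. */
    ulen = CRYPTO_128_unwrap(&ukey, NULL, unwrapped, wrapped, wlen,
                             (block128_f)AES_decrypt);
    if (ulen != sizeof(cek))
        return 0;

    return memcmp(unwrapped, cek, sizeof(cek)) == 0;
}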
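
Finally, CRYPTO_xts128_encrypt from xts128.c expects a caller-filled XTS128_CONTEXT whose members (key1/block1 for the data blocks, key2/block2 for the tweak) are laid out only in the private modes_lcl.h, so the sketch below assumes it is compiled inside the source tree with that header on the include path, the way the EVP AES code consumes this API. aes_xts_encrypt_sector is a hypothetical helper; for decryption key1/block1 would switch to a decrypt schedule and AES_decrypt while the tweak side stays on the forward cipher.

/*
 * XTS sketch.  Assumes access to the internal modes_lcl.h for the
 * xts128_context layout; key1 encrypts the data units, key2 the tweak.
 */
#include <openssl/aes.h>
#include <openssl/modes.h>
#include "modes_lcl.h"

static int aes_xts_encrypt_sector(const unsigned char key1[16],
                                  const unsigned char key2[16],
                                  const unsigned char tweak[16], /* e.g. sector number */
                                  const unsigned char *in, unsigned char *out,
                                  size_t len)                    /* must be >= 16 */
{
    AES_KEY k1, k2;
    XTS128_CONTEXT xctx;

    if (AES_set_encrypt_key(key1, 128, &k1) != 0
        || AES_set_encrypt_key(key2, 128, &k2) != 0)
        return -1;

    xctx.key1 = &k1;
    xctx.block1 = (block128_f)AES_encrypt;   /* data encryption */
    xctx.key2 = &k2;
    xctx.block2 = (block128_f)AES_encrypt;   /* tweak is always enciphered forward */

    /* Returns 0 on success, -1 if len < 16. */
    return CRYPTO_xts128_encrypt(&xctx, tweak, in, out, len, 1 /* enc */);
}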