From aa4d426b4d3527d7e166df1a05058c9a4a0f6683 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Fri, 30 Apr 2021 00:33:56 +0200 Subject: initial/final commit --- openssl-1.1.0h/crypto/sha/asm/sha1-586.pl | 1488 +++++++++++++ openssl-1.1.0h/crypto/sha/asm/sha1-alpha.pl | 329 +++ openssl-1.1.0h/crypto/sha/asm/sha1-armv4-large.pl | 742 +++++++ openssl-1.1.0h/crypto/sha/asm/sha1-armv8.pl | 363 ++++ openssl-1.1.0h/crypto/sha/asm/sha1-c64xplus.pl | 337 +++ openssl-1.1.0h/crypto/sha/asm/sha1-ia64.pl | 314 +++ openssl-1.1.0h/crypto/sha/asm/sha1-mb-x86_64.pl | 1582 ++++++++++++++ openssl-1.1.0h/crypto/sha/asm/sha1-mips.pl | 457 ++++ openssl-1.1.0h/crypto/sha/asm/sha1-parisc.pl | 267 +++ openssl-1.1.0h/crypto/sha/asm/sha1-ppc.pl | 351 +++ openssl-1.1.0h/crypto/sha/asm/sha1-s390x.pl | 247 +++ openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9.pl | 434 ++++ openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9a.pl | 608 ++++++ openssl-1.1.0h/crypto/sha/asm/sha1-thumb.pl | 266 +++ openssl-1.1.0h/crypto/sha/asm/sha1-x86_64.pl | 2077 ++++++++++++++++++ openssl-1.1.0h/crypto/sha/asm/sha256-586.pl | 1293 +++++++++++ openssl-1.1.0h/crypto/sha/asm/sha256-armv4.pl | 732 +++++++ openssl-1.1.0h/crypto/sha/asm/sha256-c64xplus.pl | 320 +++ openssl-1.1.0h/crypto/sha/asm/sha256-mb-x86_64.pl | 1568 ++++++++++++++ openssl-1.1.0h/crypto/sha/asm/sha512-586.pl | 924 ++++++++ openssl-1.1.0h/crypto/sha/asm/sha512-armv4.pl | 668 ++++++ openssl-1.1.0h/crypto/sha/asm/sha512-armv8.pl | 446 ++++ openssl-1.1.0h/crypto/sha/asm/sha512-c64xplus.pl | 438 ++++ openssl-1.1.0h/crypto/sha/asm/sha512-ia64.pl | 692 ++++++ openssl-1.1.0h/crypto/sha/asm/sha512-mips.pl | 519 +++++ openssl-1.1.0h/crypto/sha/asm/sha512-parisc.pl | 800 +++++++ openssl-1.1.0h/crypto/sha/asm/sha512-ppc.pl | 799 +++++++ openssl-1.1.0h/crypto/sha/asm/sha512-s390x.pl | 322 +++ openssl-1.1.0h/crypto/sha/asm/sha512-sparcv9.pl | 857 ++++++++ openssl-1.1.0h/crypto/sha/asm/sha512-x86_64.pl | 2407 +++++++++++++++++++++ openssl-1.1.0h/crypto/sha/asm/sha512p8-ppc.pl | 431 ++++ 31 files changed, 23078 insertions(+) create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-586.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-alpha.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-armv4-large.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-armv8.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-c64xplus.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-ia64.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-mb-x86_64.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-mips.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-parisc.pl create mode 100755 openssl-1.1.0h/crypto/sha/asm/sha1-ppc.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-s390x.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9a.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha1-thumb.pl create mode 100755 openssl-1.1.0h/crypto/sha/asm/sha1-x86_64.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha256-586.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha256-armv4.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha256-c64xplus.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha256-mb-x86_64.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha512-586.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha512-armv4.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha512-armv8.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha512-c64xplus.pl create mode 100755 
openssl-1.1.0h/crypto/sha/asm/sha512-ia64.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha512-mips.pl create mode 100755 openssl-1.1.0h/crypto/sha/asm/sha512-parisc.pl create mode 100755 openssl-1.1.0h/crypto/sha/asm/sha512-ppc.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha512-s390x.pl create mode 100644 openssl-1.1.0h/crypto/sha/asm/sha512-sparcv9.pl create mode 100755 openssl-1.1.0h/crypto/sha/asm/sha512-x86_64.pl create mode 100755 openssl-1.1.0h/crypto/sha/asm/sha512p8-ppc.pl (limited to 'openssl-1.1.0h/crypto/sha/asm') diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-586.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-586.pl new file mode 100644 index 0000000..5adca23 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-586.pl @@ -0,0 +1,1488 @@ +#! /usr/bin/env perl +# Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# [Re]written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# "[Re]written" was achieved in two major overhauls. In 2004 BODY_* +# functions were re-implemented to address P4 performance issue [see +# commentary below], and in 2006 the rest was rewritten in order to +# gain freedom to liberate licensing terms. + +# January, September 2004. +# +# It was noted that Intel IA-32 C compiler generates code which +# performs ~30% *faster* on P4 CPU than original *hand-coded* +# SHA1 assembler implementation. To address this problem (and +# prove that humans are still better than machines:-), the +# original code was overhauled, which resulted in following +# performance changes: +# +# compared with original compared with Intel cc +# assembler impl. generated code +# Pentium -16% +48% +# PIII/AMD +8% +16% +# P4 +85%(!) +45% +# +# As you can see Pentium came out as looser:-( Yet I reckoned that +# improvement on P4 outweights the loss and incorporate this +# re-tuned code to 0.9.7 and later. +# ---------------------------------------------------------------- +# + +# August 2009. +# +# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as +# '(c&d) + (b&(c^d))', which allows to accumulate partial results +# and lighten "pressure" on scratch registers. This resulted in +# >12% performance improvement on contemporary AMD cores (with no +# degradation on other CPUs:-). Also, the code was revised to maximize +# "distance" between instructions producing input to 'lea' instruction +# and the 'lea' instruction itself, which is essential for Intel Atom +# core and resulted in ~15% improvement. + +# October 2010. +# +# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it +# is to offload message schedule denoted by Wt in NIST specification, +# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel, +# and in SSE2 context was first explored by Dean Gaudet in 2004, see +# http://arctic.org/~dean/crypto/sha1.html. 
Since then several things +# have changed that made it interesting again: +# +# a) XMM units became faster and wider; +# b) instruction set became more versatile; +# c) an important observation was made by Max Locktykhin, which made +# it possible to reduce amount of instructions required to perform +# the operation in question, for further details see +# http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/. + +# April 2011. +# +# Add AVX code path, probably most controversial... The thing is that +# switch to AVX alone improves performance by as little as 4% in +# comparison to SSSE3 code path. But below result doesn't look like +# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as +# pair of µ-ops, and it's the additional µ-ops, two per round, that +# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded +# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with +# equivalent 'sh[rl]d' that is responsible for the impressive 5.1 +# cycles per processed byte. But 'sh[rl]d' is not something that used +# to be fast, nor does it appear to be fast in upcoming Bulldozer +# [according to its optimization manual]. Which is why AVX code path +# is guarded by *both* AVX and synthetic bit denoting Intel CPUs. +# One can argue that it's unfair to AMD, but without 'sh[rl]d' it +# makes no sense to keep the AVX code path. If somebody feels that +# strongly, it's probably more appropriate to discuss possibility of +# using vector rotate XOP on AMD... + +# March 2014. +# +# Add support for Intel SHA Extensions. + +###################################################################### +# Current performance is summarized in following table. Numbers are +# CPU clock cycles spent to process single byte (less is better). +# +# x86 SSSE3 AVX +# Pentium 15.7 - +# PIII 11.5 - +# P4 10.6 - +# AMD K8 7.1 - +# Core2 7.3 6.0/+22% - +# Westmere 7.3 5.5/+33% - +# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% +# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% +# Haswell 6.5 4.3/+51% 4.1(**)/+58% +# Bulldozer 11.6 6.0/+92% +# VIA Nano 10.6 7.5/+41% +# Atom 12.5 9.3(*)/+35% +# Silvermont 14.5 9.9(*)/+46% +# +# (*) Loop is 1056 instructions long and expected result is ~8.25. +# The discrepancy is because of front-end limitations, so +# called MS-ROM penalties, and on Silvermont even rotate's +# limited parallelism. +# +# (**) As per above comment, the result is for AVX *plus* sh[rl]d. 
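The August 2009 note above leans on the identity F_40_59(b,c,d) = (c&d) + (b&(c^d)): the two terms are bit-disjoint, so plain addition stands in for OR and the partial results can be folded into the accumulator in separate steps. A minimal standalone Perl sketch of that check, illustrative only and not part of the generated module (the identity is bitwise, so single-bit inputs cover all cases):

#!/usr/bin/env perl
# Verify F_40_59(b,c,d) = MAJ(b,c,d) = (c&d) + (b&(c^d)), and that the two
# addends never share a set bit, i.e. the '+' can never carry.
use strict;
use warnings;
for my $b (0, 1) {
    for my $c (0, 1) {
        for my $d (0, 1) {
            my $maj   = ($b & $c) | ($b & $d) | ($c & $d);   # textbook majority
            my $split = ($c & $d) + ($b & ($c ^ $d));        # split form from the note
            die "mismatch at b=$b c=$c d=$d" if $maj != $split;
            die "overlap at b=$b c=$c d=$d"  if ($c & $d) & ($b & ($c ^ $d));
        }
    }
}
print "F_40_59 split form verified\n";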
+ +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386"); + +$xmm=$ymm=0; +for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } + +$ymm=1 if ($xmm && + `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/ && + $1>=2.19); # first version supporting AVX + +$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && + $1>=2.03); # first version supporting AVX + +$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" && + `ml 2>&1` =~ /Version ([0-9]+)\./ && + $1>=10); # first version supporting AVX + +$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ && + $2>=3.0); # first version supporting AVX + +$shaext=$xmm; ### set to zero if compiling for 1.0.1 + +&external_label("OPENSSL_ia32cap_P") if ($xmm); + + +$A="eax"; +$B="ebx"; +$C="ecx"; +$D="edx"; +$E="edi"; +$T="esi"; +$tmp1="ebp"; + +@V=($A,$B,$C,$D,$E,$T); + +$alt=0; # 1 denotes alternative IALU implementation, which performs + # 8% *worse* on P4, same on Westmere and Atom, 2% better on + # Sandy Bridge... + +sub BODY_00_15 + { + local($n,$a,$b,$c,$d,$e,$f)=@_; + + &comment("00_15 $n"); + + &mov($f,$c); # f to hold F_00_19(b,c,d) + if ($n==0) { &mov($tmp1,$a); } + else { &mov($a,$tmp1); } + &rotl($tmp1,5); # tmp1=ROTATE(a,5) + &xor($f,$d); + &add($tmp1,$e); # tmp1+=e; + &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded + # with xi, also note that e becomes + # f in next round... + &and($f,$b); + &rotr($b,2); # b=ROTATE(b,30) + &xor($f,$d); # f holds F_00_19(b,c,d) + &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi + + if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round + &add($f,$tmp1); } # f+=tmp1 + else { &add($tmp1,$f); } # f becomes a in next round + &mov($tmp1,$a) if ($alt && $n==15); + } + +sub BODY_16_19 + { + local($n,$a,$b,$c,$d,$e,$f)=@_; + + &comment("16_19 $n"); + +if ($alt) { + &xor($c,$d); + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d + &xor($f,&swtmp(($n+8)%16)); + &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &add($e,$tmp1); # e+=F_00_19(b,c,d) + &xor($c,$d); # restore $c + &mov($tmp1,$a); # b in next round + &rotr($b,$n==16?2:7); # b=ROTATE(b,30) + &mov(&swtmp($n%16),$f); # xi=f + &rotl($a,5); # ROTATE(a,5) + &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$a); # f+=ROTATE(a,5) +} else { + &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &xor($tmp1,$d); + &xor($f,&swtmp(($n+8)%16)); + &and($tmp1,$b); + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) + &add($e,$tmp1); # e+=F_00_19(b,c,d) + &mov($tmp1,$a); + &rotr($b,2); # b=ROTATE(b,30) + &mov(&swtmp($n%16),$f); # xi=f + &rotl($tmp1,5); # ROTATE(a,5) + &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$tmp1); # f+=ROTATE(a,5) +} + } + +sub BODY_20_39 + { + local($n,$a,$b,$c,$d,$e,$f)=@_; + local $K=($n<40)?0x6ed9eba1:0xca62c1d6; + + &comment("20_39 $n"); + +if ($alt) { + &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), 
b^=c + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) + &xor($f,&swtmp(($n+8)%16)); + &add($e,$tmp1); # e+=F_20_39(b,c,d) + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &mov($tmp1,$a); # b in next round + &rotr($b,7); # b=ROTATE(b,30) + &mov(&swtmp($n%16),$f) if($n<77);# xi=f + &rotl($a,5); # ROTATE(a,5) + &xor($b,$c) if($n==39);# warm up for BODY_40_59 + &and($tmp1,$b) if($n==39); + &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY + &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round + &add($f,$a); # f+=ROTATE(a,5) + &rotr($a,5) if ($n==79); +} else { + &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d) + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &xor($tmp1,$c); + &xor($f,&swtmp(($n+8)%16)); + &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &add($e,$tmp1); # e+=F_20_39(b,c,d) + &rotr($b,2); # b=ROTATE(b,30) + &mov($tmp1,$a); + &rotl($tmp1,5); # ROTATE(a,5) + &mov(&swtmp($n%16),$f) if($n<77);# xi=f + &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY + &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round + &add($f,$tmp1); # f+=ROTATE(a,5) +} + } + +sub BODY_40_59 + { + local($n,$a,$b,$c,$d,$e,$f)=@_; + + &comment("40_59 $n"); + +if ($alt) { + &add($e,$tmp1); # e+=b&(c^d) + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &mov($tmp1,$d); + &xor($f,&swtmp(($n+8)%16)); + &xor($c,$d); # restore $c + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &and($tmp1,$c); + &rotr($b,7); # b=ROTATE(b,30) + &add($e,$tmp1); # e+=c&d + &mov($tmp1,$a); # b in next round + &mov(&swtmp($n%16),$f); # xi=f + &rotl($a,5); # ROTATE(a,5) + &xor($b,$c) if ($n<59); + &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d) + &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d)) + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$a); # f+=ROTATE(a,5) +} else { + &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d) + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &xor($tmp1,$d); + &xor($f,&swtmp(($n+8)%16)); + &and($tmp1,$b); + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &add($tmp1,$e); # b&(c^d)+=e + &rotr($b,2); # b=ROTATE(b,30) + &mov($e,$a); # e becomes volatile + &rotl($e,5); # ROTATE(a,5) + &mov(&swtmp($n%16),$f); # xi=f + &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d)) + &mov($tmp1,$c); + &add($f,$e); # f+=ROTATE(a,5) + &and($tmp1,$d); + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$tmp1); # f+=c&d +} + } + +&function_begin("sha1_block_data_order"); +if ($xmm) { + &static_label("shaext_shortcut") if ($shaext); + &static_label("ssse3_shortcut"); + &static_label("avx_shortcut") if ($ymm); + &static_label("K_XX_XX"); + + &call (&label("pic_point")); # make it PIC! 
+ &set_label("pic_point"); + &blindpop($tmp1); + &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point")); + &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); + + &mov ($A,&DWP(0,$T)); + &mov ($D,&DWP(4,$T)); + &test ($D,1<<9); # check SSSE3 bit + &jz (&label("x86")); + &mov ($C,&DWP(8,$T)); + &test ($A,1<<24); # check FXSR bit + &jz (&label("x86")); + if ($shaext) { + &test ($C,1<<29); # check SHA bit + &jnz (&label("shaext_shortcut")); + } + if ($ymm) { + &and ($D,1<<28); # mask AVX bit + &and ($A,1<<30); # mask "Intel CPU" bit + &or ($A,$D); + &cmp ($A,1<<28|1<<30); + &je (&label("avx_shortcut")); + } + &jmp (&label("ssse3_shortcut")); + &set_label("x86",16); +} + &mov($tmp1,&wparam(0)); # SHA_CTX *c + &mov($T,&wparam(1)); # const void *input + &mov($A,&wparam(2)); # size_t num + &stack_push(16+3); # allocate X[16] + &shl($A,6); + &add($A,$T); + &mov(&wparam(2),$A); # pointer beyond the end of input + &mov($E,&DWP(16,$tmp1));# pre-load E + &jmp(&label("loop")); + +&set_label("loop",16); + + # copy input chunk to X, but reversing byte order! + for ($i=0; $i<16; $i+=4) + { + &mov($A,&DWP(4*($i+0),$T)); + &mov($B,&DWP(4*($i+1),$T)); + &mov($C,&DWP(4*($i+2),$T)); + &mov($D,&DWP(4*($i+3),$T)); + &bswap($A); + &bswap($B); + &bswap($C); + &bswap($D); + &mov(&swtmp($i+0),$A); + &mov(&swtmp($i+1),$B); + &mov(&swtmp($i+2),$C); + &mov(&swtmp($i+3),$D); + } + &mov(&wparam(1),$T); # redundant in 1st spin + + &mov($A,&DWP(0,$tmp1)); # load SHA_CTX + &mov($B,&DWP(4,$tmp1)); + &mov($C,&DWP(8,$tmp1)); + &mov($D,&DWP(12,$tmp1)); + # E is pre-loaded + + for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } + for(;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); } + for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } + for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + + (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check + + &mov($tmp1,&wparam(0)); # re-load SHA_CTX* + &mov($D,&wparam(1)); # D is last "T" and is discarded + + &add($E,&DWP(0,$tmp1)); # E is last "A"... + &add($T,&DWP(4,$tmp1)); + &add($A,&DWP(8,$tmp1)); + &add($B,&DWP(12,$tmp1)); + &add($C,&DWP(16,$tmp1)); + + &mov(&DWP(0,$tmp1),$E); # update SHA_CTX + &add($D,64); # advance input pointer + &mov(&DWP(4,$tmp1),$T); + &cmp($D,&wparam(2)); # have we reached the end yet? + &mov(&DWP(8,$tmp1),$A); + &mov($E,$C); # C is last "E" which needs to be "pre-loaded" + &mov(&DWP(12,$tmp1),$B); + &mov($T,$D); # input pointer + &mov(&DWP(16,$tmp1),$C); + &jb(&label("loop")); + + &stack_pop(16+3); +&function_end("sha1_block_data_order"); + +if ($xmm) { +if ($shaext) { +###################################################################### +# Intel SHA Extensions implementation of SHA1 update function. +# +my ($ctx,$inp,$num)=("edi","esi","ecx"); +my ($ABCD,$E,$E_,$BSWAP)=map("xmm$_",(0..3)); +my @MSG=map("xmm$_",(4..7)); + +sub sha1rnds4 { + my ($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x0f,0x3a,0xcc,0xc0|($1<<3)|$2,$imm); } +} +sub sha1op38 { + my ($opcodelet,$dst,$src)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); } +} +sub sha1nexte { sha1op38(0xc8,@_); } +sub sha1msg1 { sha1op38(0xc9,@_); } +sub sha1msg2 { sha1op38(0xca,@_); } + +&function_begin("_sha1_block_data_order_shaext"); + &call (&label("pic_point")); # make it PIC! 
+ &set_label("pic_point"); + &blindpop($tmp1); + &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); +&set_label("shaext_shortcut"); + &mov ($ctx,&wparam(0)); + &mov ("ebx","esp"); + &mov ($inp,&wparam(1)); + &mov ($num,&wparam(2)); + &sub ("esp",32); + + &movdqu ($ABCD,&QWP(0,$ctx)); + &movd ($E,&DWP(16,$ctx)); + &and ("esp",-32); + &movdqa ($BSWAP,&QWP(0x50,$tmp1)); # byte-n-word swap + + &movdqu (@MSG[0],&QWP(0,$inp)); + &pshufd ($ABCD,$ABCD,0b00011011); # flip word order + &movdqu (@MSG[1],&QWP(0x10,$inp)); + &pshufd ($E,$E,0b00011011); # flip word order + &movdqu (@MSG[2],&QWP(0x20,$inp)); + &pshufb (@MSG[0],$BSWAP); + &movdqu (@MSG[3],&QWP(0x30,$inp)); + &pshufb (@MSG[1],$BSWAP); + &pshufb (@MSG[2],$BSWAP); + &pshufb (@MSG[3],$BSWAP); + &jmp (&label("loop_shaext")); + +&set_label("loop_shaext",16); + &dec ($num); + &lea ("eax",&DWP(0x40,$inp)); + &movdqa (&QWP(0,"esp"),$E); # offload $E + &paddd ($E,@MSG[0]); + &cmovne ($inp,"eax"); + &movdqa (&QWP(16,"esp"),$ABCD); # offload $ABCD + +for($i=0;$i<20-4;$i+=2) { + &sha1msg1 (@MSG[0],@MSG[1]); + &movdqa ($E_,$ABCD); + &sha1rnds4 ($ABCD,$E,int($i/5)); # 0-3... + &sha1nexte ($E_,@MSG[1]); + &pxor (@MSG[0],@MSG[2]); + &sha1msg1 (@MSG[1],@MSG[2]); + &sha1msg2 (@MSG[0],@MSG[3]); + + &movdqa ($E,$ABCD); + &sha1rnds4 ($ABCD,$E_,int(($i+1)/5)); + &sha1nexte ($E,@MSG[2]); + &pxor (@MSG[1],@MSG[3]); + &sha1msg2 (@MSG[1],@MSG[0]); + + push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG)); +} + &movdqu (@MSG[0],&QWP(0,$inp)); + &movdqa ($E_,$ABCD); + &sha1rnds4 ($ABCD,$E,3); # 64-67 + &sha1nexte ($E_,@MSG[1]); + &movdqu (@MSG[1],&QWP(0x10,$inp)); + &pshufb (@MSG[0],$BSWAP); + + &movdqa ($E,$ABCD); + &sha1rnds4 ($ABCD,$E_,3); # 68-71 + &sha1nexte ($E,@MSG[2]); + &movdqu (@MSG[2],&QWP(0x20,$inp)); + &pshufb (@MSG[1],$BSWAP); + + &movdqa ($E_,$ABCD); + &sha1rnds4 ($ABCD,$E,3); # 72-75 + &sha1nexte ($E_,@MSG[3]); + &movdqu (@MSG[3],&QWP(0x30,$inp)); + &pshufb (@MSG[2],$BSWAP); + + &movdqa ($E,$ABCD); + &sha1rnds4 ($ABCD,$E_,3); # 76-79 + &movdqa ($E_,&QWP(0,"esp")); + &pshufb (@MSG[3],$BSWAP); + &sha1nexte ($E,$E_); + &paddd ($ABCD,&QWP(16,"esp")); + + &jnz (&label("loop_shaext")); + + &pshufd ($ABCD,$ABCD,0b00011011); + &pshufd ($E,$E,0b00011011); + &movdqu (&QWP(0,$ctx),$ABCD) + &movd (&DWP(16,$ctx),$E); + &mov ("esp","ebx"); +&function_end("_sha1_block_data_order_shaext"); +} +###################################################################### +# The SSSE3 implementation. +# +# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last +# 32 elements of the message schedule or Xupdate outputs. First 4 +# quadruples are simply byte-swapped input, next 4 are calculated +# according to method originally suggested by Dean Gaudet (modulo +# being implemented in SSSE3). Once 8 quadruples or 32 elements are +# collected, it switches to routine proposed by Max Locktyukhin. +# +# Calculations inevitably require temporary reqisters, and there are +# no %xmm registers left to spare. For this reason part of the ring +# buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring +# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] - +# X[-5], and X[4] - X[-4]... +# +# Another notable optimization is aggressive stack frame compression +# aiming to minimize amount of 9-byte instructions... +# +# Yet another notable optimization is "jumping" $B variable. It means +# that there is no register permanently allocated for $B value. This +# allowed to eliminate one instruction from body_20_39... 
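For orientation, the Xupdate_* paths below (SSSE3 and AVX, and likewise the sha1msg1/sha1msg2 pairing in the SHA Extensions code above) all vectorize the same scalar recurrence, W[t] = ROTL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]), kept in a 16-word ring buffer just as the %xmm/stack ring described above does, four words per step. A scalar Perl sketch of one update, illustrative only (names are not taken from this module):

# Scalar model of the SHA-1 message expansion handled by the Xupdate_* subs:
# a 16-entry ring buffer @W indexed mod 16, one 32-bit word produced per call.
use strict;
use warnings;

sub rotl32 { my ($x, $n) = @_; return (($x << $n) | ($x >> (32 - $n))) & 0xffffffff; }

sub xupdate {
    my ($W, $t) = @_;                                  # $W: array ref of 16 words, $t >= 16
    my $x = $W->[($t - 3) % 16] ^ $W->[($t - 8) % 16]
          ^ $W->[($t - 14) % 16] ^ $W->[($t - 16) % 16];
    $W->[$t % 16] = rotl32($x, 1);                     # slot $t%16 held W[t-16], no longer needed
    return $W->[$t % 16];
}

The SIMD code restructures this dependency chain so that four consecutive W values come out of each round of updates, which is also why part of the ring (X[2..4]) has to spill to the stack ring buffer described above.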
+# +my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded +my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 +my @V=($A,$B,$C,$D,$E); +my $j=0; # hash round +my $rx=0; +my @T=($T,$tmp1); +my $inp; + +my $_rol=sub { &rol(@_) }; +my $_ror=sub { &ror(@_) }; + +&function_begin("_sha1_block_data_order_ssse3"); + &call (&label("pic_point")); # make it PIC! + &set_label("pic_point"); + &blindpop($tmp1); + &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); +&set_label("ssse3_shortcut"); + + &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19 + &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39 + &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59 + &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79 + &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask + + &mov ($E,&wparam(0)); # load argument block + &mov ($inp=@T[1],&wparam(1)); + &mov ($D,&wparam(2)); + &mov (@T[0],"esp"); + + # stack frame layout + # + # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area + # X[4]+K X[5]+K X[6]+K X[7]+K + # X[8]+K X[9]+K X[10]+K X[11]+K + # X[12]+K X[13]+K X[14]+K X[15]+K + # + # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area + # X[4] X[5] X[6] X[7] + # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 + # + # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants + # K_40_59 K_40_59 K_40_59 K_40_59 + # K_60_79 K_60_79 K_60_79 K_60_79 + # K_00_19 K_00_19 K_00_19 K_00_19 + # pbswap mask + # + # +192 ctx # argument block + # +196 inp + # +200 end + # +204 esp + &sub ("esp",208); + &and ("esp",-64); + + &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants + &movdqa (&QWP(112+16,"esp"),@X[5]); + &movdqa (&QWP(112+32,"esp"),@X[6]); + &shl ($D,6); # len*64 + &movdqa (&QWP(112+48,"esp"),@X[3]); + &add ($D,$inp); # end of input + &movdqa (&QWP(112+64,"esp"),@X[2]); + &add ($inp,64); + &mov (&DWP(192+0,"esp"),$E); # save argument block + &mov (&DWP(192+4,"esp"),$inp); + &mov (&DWP(192+8,"esp"),$D); + &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp + + &mov ($A,&DWP(0,$E)); # load context + &mov ($B,&DWP(4,$E)); + &mov ($C,&DWP(8,$E)); + &mov ($D,&DWP(12,$E)); + &mov ($E,&DWP(16,$E)); + &mov (@T[0],$B); # magic seed + + &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] + &movdqu (@X[-3&7],&QWP(-48,$inp)); + &movdqu (@X[-2&7],&QWP(-32,$inp)); + &movdqu (@X[-1&7],&QWP(-16,$inp)); + &pshufb (@X[-4&7],@X[2]); # byte swap + &pshufb (@X[-3&7],@X[2]); + &pshufb (@X[-2&7],@X[2]); + &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot + &pshufb (@X[-1&7],@X[2]); + &paddd (@X[-4&7],@X[3]); # add K_00_19 + &paddd (@X[-3&7],@X[3]); + &paddd (@X[-2&7],@X[3]); + &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU + &psubd (@X[-4&7],@X[3]); # restore X[] + &movdqa (&QWP(0+16,"esp"),@X[-3&7]); + &psubd (@X[-3&7],@X[3]); + &movdqa (&QWP(0+32,"esp"),@X[-2&7]); + &mov (@T[1],$C); + &psubd (@X[-2&7],@X[3]); + &xor (@T[1],$D); + &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); + &and (@T[0],@T[1]); + &jmp (&label("loop")); + +###################################################################### +# SSE instruction sequence is first broken to groups of independent +# instructions, independent in respect to their inputs and shifter +# (not all architectures have more than one). Then IALU instructions +# are "knitted in" between the SSE groups. Distance is maintained for +# SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer +# [which allegedly also implements SSSE3]... +# +# Temporary registers usage. X[2] is volatile at the entry and at the +# end is restored from backtrace ring buffer. 
X[3] is expected to +# contain current K_XX_XX constant and is used to calculate X[-1]+K +# from previous round, it becomes volatile the moment the value is +# saved to stack for transfer to IALU. X[4] becomes volatile whenever +# X[-4] is accumulated and offloaded to backtrace ring buffer, at the +# end it is loaded with next K_XX_XX [which becomes X[3] in next +# round]... +# +sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); # ror + eval(shift(@insns)); + eval(shift(@insns)); + &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); + &movdqa (@X[2],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + + &paddd (@X[3],@X[-1&7]); + &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer + eval(shift(@insns)); # rol + eval(shift(@insns)); + &psrldq (@X[2],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); # ror + + &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); # rol + &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (@X[4],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + &movdqa (@X[2],@X[0]); + eval(shift(@insns)); + + &pslldq (@X[4],12); # "X[0]"<<96, extract one dword + &paddd (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@X[2],31); + eval(shift(@insns)); + eval(shift(@insns)); # rol + &movdqa (@X[3],@X[4]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@X[4],30); + eval(shift(@insns)); + eval(shift(@insns)); # ror + &por (@X[0],@X[2]); # "X[0]"<<<=1 + eval(shift(@insns)); + &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); + + &pslld (@X[3],2); + eval(shift(@insns)); + eval(shift(@insns)); # rol + &pxor (@X[0],@X[4]); + &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 + &pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7]) + &pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7); + eval(shift(@insns)); + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xupdate_ssse3_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); # body_20_39 + &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + &punpcklqdq(@X[2],@X[-1&7]); # compose "X[-6]", was &palignr(@X[2],@X[-2&7],8) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] =~ /_rol/); + if ($Xi%5) { + &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... + } else { # ... 
or load next one + &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); + } + eval(shift(@insns)); # ror + &paddd (@X[3],@X[-1&7]); + eval(shift(@insns)); + + &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &movdqa (@X[2],@X[0]); + &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] =~ /_rol/); + + &pslld (@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &psrld (@X[2],30); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[1] =~ /_rol/); + eval(shift(@insns)) if (@insns[0] =~ /_rol/); + + &por (@X[0],@X[2]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + &pshufd (@X[3],@X[-1],0xee) if ($Xi<19); # was &movdqa (@X[3],@X[0]) + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xuplast_ssse3_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[3],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &mov ($inp=@T[1],&DWP(192+4,"esp")); + &cmp ($inp,&DWP(192+8,"esp")); + &je (&label("done")); + + &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19 + &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask + &movdqu (@X[-4&7],&QWP(0,$inp)); # load input + &movdqu (@X[-3&7],&QWP(16,$inp)); + &movdqu (@X[-2&7],&QWP(32,$inp)); + &movdqu (@X[-1&7],&QWP(48,$inp)); + &add ($inp,64); + &pshufb (@X[-4&7],@X[2]); # byte swap + &mov (&DWP(192+4,"esp"),$inp); + &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot + + $Xi=0; +} + +sub Xloop_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufb (@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[($Xi-4)&7],@X[3]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &psubd (@X[($Xi-4)&7],@X[3]); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +sub body_00_19 () { # ((c^d)&b)^d + # on start @T[0]=(c^d)&b + return &body_20_39() if 
($rx==19); $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. + '&$_ror ($b,$j?7:2);', # $b>>>2 + '&xor (@T[0],$d);', + '&mov (@T[1],$a);', # $b in next round + + '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer + '&xor ($b,$c);', # $c^$d for next round + + '&$_rol ($a,5);', + '&add ($e,@T[0]);', + '&and (@T[1],$b);', # ($b&($c^$d)) for next round + + '&xor ($b,$c);', # restore $b + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub body_20_39 () { # b^d^c + # on entry @T[0]=b^d + return &body_40_59() if ($rx==39); $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer + '&xor (@T[0],$d) if($j==19);'. + '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c) + '&mov (@T[1],$a);', # $b in next round + + '&$_rol ($a,5);', + '&add ($e,@T[0]);', + '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round + + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub body_40_59 () { # ((b^c)&(c^d))^c + # on entry @T[0]=(b^c), (c^=d) + $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer + '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d) + '&xor ($c,$d) if ($j>=40);', # restore $c + + '&$_ror ($b,7);', # $b>>>2 + '&mov (@T[1],$a);', # $b for next round + '&xor (@T[0],$c);', + + '&$_rol ($a,5);', + '&add ($e,@T[0]);', + '&xor (@T[1],$c) if ($j==59);'. + '&xor (@T[1],$b) if ($j< 59);', # b^c for next round + + '&xor ($b,$c) if ($j< 59);', # c^d for next round + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} +###### +sub bodyx_00_19 () { # ((c^d)&b)^d + # on start @T[0]=(b&c)^(~b&d), $e+=X[]+K + return &bodyx_20_39() if ($rx==19); $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. + + '&rorx ($b,$b,2) if ($j==0);'. # $b>>>2 + '&rorx ($b,@T[1],7) if ($j!=0);', # $b>>>2 + '&lea ($e,&DWP(0,$e,@T[0]));', + '&rorx (@T[0],$a,5);', + + '&andn (@T[1],$a,$c);', + '&and ($a,$b)', + '&add ($d,&DWP(4*(($j+1)&15),"esp"));', # X[]+K xfer + + '&xor (@T[1],$a)', + '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub bodyx_20_39 () { # b^d^c + # on start $b=b^c^d + return &bodyx_40_59() if ($rx==39); $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. + + '&add ($e,($j==19?@T[0]:$b))', + '&rorx ($b,@T[1],7);', # $b>>>2 + '&rorx (@T[0],$a,5);', + + '&xor ($a,$b) if ($j<79);', + '&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);', # X[]+K xfer + '&xor ($a,$c) if ($j<79);', + '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub bodyx_40_59 () { # ((b^c)&(c^d))^c + # on start $b=((b^c)&(c^d))^c + return &bodyx_20_39() if ($rx==59); $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. 
+ + '&rorx (@T[0],$a,5)', + '&lea ($e,&DWP(0,$e,$b))', + '&rorx ($b,@T[1],7)', # $b>>>2 + '&add ($d,&DWP(4*(($j+1)&15),"esp"))', # X[]+K xfer + + '&mov (@T[1],$c)', + '&xor ($a,$b)', # b^c for next round + '&xor (@T[1],$b)', # c^d for next round + + '&and ($a,@T[1])', + '&add ($e,@T[0])', + '&xor ($a,$b)' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +&set_label("loop",16); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + + &mov (@T[1],&DWP(192,"esp")); # update context + &add ($A,&DWP(0,@T[1])); + &add (@T[0],&DWP(4,@T[1])); # $b + &add ($C,&DWP(8,@T[1])); + &mov (&DWP(0,@T[1]),$A); + &add ($D,&DWP(12,@T[1])); + &mov (&DWP(4,@T[1]),@T[0]); + &add ($E,&DWP(16,@T[1])); + &mov (&DWP(8,@T[1]),$C); + &mov ($B,$C); + &mov (&DWP(12,@T[1]),$D); + &xor ($B,$D); + &mov (&DWP(16,@T[1]),$E); + &mov (@T[1],@T[0]); + &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); + &and (@T[0],$B); + &mov ($B,$T[1]); + + &jmp (&label("loop")); + +&set_label("done",16); $j=$saved_j; @V=@saved_V; + + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + + &mov (@T[1],&DWP(192,"esp")); # update context + &add ($A,&DWP(0,@T[1])); + &mov ("esp",&DWP(192+12,"esp")); # restore %esp + &add (@T[0],&DWP(4,@T[1])); # $b + &add ($C,&DWP(8,@T[1])); + &mov (&DWP(0,@T[1]),$A); + &add ($D,&DWP(12,@T[1])); + &mov (&DWP(4,@T[1]),@T[0]); + &add ($E,&DWP(16,@T[1])); + &mov (&DWP(8,@T[1]),$C); + &mov (&DWP(12,@T[1]),$D); + &mov (&DWP(16,@T[1]),$E); + +&function_end("_sha1_block_data_order_ssse3"); + +$rx=0; # reset + +if ($ymm) { +my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded +my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 +my @V=($A,$B,$C,$D,$E); +my $j=0; # hash round +my @T=($T,$tmp1); +my $inp; + +my $_rol=sub { &shld(@_[0],@_) }; +my $_ror=sub { &shrd(@_[0],@_) }; + +&function_begin("_sha1_block_data_order_avx"); + &call (&label("pic_point")); # make it PIC! 
+ &set_label("pic_point"); + &blindpop($tmp1); + &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); +&set_label("avx_shortcut"); + &vzeroall(); + + &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19 + &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39 + &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59 + &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79 + &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask + + &mov ($E,&wparam(0)); # load argument block + &mov ($inp=@T[1],&wparam(1)); + &mov ($D,&wparam(2)); + &mov (@T[0],"esp"); + + # stack frame layout + # + # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area + # X[4]+K X[5]+K X[6]+K X[7]+K + # X[8]+K X[9]+K X[10]+K X[11]+K + # X[12]+K X[13]+K X[14]+K X[15]+K + # + # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area + # X[4] X[5] X[6] X[7] + # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 + # + # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants + # K_40_59 K_40_59 K_40_59 K_40_59 + # K_60_79 K_60_79 K_60_79 K_60_79 + # K_00_19 K_00_19 K_00_19 K_00_19 + # pbswap mask + # + # +192 ctx # argument block + # +196 inp + # +200 end + # +204 esp + &sub ("esp",208); + &and ("esp",-64); + + &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants + &vmovdqa(&QWP(112+16,"esp"),@X[5]); + &vmovdqa(&QWP(112+32,"esp"),@X[6]); + &shl ($D,6); # len*64 + &vmovdqa(&QWP(112+48,"esp"),@X[3]); + &add ($D,$inp); # end of input + &vmovdqa(&QWP(112+64,"esp"),@X[2]); + &add ($inp,64); + &mov (&DWP(192+0,"esp"),$E); # save argument block + &mov (&DWP(192+4,"esp"),$inp); + &mov (&DWP(192+8,"esp"),$D); + &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp + + &mov ($A,&DWP(0,$E)); # load context + &mov ($B,&DWP(4,$E)); + &mov ($C,&DWP(8,$E)); + &mov ($D,&DWP(12,$E)); + &mov ($E,&DWP(16,$E)); + &mov (@T[0],$B); # magic seed + + &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] + &vmovdqu(@X[-3&7],&QWP(-48,$inp)); + &vmovdqu(@X[-2&7],&QWP(-32,$inp)); + &vmovdqu(@X[-1&7],&QWP(-16,$inp)); + &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap + &vpshufb(@X[-3&7],@X[-3&7],@X[2]); + &vpshufb(@X[-2&7],@X[-2&7],@X[2]); + &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot + &vpshufb(@X[-1&7],@X[-1&7],@X[2]); + &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19 + &vpaddd (@X[1],@X[-3&7],@X[3]); + &vpaddd (@X[2],@X[-2&7],@X[3]); + &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU + &mov (@T[1],$C); + &vmovdqa(&QWP(0+16,"esp"),@X[1]); + &xor (@T[1],$D); + &vmovdqa(&QWP(0+32,"esp"),@X[2]); + &and (@T[0],@T[1]); + &jmp (&label("loop")); + +sub Xupdate_avx_16_31() # recall that $Xi starts with 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpaddd (@X[3],@X[3],@X[-1&7]); + &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); 
+ + &vpsrld (@X[2],@X[0],31); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword + &vpaddd (@X[0],@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@X[3],@X[4],30); + &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslld (@X[4],@X[4],2); + &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor (@X[0],@X[0],@X[3]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2 + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xupdate_avx_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions + my ($a,$b,$c,$d,$e); + + &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]" + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); + if ($Xi%5) { + &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... + } else { # ... or load next one + &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); + } + &vpaddd (@X[3],@X[3],@X[-1&7]); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpsrld (@X[2],@X[0],30); + &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpslld (@X[0],@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xuplast_avx_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &vpaddd (@X[3],@X[3],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &mov ($inp=@T[1],&DWP(192+4,"esp")); + &cmp ($inp,&DWP(192+8,"esp")); + &je (&label("done")); + + &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19 + 
&vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask + &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input + &vmovdqu(@X[-3&7],&QWP(16,$inp)); + &vmovdqu(@X[-2&7],&QWP(32,$inp)); + &vmovdqu(@X[-1&7],&QWP(48,$inp)); + &add ($inp,64); + &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap + &mov (&DWP(192+4,"esp"),$inp); + &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot + + $Xi=0; +} + +sub Xloop_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +&set_label("loop",16); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_32_79(\&body_00_19); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_20_39); + &Xuplast_avx_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + + &mov (@T[1],&DWP(192,"esp")); # update context + &add ($A,&DWP(0,@T[1])); + &add (@T[0],&DWP(4,@T[1])); # $b + &add ($C,&DWP(8,@T[1])); + &mov (&DWP(0,@T[1]),$A); + &add ($D,&DWP(12,@T[1])); + &mov (&DWP(4,@T[1]),@T[0]); + &add ($E,&DWP(16,@T[1])); + &mov ($B,$C); + &mov (&DWP(8,@T[1]),$C); + &xor ($B,$D); + &mov (&DWP(12,@T[1]),$D); + &mov (&DWP(16,@T[1]),$E); + &mov (@T[1],@T[0]); + &and (@T[0],$B); + &mov ($B,@T[1]); + + &jmp (&label("loop")); + +&set_label("done",16); $j=$saved_j; @V=@saved_V; + + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + + &vzeroall(); + + &mov (@T[1],&DWP(192,"esp")); # update context + &add ($A,&DWP(0,@T[1])); + &mov ("esp",&DWP(192+12,"esp")); # restore %esp + &add (@T[0],&DWP(4,@T[1])); # $b + &add ($C,&DWP(8,@T[1])); + &mov (&DWP(0,@T[1]),$A); + &add ($D,&DWP(12,@T[1])); + &mov (&DWP(4,@T[1]),@T[0]); + &add ($E,&DWP(16,@T[1])); + &mov (&DWP(8,@T[1]),$C); + &mov (&DWP(12,@T[1]),$D); + &mov (&DWP(16,@T[1]),$E); +&function_end("_sha1_block_data_order_avx"); +} +&set_label("K_XX_XX",64); +&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19 +&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39 +&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59 +&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79 +&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask +&data_byte(0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0); +} +&asciz("SHA1 block transform for x86, CRYPTOGAMS by "); + +&asm_finish(); + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-alpha.pl 
b/openssl-1.1.0h/crypto/sha/asm/sha1-alpha.pl new file mode 100644 index 0000000..4124958 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-alpha.pl @@ -0,0 +1,329 @@ +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for Alpha. + +# On 21264 performance is 33% better than code generated by vendor +# compiler, and 75% better than GCC [3.4], and in absolute terms is +# 8.7 cycles per processed byte. Implementation features vectorized +# byte swap, but not Xupdate. + +@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", + "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15"); +$ctx="a0"; # $16 +$inp="a1"; +$num="a2"; +$A="a3"; +$B="a4"; # 20 +$C="a5"; +$D="t8"; +$E="t9"; @V=($A,$B,$C,$D,$E); +$t0="t10"; # 24 +$t1="t11"; +$t2="ra"; +$t3="t12"; +$K="AT"; # 28 + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i==0); + ldq_u @X[0],0+0($inp) + ldq_u @X[1],0+7($inp) +___ +$code.=<<___ if (!($i&1) && $i<14); + ldq_u @X[$i+2],($i+2)*4+0($inp) + ldq_u @X[$i+3],($i+2)*4+7($inp) +___ +$code.=<<___ if (!($i&1) && $i<15); + extql @X[$i],$inp,@X[$i] + extqh @X[$i+1],$inp,@X[$i+1] + + or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched + + srl @X[$i],24,$t0 # vectorized byte swap + srl @X[$i],8,$t2 + + sll @X[$i],8,$t3 + sll @X[$i],24,@X[$i] + zapnot $t0,0x11,$t0 + zapnot $t2,0x22,$t2 + + zapnot @X[$i],0x88,@X[$i] + or $t0,$t2,$t0 + zapnot $t3,0x44,$t3 + sll $a,5,$t1 + + or @X[$i],$t0,@X[$i] + addl $K,$e,$e + and $b,$c,$t2 + zapnot $a,0xf,$a + + or @X[$i],$t3,@X[$i] + srl $a,27,$t0 + bic $d,$b,$t3 + sll $b,30,$b + + extll @X[$i],4,@X[$i+1] # extract upper half + or $t2,$t3,$t2 + addl @X[$i],$e,$e + + addl $t1,$e,$e + srl $b,32,$t3 + zapnot @X[$i],0xf,@X[$i] + + addl $t0,$e,$e + addl $t2,$e,$e + or $t3,$b,$b +___ +$code.=<<___ if (($i&1) && $i<15); + sll $a,5,$t1 + addl $K,$e,$e + and $b,$c,$t2 + zapnot $a,0xf,$a + + srl $a,27,$t0 + addl @X[$i%16],$e,$e + bic $d,$b,$t3 + sll $b,30,$b + + or $t2,$t3,$t2 + addl $t1,$e,$e + srl $b,32,$t3 + zapnot @X[$i],0xf,@X[$i] + + addl $t0,$e,$e + addl $t2,$e,$e + or $t3,$b,$b +___ +$code.=<<___ if ($i>=15); # with forward Xupdate + sll $a,5,$t1 + addl $K,$e,$e + and $b,$c,$t2 + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + + zapnot $a,0xf,$a + addl @X[$i%16],$e,$e + bic $d,$b,$t3 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + + srl $a,27,$t0 + addl $t1,$e,$e + or $t2,$t3,$t2 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + + sll $b,30,$b + addl $t0,$e,$e + srl @X[$j%16],31,$t1 + + addl $t2,$e,$e + srl $b,32,$t3 + addl @X[$j%16],@X[$j%16],@X[$j%16] + + or $t3,$b,$b + zapnot @X[$i%16],0xf,@X[$i%16] + or $t1,@X[$j%16],@X[$j%16] +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); # with forward Xupdate + sll $a,5,$t1 + addl $K,$e,$e + zapnot $a,0xf,$a + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + + 
sll $b,30,$t3 + addl $t1,$e,$e + xor $b,$c,$t2 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + + srl $b,2,$b + addl @X[$i%16],$e,$e + xor $d,$t2,$t2 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + + srl @X[$j%16],31,$t1 + addl $t2,$e,$e + srl $a,27,$t0 + addl @X[$j%16],@X[$j%16],@X[$j%16] + + or $t3,$b,$b + addl $t0,$e,$e + or $t1,@X[$j%16],@X[$j%16] +___ +$code.=<<___ if ($i<77); + zapnot @X[$i%16],0xf,@X[$i%16] +___ +$code.=<<___ if ($i==79); # with context fetch + sll $a,5,$t1 + addl $K,$e,$e + zapnot $a,0xf,$a + ldl @X[0],0($ctx) + + sll $b,30,$t3 + addl $t1,$e,$e + xor $b,$c,$t2 + ldl @X[1],4($ctx) + + srl $b,2,$b + addl @X[$i%16],$e,$e + xor $d,$t2,$t2 + ldl @X[2],8($ctx) + + srl $a,27,$t0 + addl $t2,$e,$e + ldl @X[3],12($ctx) + + or $t3,$b,$b + addl $t0,$e,$e + ldl @X[4],16($ctx) +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___; # with forward Xupdate + sll $a,5,$t1 + addl $K,$e,$e + zapnot $a,0xf,$a + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + + srl $a,27,$t0 + and $b,$c,$t2 + and $b,$d,$t3 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + + sll $b,30,$b + addl $t1,$e,$e + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + + srl @X[$j%16],31,$t1 + addl $t0,$e,$e + or $t2,$t3,$t2 + and $c,$d,$t3 + + or $t2,$t3,$t2 + srl $b,32,$t3 + addl @X[$i%16],$e,$e + addl @X[$j%16],@X[$j%16],@X[$j%16] + + or $t3,$b,$b + addl $t2,$e,$e + or $t1,@X[$j%16],@X[$j%16] + zapnot @X[$i%16],0xf,@X[$i%16] +___ +} + +$code=<<___; +#ifdef __linux__ +#include +#else +#include +#include +#endif + +.text + +.set noat +.set noreorder +.globl sha1_block_data_order +.align 5 +.ent sha1_block_data_order +sha1_block_data_order: + lda sp,-64(sp) + stq ra,0(sp) + stq s0,8(sp) + stq s1,16(sp) + stq s2,24(sp) + stq s3,32(sp) + stq s4,40(sp) + stq s5,48(sp) + stq fp,56(sp) + .mask 0x0400fe00,-64 + .frame sp,64,ra + .prologue 0 + + ldl $A,0($ctx) + ldl $B,4($ctx) + sll $num,6,$num + ldl $C,8($ctx) + ldl $D,12($ctx) + ldl $E,16($ctx) + addq $inp,$num,$num + +.Lloop: + .set noreorder + ldah $K,23170(zero) + zapnot $B,0xf,$B + lda $K,31129($K) # K_00_19 +___ +for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + ldah $K,28378(zero) + lda $K,-5215($K) # K_20_39 +___ +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + ldah $K,-28900(zero) + lda $K,-17188($K) # K_40_59 +___ +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + ldah $K,-13725(zero) + lda $K,-15914($K) # K_60_79 +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + addl @X[0],$A,$A + addl @X[1],$B,$B + addl @X[2],$C,$C + addl @X[3],$D,$D + addl @X[4],$E,$E + stl $A,0($ctx) + stl $B,4($ctx) + addq $inp,64,$inp + stl $C,8($ctx) + stl $D,12($ctx) + stl $E,16($ctx) + cmpult $inp,$num,$t1 + bne $t1,.Lloop + + .set noreorder + ldq ra,0(sp) + ldq s0,8(sp) + ldq s1,16(sp) + ldq s2,24(sp) + ldq s3,32(sp) + ldq s4,40(sp) + ldq s5,48(sp) + ldq fp,56(sp) + lda sp,64(sp) + ret (ra) +.end sha1_block_data_order +.ascii "SHA1 block transform for Alpha, CRYPTOGAMS by " +.align 2 +___ +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-armv4-large.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-armv4-large.pl new file mode 100644 index 0000000..7ff5bfb --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-armv4-large.pl @@ -0,0 +1,742 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). 
You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# sha1_block procedure for ARMv4. +# +# January 2007. + +# Size/performance trade-off +# ==================================================================== +# impl size in bytes comp cycles[*] measured performance +# ==================================================================== +# thumb 304 3212 4420 +# armv4-small 392/+29% 1958/+64% 2250/+96% +# armv4-compact 740/+89% 1552/+26% 1840/+22% +# armv4-large 1420/+92% 1307/+19% 1370/+34%[***] +# full unroll ~5100/+260% ~1260/+4% ~1300/+5% +# ==================================================================== +# thumb = same as 'small' but in Thumb instructions[**] and +# with recurring code in two private functions; +# small = detached Xload/update, loops are folded; +# compact = detached Xload/update, 5x unroll; +# large = interleaved Xload/update, 5x unroll; +# full unroll = interleaved Xload/update, full unroll, estimated[!]; +# +# [*] Manually counted instructions in "grand" loop body. Measured +# performance is affected by prologue and epilogue overhead, +# i-cache availability, branch penalties, etc. +# [**] While each Thumb instruction is twice smaller, they are not as +# diverse as ARM ones: e.g., there are only two arithmetic +# instructions with 3 arguments, no [fixed] rotate, addressing +# modes are limited. As result it takes more instructions to do +# the same job in Thumb, therefore the code is never twice as +# small and always slower. +# [***] which is also ~35% better than compiler generated code. Dual- +# issue Cortex A8 core was measured to process input block in +# ~990 cycles. + +# August 2010. +# +# Rescheduling for dual-issue pipeline resulted in 13% improvement on +# Cortex A8 core and in absolute terms ~870 cycles per input block +# [or 13.6 cycles per byte]. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 10% +# improvement on Cortex A8 core and 12.2 cycles per byte. + +# September 2013. +# +# Add NEON implementation (see sha1-586.pl for background info). On +# Cortex A8 it was measured to process one byte in 6.7 cycles or >80% +# faster than integer-only code. Because [fully unrolled] NEON code +# is ~2.5x larger and there are some redundant instructions executed +# when processing last block, improvement is not as big for smallest +# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per +# byte, which is also >80% faster than integer-only code. Cortex-A15 +# is even faster spending 5.6 cycles per byte outperforming integer- +# only code by factor of 2. + +# May 2014. +# +# Add ARMv8 code path performing at 2.35 cpb on Apple A7. 
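One detail worth noting before the code: the integer rounds below use the usual rewrite of the F_00_19 choose function, ((c ^ d) & b) ^ d, which is bit-for-bit equivalent to (b & c) | (~b & d) but needs one temporary fewer (the same rewrite the x86 body_00_19 above is built around). A standalone Perl check of the equivalence, illustrative only:

# Check ((c^d)&b)^d == (b&c)|(~b&d); bitwise, so single-bit inputs cover all cases.
use strict;
use warnings;
for my $b (0, 1) {
    for my $c (0, 1) {
        for my $d (0, 1) {
            my $folded = (($c ^ $d) & $b) ^ $d;
            my $choose = ($b & $c) | ((1 - $b) & $d);   # (1 - $b) plays the role of ~b for a single bit
            die "mismatch at b=$b c=$c d=$d" if $folded != $choose;
        }
    }
}
print "F_00_19 rewrite verified\n";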
+ +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$ctx="r0"; +$inp="r1"; +$len="r2"; +$a="r3"; +$b="r4"; +$c="r5"; +$d="r6"; +$e="r7"; +$K="r8"; +$t0="r9"; +$t1="r10"; +$t2="r11"; +$t3="r12"; +$Xi="r14"; +@V=($a,$b,$c,$d,$e); + +sub Xupdate { +my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_; +$code.=<<___; + ldr $t0,[$Xi,#15*4] + ldr $t1,[$Xi,#13*4] + ldr $t2,[$Xi,#7*4] + add $e,$K,$e,ror#2 @ E+=K_xx_xx + ldr $t3,[$Xi,#2*4] + eor $t0,$t0,$t1 + eor $t2,$t2,$t3 @ 1 cycle stall + eor $t1,$c,$d @ F_xx_xx + mov $t0,$t0,ror#31 + add $e,$e,$a,ror#27 @ E+=ROR(A,27) + eor $t0,$t0,$t2,ror#31 + str $t0,[$Xi,#-4]! + $opt1 @ F_xx_xx + $opt2 @ F_xx_xx + add $e,$e,$t0 @ E+=X[i] +___ +} + +sub BODY_00_15 { +my ($a,$b,$c,$d,$e)=@_; +$code.=<<___; +#if __ARM_ARCH__<7 + ldrb $t1,[$inp,#2] + ldrb $t0,[$inp,#3] + ldrb $t2,[$inp,#1] + add $e,$K,$e,ror#2 @ E+=K_00_19 + ldrb $t3,[$inp],#4 + orr $t0,$t0,$t1,lsl#8 + eor $t1,$c,$d @ F_xx_xx + orr $t0,$t0,$t2,lsl#16 + add $e,$e,$a,ror#27 @ E+=ROR(A,27) + orr $t0,$t0,$t3,lsl#24 +#else + ldr $t0,[$inp],#4 @ handles unaligned + add $e,$K,$e,ror#2 @ E+=K_00_19 + eor $t1,$c,$d @ F_xx_xx + add $e,$e,$a,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev $t0,$t0 @ byte swap +#endif +#endif + and $t1,$b,$t1,ror#2 + add $e,$e,$t0 @ E+=X[i] + eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) + str $t0,[$Xi,#-4]! + add $e,$e,$t1 @ E+=F_00_19(B,C,D) +___ +} + +sub BODY_16_19 { +my ($a,$b,$c,$d,$e)=@_; + &Xupdate(@_,"and $t1,$b,$t1,ror#2"); +$code.=<<___; + eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) + add $e,$e,$t1 @ E+=F_00_19(B,C,D) +___ +} + +sub BODY_20_39 { +my ($a,$b,$c,$d,$e)=@_; + &Xupdate(@_,"eor $t1,$b,$t1,ror#2"); +$code.=<<___; + add $e,$e,$t1 @ E+=F_20_39(B,C,D) +___ +} + +sub BODY_40_59 { +my ($a,$b,$c,$d,$e)=@_; + &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d"); +$code.=<<___; + add $e,$e,$t1 @ E+=F_40_59(B,C,D) + add $e,$e,$t2,ror#2 +___ +} + +$code=<<___; +#include "arm_arch.h" + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.global sha1_block_data_order +.type sha1_block_data_order,%function + +.align 5 +sha1_block_data_order: +#if __ARM_MAX_ARCH__>=7 +.Lsha1_block: + adr r3,.Lsha1_block + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV8_SHA1 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + stmdb sp!,{r4-r12,lr} + add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp + ldmia $ctx,{$a,$b,$c,$d,$e} +.Lloop: + ldr $K,.LK_00_19 + mov $Xi,sp + sub sp,sp,#15*4 + mov $c,$c,ror#30 + mov $d,$d,ror#30 + mov $e,$e,ror#30 @ [6] +.L_00_15: +___ +for($i=0;$i<5;$i++) { + &BODY_00_15(@V); unshift(@V,pop(@V)); +} +$code.=<<___; +#if defined(__thumb2__) + mov $t3,sp + teq $Xi,$t3 +#else + teq $Xi,sp +#endif + bne .L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 +___ + &BODY_00_15(@V); unshift(@V,pop(@V)); + &BODY_16_19(@V); unshift(@V,pop(@V)); + &BODY_16_19(@V); unshift(@V,pop(@V)); + &BODY_16_19(@V); unshift(@V,pop(@V)); + &BODY_16_19(@V); unshift(@V,pop(@V)); +$code.=<<___; + + ldr $K,.LK_20_39 @ [+15+16*4] + cmn sp,#0 @ [+3], clear carry to denote 20_39 
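 @ rounds 20-39 and 60-79 share the loop below, as both use F(b,c,d)=b^c^d;
 @ the carry flag tells the passes apart: "cmn sp,#0" above clears it for
 @ 20-39, "cmp sp,#0" further down sets it for 60-79, the teq inside the
 @ loop preserves it, and "bcs .L_done" exits only after the 60-79 pass.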
+.L_20_39_or_60_79: +___ +for($i=0;$i<5;$i++) { + &BODY_20_39(@V); unshift(@V,pop(@V)); +} +$code.=<<___; +#if defined(__thumb2__) + mov $t3,sp + teq $Xi,$t3 +#else + teq $Xi,sp @ preserve carry +#endif + bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr $K,.LK_40_59 + sub sp,sp,#20*4 @ [+2] +.L_40_59: +___ +for($i=0;$i<5;$i++) { + &BODY_40_59(@V); unshift(@V,pop(@V)); +} +$code.=<<___; +#if defined(__thumb2__) + mov $t3,sp + teq $Xi,$t3 +#else + teq $Xi,sp +#endif + bne .L_40_59 @ [+((12+5)*5+2)*4] + + ldr $K,.LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b .L_20_39_or_60_79 @ [+4], spare 300 bytes +.L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia $ctx,{$K,$t0,$t1,$t2,$t3} + add $a,$K,$a + add $b,$t0,$b + add $c,$t1,$c,ror#2 + add $d,$t2,$d,ror#2 + add $e,$t3,$e,ror#2 + stmia $ctx,{$a,$b,$c,$d,$e} + teq $inp,$len + bne .Lloop @ [+18], total 1307 + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha1_block_data_order,.-sha1_block_data_order + +.align 5 +.LK_00_19: .word 0x5a827999 +.LK_20_39: .word 0x6ed9eba1 +.LK_40_59: .word 0x8f1bbcdc +.LK_60_79: .word 0xca62c1d6 +#if __ARM_MAX_ARCH__>=7 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lsha1_block +#endif +.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " +.align 5 +___ +##################################################################### +# NEON stuff +# +{{{ +my @V=($a,$b,$c,$d,$e); +my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14)); +my $Xi=4; +my @X=map("q$_",(8..11,0..3)); +my @Tx=("q12","q13"); +my ($K,$zero)=("q14","q15"); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub body_00_19 () { + ( + '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'. + '&bic ($t0,$d,$b)', + '&add ($e,$e,$Ki)', # e+=X[i]+K + '&and ($t1,$c,$b)', + '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))', + '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27) + '&eor ($t1,$t1,$t0)', # F_00_19 + '&mov ($b,$b,"ror#2")', # b=ROR(b,2) + '&add ($e,$e,$t1);'. # e+=F_00_19 + '$j++; unshift(@V,pop(@V));' + ) +} +sub body_20_39 () { + ( + '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'. + '&eor ($t0,$b,$d)', + '&add ($e,$e,$Ki)', # e+=X[i]+K + '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)', + '&eor ($t1,$t0,$c)', # F_20_39 + '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27) + '&mov ($b,$b,"ror#2")', # b=ROR(b,2) + '&add ($e,$e,$t1);'. # e+=F_20_39 + '$j++; unshift(@V,pop(@V));' + ) +} +sub body_40_59 () { + ( + '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'. + '&add ($e,$e,$Ki)', # e+=X[i]+K + '&and ($t0,$c,$d)', + '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))', + '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27) + '&eor ($t1,$c,$d)', + '&add ($e,$e,$t0)', + '&and ($t1,$t1,$b)', + '&mov ($b,$b,"ror#2")', # b=ROR(b,2) + '&add ($e,$e,$t1);'. 
# e+=F_40_59 + '$j++; unshift(@V,pop(@V));' + ) +} + +sub Xupdate_16_31 () +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e); + + &vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@Tx[1],@X[-1&7],$K); + eval(shift(@insns)); + &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0); + eval(shift(@insns)); + &vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8] + eval(shift(@insns)); + eval(shift(@insns)); + &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer + &sub ($Xfer,$Xfer,64) if ($Xi%4==0); + eval(shift(@insns)); + eval(shift(@insns)); + &vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@Tx[0],@Tx[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 (@Tx[0],@Tx[1],30); + eval(shift(@insns)); + eval(shift(@insns)); + &vshl_u32 (@Tx[1],@Tx[1],2); + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@X[0],@X[0],@Tx[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xupdate_32_79 () +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e); + + &vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + &veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@Tx[1],@X[-1&7],$K); + eval(shift(@insns)); + &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0); + eval(shift(@insns)); + &veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 (@X[0],@Tx[0],30); + eval(shift(@insns)); + eval(shift(@insns)); + &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer + &sub ($Xfer,$Xfer,64) if ($Xi%4==0); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2 + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xuplast_80 () +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e); + + &vadd_i32 (@Tx[1],@X[-1&7],$K); + eval(shift(@insns)); + eval(shift(@insns)); + &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); + &sub ($Xfer,$Xfer,64); + + &teq ($inp,$len); + &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX + &it ("eq"); + &subeq ($inp,$inp,64); # reload last block to avoid SEGV + &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19 + eval(shift(@insns)); + eval(shift(@insns)); + &vrev32_8 (@X[-4&7],@X[-4&7]); + + foreach (@insns) { 
eval; } # remaining instructions + + $Xi=0; +} + +sub Xloop() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e); + + &vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K); + eval(shift(@insns)); + eval(shift(@insns)); + &vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU + + foreach (@insns) { eval; } + + $Xi++; +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.type sha1_block_data_order_neon,%function +.align 4 +sha1_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp + @ dmb @ errata #451034 on early Cortex A8 + @ vstmdb sp!,{d8-d15} @ ABI specification says so + mov $saved_sp,sp + sub $Xfer,sp,#64 + adr $K_XX_XX,.LK_00_19 + bic $Xfer,$Xfer,#15 @ align for 128-bit stores + + ldmia $ctx,{$a,$b,$c,$d,$e} @ load context + mov sp,$Xfer @ alloca + + vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned + veor $zero,$zero,$zero + vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]! + vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19 + vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on + vrev32.8 @X[-3&7],@X[-3&7] @ big-endian... + vrev32.8 @X[-2&7],@X[-2&7] + vadd.i32 @X[0],@X[-4&7],$K + vrev32.8 @X[-1&7],@X[-1&7] + vadd.i32 @X[1],@X[-3&7],$K + vst1.32 {@X[0]},[$Xfer,:128]! + vadd.i32 @X[2],@X[-2&7],$K + vst1.32 {@X[1]},[$Xfer,:128]! + vst1.32 {@X[2]},[$Xfer,:128]! + ldr $Ki,[sp] @ big RAW stall + +.Loop_neon: +___ + &Xupdate_16_31(\&body_00_19); + &Xupdate_16_31(\&body_00_19); + &Xupdate_16_31(\&body_00_19); + &Xupdate_16_31(\&body_00_19); + &Xupdate_32_79(\&body_00_19); + &Xupdate_32_79(\&body_20_39); + &Xupdate_32_79(\&body_20_39); + &Xupdate_32_79(\&body_20_39); + &Xupdate_32_79(\&body_20_39); + &Xupdate_32_79(\&body_20_39); + &Xupdate_32_79(\&body_40_59); + &Xupdate_32_79(\&body_40_59); + &Xupdate_32_79(\&body_40_59); + &Xupdate_32_79(\&body_40_59); + &Xupdate_32_79(\&body_40_59); + &Xupdate_32_79(\&body_20_39); + &Xuplast_80(\&body_20_39); + &Xloop(\&body_20_39); + &Xloop(\&body_20_39); + &Xloop(\&body_20_39); +$code.=<<___; + ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context + add $a,$a,$Ki + ldr $Ki,[$ctx,#16] + add $b,$b,$t0 + add $c,$c,$t1 + add $d,$d,$Xfer + it eq + moveq sp,$saved_sp + add $e,$e,$Ki + it ne + ldrne $Ki,[sp] + stmia $ctx,{$a,$b,$c,$d,$e} + itt ne + addne $Xfer,sp,#3*16 + bne .Loop_neon + + @ vldmia sp!,{d8-d15} + ldmia sp!,{r4-r12,pc} +.size sha1_block_data_order_neon,.-sha1_block_data_order_neon +#endif +___ +}}} +##################################################################### +# ARMv8 stuff +# +{{{ +my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3)); +my @MSG=map("q$_",(4..7)); +my @Kxx=map("q$_",(8..11)); +my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xf,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d|0x10 +# endif + +.type sha1_block_data_order_armv8,%function +.align 5 +sha1_block_data_order_armv8: +.LARMv8: + vstmdb sp!,{d8-d15} @ ABI specification says so + + veor $E,$E,$E + adr r3,.LK_00_19 + vld1.32 {$ABCD},[$ctx]! + vld1.32 {$E\[0]},[$ctx] + sub $ctx,$ctx,#16 + vld1.32 {@Kxx[0]\[]},[r3,:32]! + vld1.32 {@Kxx[1]\[]},[r3,:32]! + vld1.32 {@Kxx[2]\[]},[r3,:32]! + vld1.32 {@Kxx[3]\[]},[r3,:32] + +.Loop_v8: + vld1.8 {@MSG[0]-@MSG[1]},[$inp]! + vld1.8 {@MSG[2]-@MSG[3]},[$inp]! 
+ vrev32.8 @MSG[0],@MSG[0] + vrev32.8 @MSG[1],@MSG[1] + + vadd.i32 $W0,@Kxx[0],@MSG[0] + vrev32.8 @MSG[2],@MSG[2] + vmov $ABCD_SAVE,$ABCD @ offload + subs $len,$len,#1 + + vadd.i32 $W1,@Kxx[0],@MSG[1] + vrev32.8 @MSG[3],@MSG[3] + sha1h $E1,$ABCD @ 0 + sha1c $ABCD,$E,$W0 + vadd.i32 $W0,@Kxx[$j],@MSG[2] + sha1su0 @MSG[0],@MSG[1],@MSG[2] +___ +for ($j=0,$i=1;$i<20-3;$i++) { +my $f=("c","p","m","p")[$i/5]; +$code.=<<___; + sha1h $E0,$ABCD @ $i + sha1$f $ABCD,$E1,$W1 + vadd.i32 $W1,@Kxx[$j],@MSG[3] + sha1su1 @MSG[0],@MSG[3] +___ +$code.=<<___ if ($i<20-4); + sha1su0 @MSG[1],@MSG[2],@MSG[3] +___ + ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0); + push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0); +} +$code.=<<___; + sha1h $E0,$ABCD @ $i + sha1p $ABCD,$E1,$W1 + vadd.i32 $W1,@Kxx[$j],@MSG[3] + + sha1h $E1,$ABCD @ 18 + sha1p $ABCD,$E0,$W0 + + sha1h $E0,$ABCD @ 19 + sha1p $ABCD,$E1,$W1 + + vadd.i32 $E,$E,$E0 + vadd.i32 $ABCD,$ABCD,$ABCD_SAVE + bne .Loop_v8 + + vst1.32 {$ABCD},[$ctx]! + vst1.32 {$E\[0]},[$ctx] + + vldmia sp!,{d8-d15} + ret @ bx lr +.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 +#endif +___ +}}} +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +{ my %opcode = ( + "sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40, + "sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40, + "sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 ); + + sub unsha1 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + + # this fix-up provides Thumb encoding in conjunction with INST + $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000); + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } +} + +foreach (split($/,$code)) { + s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or + s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo; + + s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo; + + s/\bret\b/bx lr/o or + s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4 + + print $_,$/; +} + +close STDOUT; # enforce flush diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-armv8.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-armv8.pl new file mode 100644 index 0000000..84a00bf --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-armv8.pl @@ -0,0 +1,363 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA1 for ARMv8. 
+# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# hardware-assisted software(*) +# Apple A7 2.31 4.13 (+14%) +# Cortex-A53 2.24 8.03 (+97%) +# Cortex-A57 2.35 7.88 (+74%) +# Denver 2.13 3.97 (+0%)(**) +# X-Gene 8.80 (+200%) +# Mongoose 2.05 6.50 (+160%) +# +# (*) Software results are presented mostly for reference purposes. +# (**) Keep in mind that Denver relies on binary translation, which +# optimizes compiler output at run-time. + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +($ctx,$inp,$num)=("x0","x1","x2"); +@Xw=map("w$_",(3..17,19)); +@Xx=map("x$_",(3..17,19)); +@V=($A,$B,$C,$D,$E)=map("w$_",(20..24)); +($t0,$t1,$t2,$K)=map("w$_",(25..28)); + + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i<15 && !($i&1)); + lsr @Xx[$i+1],@Xx[$i],#32 +___ +$code.=<<___ if ($i<14 && !($i&1)); + ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`] +___ +$code.=<<___ if ($i<14 && ($i&1)); +#ifdef __ARMEB__ + ror @Xx[$i+1],@Xx[$i+1],#32 +#else + rev32 @Xx[$i+1],@Xx[$i+1] +#endif +___ +$code.=<<___ if ($i<14); + bic $t0,$d,$b + and $t1,$c,$b + ror $t2,$a,#27 + add $d,$d,$K // future e+=K + orr $t0,$t0,$t1 + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) +___ +$code.=<<___ if ($i==19); + movz $K,#0xeba1 + movk $K,#0x6ed9,lsl#16 +___ +$code.=<<___ if ($i>=14); + eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] + bic $t0,$d,$b + and $t1,$c,$b + ror $t2,$a,#27 + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] + add $d,$d,$K // future e+=K + orr $t0,$t0,$t1 + add $e,$e,$t2 // e+=rot(a,5) + eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] + ror $b,$b,#2 + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) + ror @Xw[$j],@Xw[$j],#31 +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i==59); + movz $K,#0xc1d6 + movk $K,#0xca62,lsl#16 +___ +$code.=<<___; + orr $t0,$b,$c + and $t1,$b,$c + eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] + ror $t2,$a,#27 + and $t0,$t0,$d + add $d,$d,$K // future e+=K + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] + add $e,$e,$t2 // e+=rot(a,5) + orr $t0,$t0,$t1 + ror $b,$b,#2 + eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) + ror @Xw[$j],@Xw[$j],#31 +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i==39); + movz $K,#0xbcdc + movk $K,#0x8f1b,lsl#16 +___ +$code.=<<___ if ($i<78); + eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] + eor $t0,$d,$b + ror $t2,$a,#27 + add $d,$d,$K // future e+=K + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] + eor $t0,$t0,$c + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) + ror @Xw[$j],@Xw[$j],#31 +___ +$code.=<<___ if ($i==78); + ldp @Xw[1],@Xw[2],[$ctx] + eor $t0,$d,$b + ror $t2,$a,#27 + add $d,$d,$K // future e+=K + eor $t0,$t0,$c + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) +___ +$code.=<<___ if ($i==79); + ldp @Xw[3],@Xw[4],[$ctx,#8] + eor $t0,$d,$b + ror $t2,$a,#27 + eor $t0,$t0,$c + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + ldr @Xw[5],[$ctx,#16] + add 
$e,$e,$t0 // e+=F(b,c,d) +___ +} + +$code.=<<___; +#include "arm_arch.h" + +.text + +.extern OPENSSL_armcap_P +.globl sha1_block_data_order +.type sha1_block_data_order,%function +.align 6 +sha1_block_data_order: +#ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +#else + ldr x16,.LOPENSSL_armcap_P +#endif + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA1 + b.ne .Lv8_entry + + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldr $E,[$ctx,#16] + +.Loop: + ldr @Xx[0],[$inp],#64 + movz $K,#0x7999 + sub $num,$num,#1 + movk $K,#0x5a82,lsl#16 +#ifdef __ARMEB__ + ror $Xx[0],@Xx[0],#32 +#else + rev32 @Xx[0],@Xx[0] +#endif + add $E,$E,$K // warm it up + add $E,$E,@Xw[0] +___ +for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + add $B,$B,@Xw[2] + add $C,$C,@Xw[3] + add $A,$A,@Xw[1] + add $D,$D,@Xw[4] + add $E,$E,@Xw[5] + stp $A,$B,[$ctx] + stp $C,$D,[$ctx,#8] + str $E,[$ctx,#16] + cbnz $num,.Loop + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldr x29,[sp],#96 + ret +.size sha1_block_data_order,.-sha1_block_data_order +___ +{{{ +my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3)); +my @MSG=map("v$_.16b",(4..7)); +my @Kxx=map("v$_.4s",(16..19)); +my ($W0,$W1)=("v20.4s","v21.4s"); +my $ABCD_SAVE="v22.16b"; + +$code.=<<___; +.type sha1_block_armv8,%function +.align 6 +sha1_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + adr x4,.Lconst + eor $E,$E,$E + ld1.32 {$ABCD},[$ctx],#16 + ld1.32 {$E}[0],[$ctx] + sub $ctx,$ctx,#16 + ld1.32 {@Kxx[0]-@Kxx[3]},[x4] + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + + add.i32 $W0,@Kxx[0],@MSG[0] + rev32 @MSG[2],@MSG[2] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + + add.i32 $W1,@Kxx[0],@MSG[1] + rev32 @MSG[3],@MSG[3] + sha1h $E1,$ABCD + sha1c $ABCD,$E,$W0 // 0 + add.i32 $W0,@Kxx[$j],@MSG[2] + sha1su0 @MSG[0],@MSG[1],@MSG[2] +___ +for ($j=0,$i=1;$i<20-3;$i++) { +my $f=("c","p","m","p")[$i/5]; +$code.=<<___; + sha1h $E0,$ABCD // $i + sha1$f $ABCD,$E1,$W1 + add.i32 $W1,@Kxx[$j],@MSG[3] + sha1su1 @MSG[0],@MSG[3] +___ +$code.=<<___ if ($i<20-4); + sha1su0 @MSG[1],@MSG[2],@MSG[3] +___ + ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0); + push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0); +} +$code.=<<___; + sha1h $E0,$ABCD // $i + sha1p $ABCD,$E1,$W1 + add.i32 $W1,@Kxx[$j],@MSG[3] + + sha1h $E1,$ABCD // 18 + sha1p $ABCD,$E0,$W0 + + sha1h $E0,$ABCD // 19 + sha1p $ABCD,$E1,$W1 + + add.i32 $E,$E,$E0 + add.i32 $ABCD,$ABCD,$ABCD_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD},[$ctx],#16 + st1.32 {$E}[0],[$ctx] + + ldr x29,[sp],#16 + ret +.size sha1_block_armv8,.-sha1_block_armv8 +.align 6 +.Lconst: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 +.LOPENSSL_armcap_P: +#ifdef __ILP32__ +.long OPENSSL_armcap_P-. +#else +.quad OPENSSL_armcap_P-. 
+#endif +.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by " +.align 2 +.comm OPENSSL_armcap_P,4,4 +___ +}}} + +{ my %opcode = ( + "sha1c" => 0x5e000000, "sha1p" => 0x5e001000, + "sha1m" => 0x5e002000, "sha1su0" => 0x5e003000, + "sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 ); + + sub unsha1 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/geo; + + s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo; + + s/\.\w?32\b//o and s/\.16b/\.4s/go; + m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; + + print $_,"\n"; +} + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-c64xplus.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-c64xplus.pl new file mode 100644 index 0000000..4db2bcb --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-c64xplus.pl @@ -0,0 +1,337 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA1 for C64x+. +# +# November 2011 +# +# If compared to compiler-generated code with similar characteristics, +# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs, +# this implementation is 25% smaller and >2x faster. In absolute terms +# performance is (quite impressive) ~6.5 cycles per processed byte. +# Fully unrolled assembler would be ~5x larger and is likely to be +# ~15% faster. It would be free from references to intermediate ring +# buffer, but put more pressure on L1P [both because the code would be +# larger and won't be using SPLOOP buffer]. There are no plans to +# realize fully unrolled variant though... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments + +($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25)); +($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27"); +($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31)); +($XPA,$XPB) = ("A5","B5"); # X circular buffer +($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM + +$code=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg sha1_block_data_order,_sha1_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg MV,SWAP2 + .asg MV,SWAP4 + .endif + + .global _sha1_block_data_order +_sha1_block_data_order: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] LDW *${CTX}[0],$A ; load A-E... 
+|| [A0] AND B0,SP,SP ; align stack at 64 bytes + [A0] LDW *${CTX}[1],$B +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + [A0] LDW *${CTX}[2],$C +|| [A0] MVK 0x00404,B0 + [A0] LDW *${CTX}[3],$D +|| [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB] + [A0] LDW *${CTX}[4],$E +|| [A0] MVC B0,AMR ; setup circular addressing + LDNW *${INP}++,$TX1 ; pre-fetch input + NOP 1 + +loop?: + MVK 0x00007999,$K +|| ADDAW SP,2,$XPA +|| SUB A0,1,A0 +|| MVK 13,B0 + MVKH 0x5a820000,$K ; K_00_19 +|| ADDAW SP,2,$XPB +|| MV $A,$Actx +|| MV $B,$Bctx +;;================================================== + SPLOOPD 5 ; BODY_00_13 +|| MV $C,$Cctx +|| MV $D,$Dctx +|| MV $E,$Ectx +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX3 ; byte swap + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A + + ADD $TX3,$T,$A ; A=T+Xi +|| STW $TX3,*${XPB}++ + SPKERNEL +;;================================================== + ROTL $A,5,$Arot ; BODY_14 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| LDW *${XPA}++,$X0 ; fetches from X ring buffer are +|| LDW *${XPB}[4],$X2 ; 2 iterations ahead + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +;;================================================== + ROTL $A,5,$Arot ; BODY_15 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| MVK 3,B0 +;;================================================== + SPLOOPD 5 ; BODY_16_19 +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 + SPKERNEL + + MVK 0xffffeba1,$K +|| MVK 19,B0 + MVKH 0x6ed90000,$K ; K_20_39 +___ +sub BODY_20_39 { +$code.=<<___; +;;================================================== + SPLOOPD 5 ; BODY_20_39 +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| XOR $B,$C,$F +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $D,$F,$F ; F_20_39(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR 
$X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ ; last one is redundant +|| XOR $TX0,$TX1,$TX1 + SPKERNEL +___ +$code.=<<___ if (!shift); + MVK 0xffffbcdc,$K + MVKH 0x8f1b0000,$K ; K_40_59 +___ +} &BODY_20_39(); +$code.=<<___; +;;================================================== + SPLOOPD 5 ; BODY_40_59 +|| MVC B0,ILC +|| AND $B,$C,$F +|| AND $B,$D,$F0 + + ROTL $A,5,$Arot +|| XOR $F0,$F,$F +|| AND $C,$D,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_40_59(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_40_59(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| AND $B,$C,$F +|| AND $B,$D,$F0 + SPKERNEL + + MVK 0xffffc1d6,$K +|| MVK 18,B0 + MVKH 0xca620000,$K ; K_60_79 +___ + &BODY_20_39(-1); # BODY_60_78 +$code.=<<___; +;;================================================== + [A0] B loop? +|| ROTL $A,5,$Arot ; BODY_79 +|| XOR $B,$C,$F +|| ROTL $TX1,1,$TX2 ; Xupdate output + + [A0] LDNW *${INP}++,$TX1 ; pre-fetch input +|| ADD $K,$E,$T ; T=E+K +|| XOR $D,$F,$F ; F_20_39(B,C,D) + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ADD $Ectx,$D,$E ; E=D,E+=Ectx +|| ADD $Dctx,$C,$D ; D=C,D+=Dctx +|| ROTL $B,30,$C ; C=ROL(B,30) + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| ADD $Bctx,$A,$B ; B=A,B+=Bctx + + ADD $TX2,$T,$A ; A=T+Xi + + ADD $Actx,$A,$A ; A+=Actx +|| ADD $Cctx,$C,$C ; C+=Cctx +;; end of loop? + + BNOP RA ; return +|| MV FP,SP ; restore stack pointer +|| LDW *FP[0],FP ; restore frame pointer + STW $A,*${CTX}[0] ; emit A-E... +|| MVK 0,B0 + STW $B,*${CTX}[1] +|| MVC B0,AMR ; clear AMR + STW $C,*${CTX}[2] + STW $D,*${CTX}[3] + STW $E,*${CTX}[4] + .endasmfunc + + .sect .const + .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-ia64.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-ia64.pl new file mode 100644 index 0000000..dec21f9 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-ia64.pl @@ -0,0 +1,314 @@ +#! /usr/bin/env perl +# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# Eternal question is what's wrong with compiler generated code? The +# trick is that it's possible to reduce the number of shifts required +# to perform rotations by maintaining copy of 32-bit value in upper +# bits of 64-bit register. Just follow mux2 and shrp instructions... 
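# A minimal pure-Perl sketch of that trick (the sub name rotl32_via_dup is
# illustrative only and assumes 64-bit Perl): once the 32-bit value has been
# copied into the upper bits of a 64-bit word, a single 64-bit right shift
# yields any 32-bit rotation.
sub rotl32_via_dup {
my ($x,$n)=@_;				# rotate 32-bit $x left by $n, 1<=$n<=31
	my $dup = (($x&0xffffffff)<<32)|($x&0xffffffff);
	($dup>>(32-$n)) & 0xffffffff;
}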
+# Performance under big-endian OS such as HP-UX is 179MBps*1GHz, which +# is >50% better than HP C and >2x better than gcc. + +$output = pop; + +$code=<<___; +.ident \"sha1-ia64.s, version 1.3\" +.ident \"IA-64 ISA artwork by Andy Polyakov \" +.explicit + +___ + + +if ($^O eq "hpux") { + $ADDP="addp4"; + for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } +} else { $ADDP="add"; } + +#$human=1; +if ($human) { # useful for visual code auditing... + ($A,$B,$C,$D,$E) = ("A","B","C","D","E"); + ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4"); + ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = + ( "K_00_19","K_20_39","K_40_59","K_60_79" ); + @X= ( "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", + "X8", "X9","X10","X11","X12","X13","X14","X15" ); +} +else { + ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4"); + ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9"); + ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = + ( "r14", "r15", "loc10", "loc11" ); + @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", + "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" ); +} + +sub BODY_00_15 { +local *code=shift; +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $Xn=@X[$j%16]; + +$code.=<<___ if ($i==0); +{ .mmi; ld1 $X[$i]=[inp],2 // MSB + ld1 tmp2=[tmp3],2 };; +{ .mmi; ld1 tmp0=[inp],2 + ld1 tmp4=[tmp3],2 // LSB + dep $X[$i]=$X[$i],tmp2,8,8 };; +___ +if ($i<15) { + $code.=<<___; +{ .mmi; ld1 $Xn=[inp],2 // forward Xload + nop.m 0x0 + dep tmp1=tmp0,tmp4,8,8 };; +{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload + and tmp4=$c,$b + dep $X[$i]=$X[$i],tmp1,16,16} //;; +{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19 + andcm tmp1=$d,$b + dep.z tmp5=$a,5,27 };; // a<<5 +{ .mmi; add $e=$e,$X[$i] // e+=Xload + or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) + extr.u tmp1=$a,27,5 };; // a>>27 +{ .mmi; ld1 tmp0=[inp],2 // forward Xload + add $e=$e,tmp4 // e+=F_00_19(b,c,d) + shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) +{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload + or tmp5=tmp1,tmp5 // ROTATE(a,5) + mux2 tmp6=$a,0x44 };; // see b in next iteration +{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5) + dep $Xn=$Xn,tmp2,8,8 // forward Xload + mux2 $X[$i]=$X[$i],0x44 } //;; + +___ + } +else { + $code.=<<___; +{ .mii; and tmp3=$c,$b + dep tmp1=tmp0,tmp4,8,8;; + dep $X[$i]=$X[$i],tmp1,16,16} //;; +{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19 + andcm tmp1=$d,$b + dep.z tmp5=$a,5,27 };; // a<<5 +{ .mmi; add $e=$e,$X[$i] // e+=Xupdate + or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) + extr.u tmp1=$a,27,5 } // a>>27 +{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate + xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate + nop.i 0 };; +{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d) + xor $Xn=$Xn,tmp3 // forward Xupdate + shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) +{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) + mux2 tmp6=$a,0x44 };; // see b in next iteration +{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) + mux2 $X[$i]=$X[$i],0x44 };; + +___ + } +} + +sub BODY_16_19 { +local *code=shift; +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $Xn=@X[$j%16]; + +$code.=<<___; +{ .mib; add $e=$e,$K_00_19 // e+=K_00_19 + dep.z tmp5=$a,5,27 } // a<<5 +{ .mib; andcm tmp1=$d,$b + and tmp0=$c,$b };; +{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate + or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) + extr.u tmp1=$a,27,5 } // a>>27 +{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate + xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate + nop.i 0 };; +{ .mmi; add $e=$e,tmp0 // f+=F_00_19(b,c,d) + xor $Xn=$Xn,tmp3 // 
forward Xupdate + shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) +{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) + mux2 tmp6=$a,0x44 };; // see b in next iteration +{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) + nop.i 0 };; + +___ +} + +sub BODY_20_39 { +local *code=shift; +my ($i,$a,$b,$c,$d,$e,$Konst)=@_; + $Konst = $K_20_39 if (!defined($Konst)); +my $j=$i+1; +my $Xn=@X[$j%16]; + +if ($i<79) { +$code.=<<___; +{ .mib; add $e=$e,$Konst // e+=K_XX_XX + dep.z tmp5=$a,5,27 } // a<<5 +{ .mib; xor tmp0=$c,$b + xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate +{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate + extr.u tmp1=$a,27,5 } // a>>27 +{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d + xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate +{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) + xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate + shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) +{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) + mux2 tmp6=$a,0x44 };; // see b in next iteration +{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) + nop.i 0 };; + +___ +} +else { +$code.=<<___; +{ .mib; add $e=$e,$Konst // e+=K_60_79 + dep.z tmp5=$a,5,27 } // a<<5 +{ .mib; xor tmp0=$c,$b + add $h1=$h1,$a };; // wrap up +{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate + extr.u tmp1=$a,27,5 } // a>>27 +{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d + add $h3=$h3,$c };; // wrap up +{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) + or tmp1=tmp1,tmp5 // ROTATE(a,5) + shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;? +{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5) + add tmp3=1,inp // used in unaligned codepath + add $h4=$h4,$d };; // wrap up + +___ +} +} + +sub BODY_40_59 { +local *code=shift; +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $Xn=@X[$j%16]; + +$code.=<<___; +{ .mib; add $e=$e,$K_40_59 // e+=K_40_59 + dep.z tmp5=$a,5,27 } // a<<5 +{ .mib; and tmp1=$c,$d + xor tmp0=$c,$d };; +{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate + add tmp5=tmp5,tmp1 // a<<5+(c&d) + extr.u tmp1=$a,27,5 } // a>>27 +{ .mmi; and tmp0=tmp0,$b + xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate + xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate +{ .mmi; add $e=$e,tmp0 // e+=b&(c^d) + add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d) + shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) +{ .mmi; xor $Xn=$Xn,tmp3 + mux2 tmp6=$a,0x44 };; // see b in next iteration +{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) + nop.i 0x0 };; + +___ +} +sub BODY_60_79 { &BODY_20_39(@_,$K_60_79); } + +$code.=<<___; +.text + +tmp0=r8; +tmp1=r9; +tmp2=r10; +tmp3=r11; +ctx=r32; // in0 +inp=r33; // in1 + +// void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num); +.global sha1_block_data_order# +.proc sha1_block_data_order# +.align 32 +sha1_block_data_order: + .prologue +{ .mmi; alloc tmp1=ar.pfs,3,14,0,0 + $ADDP tmp0=4,ctx + .save ar.lc,r3 + mov r3=ar.lc } +{ .mmi; $ADDP ctx=0,ctx + $ADDP inp=0,inp + mov r2=pr };; +tmp4=in2; +tmp5=loc12; +tmp6=loc13; + .body +{ .mlx; ld4 $h0=[ctx],8 + movl $K_00_19=0x5a827999 } +{ .mlx; ld4 $h1=[tmp0],8 + movl $K_20_39=0x6ed9eba1 };; +{ .mlx; ld4 $h2=[ctx],8 + movl $K_40_59=0x8f1bbcdc } +{ .mlx; ld4 $h3=[tmp0] + movl $K_60_79=0xca62c1d6 };; +{ .mmi; ld4 $h4=[ctx],-16 + add in2=-1,in2 // adjust num for ar.lc + mov ar.ec=1 };; +{ .mmi; nop.m 0 + add tmp3=1,inp + mov ar.lc=in2 };; // brp.loop.imp: too far + +.Ldtop: +{ .mmi; mov $A=$h0 + mov $B=$h1 + mux2 tmp6=$h1,0x44 } +{ .mmi; mov $C=$h2 + mov $D=$h3 + mov $E=$h4 };; + +___ + +{ my $i; + my 
@V=($A,$B,$C,$D,$E); + + for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } + for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } + for(;$i<40;$i++) { &BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); } + for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } + for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } + + (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check +} + +$code.=<<___; +{ .mmb; add $h0=$h0,$A + add $h2=$h2,$C + br.ctop.dptk.many .Ldtop };; +.Ldend: +{ .mmi; add tmp0=4,ctx + mov ar.lc=r3 };; +{ .mmi; st4 [ctx]=$h0,8 + st4 [tmp0]=$h1,8 };; +{ .mmi; st4 [ctx]=$h2,8 + st4 [tmp0]=$h3 };; +{ .mib; st4 [ctx]=$h4,-16 + mov pr=r2,0x1ffff + br.ret.sptk.many b0 };; +.endp sha1_block_data_order# +stringz "SHA1 block transform for IA64, CRYPTOGAMS by " +___ + +open STDOUT,">$output" if $output; +print $code; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-mb-x86_64.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-mb-x86_64.pl new file mode 100644 index 0000000..51c73c0 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-mb-x86_64.pl @@ -0,0 +1,1582 @@ +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# Multi-buffer SHA1 procedure processes n buffers in parallel by +# placing buffer data to designated lane of SIMD register. n is +# naturally limited to 4 on pre-AVX2 processors and to 8 on +# AVX2-capable processors such as Haswell. +# +# this +aesni(i) sha1 aesni-sha1 gain(iv) +# ------------------------------------------------------------------- +# Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68% +# Atom(ii) 18.1/n +3.93=8.46(n=4) 9.37 12.8 +51% +# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80% +# Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68% +# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160% +# Skylake (8.70 +5.00=13.7)/n 3.64 4.20 +145% +# Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64% +# +# (i) multi-block CBC encrypt with 128-bit key; +# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, +# because of lower AES-NI instruction throughput; +# (iii) "this" is for n=8, when we gather twice as much data, result +# for n=4 is 8.00+4.44=12.4; +# (iv) presented improvement coefficients are asymptotic limits and +# in real-life application are somewhat lower, e.g. 
for 2KB +# fragments they range from 30% to 100% (on Haswell); + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx=0; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +# void sha1_multi_block ( +# struct { unsigned int A[8]; +# unsigned int B[8]; +# unsigned int C[8]; +# unsigned int D[8]; +# unsigned int E[8]; } *ctx, +# struct { void *ptr; int blocks; } inp[8], +# int num); /* 1 or 2 */ +# +$ctx="%rdi"; # 1st arg +$inp="%rsi"; # 2nd arg +$num="%edx"; +@ptr=map("%r$_",(8..11)); +$Tbl="%rbp"; + +@V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4)); +($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9)); +@Xi=map("%xmm$_",(10..14)); +$K="%xmm15"; + +if (1) { + # Atom-specific optimization aiming to eliminate pshufb with high + # registers [and thus get rid of 48 cycles accumulated penalty] + @Xi=map("%xmm$_",(0..4)); + ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9)); + @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14)); +} + +$REG_SZ=16; + +sub Xi_off { +my $off = shift; + + $off %= 16; $off *= $REG_SZ; + $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)"; +} + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $k=$i+2; + +# Loads are performed 2+3/4 iterations in advance. 3/4 means that out +# of 4 words you would expect to be loaded per given iteration one is +# spilled to next iteration. In other words indices in four input +# streams are distributed as following: +# +# $i==0: 0,0,0,0,1,1,1,1,2,2,2, +# $i==1: 2,3,3,3, +# $i==2: 3,4,4,4, +# ... +# $i==13: 14,15,15,15, +# $i==14: 15 +# +# Then at $i==15 Xupdate is applied one iteration in advance... 
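# (This staggered schedule is why the templates below are split by round:
# $i==0 gathers the first words from all lanes, $i<14 keeps loading ahead,
# $i==14 adds the prefetches, and $i>=15 switches to Xupdate. The per-lane
# arithmetic is the usual SHA-1 step applied lane-wise to four independent
# streams at once: e+=K, e+=X[i], e+=Ch(b,c,d), e+=rol(a,5), b=rol(b,30) --
# see the inline comments.)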
+$code.=<<___ if ($i==0); + movd (@ptr[0]),@Xi[0] + lea `16*4`(@ptr[0]),@ptr[0] + movd (@ptr[1]),@Xi[2] # borrow @Xi[2] + lea `16*4`(@ptr[1]),@ptr[1] + movd (@ptr[2]),@Xi[3] # borrow @Xi[3] + lea `16*4`(@ptr[2]),@ptr[2] + movd (@ptr[3]),@Xi[4] # borrow @Xi[4] + lea `16*4`(@ptr[3]),@ptr[3] + punpckldq @Xi[3],@Xi[0] + movd `4*$j-16*4`(@ptr[0]),@Xi[1] + punpckldq @Xi[4],@Xi[2] + movd `4*$j-16*4`(@ptr[1]),$t3 + punpckldq @Xi[2],@Xi[0] + movd `4*$j-16*4`(@ptr[2]),$t2 + pshufb $tx,@Xi[0] +___ +$code.=<<___ if ($i<14); # just load input + movd `4*$j-16*4`(@ptr[3]),$t1 + punpckldq $t2,@Xi[1] + movdqa $a,$t2 + paddd $K,$e # e+=K_00_19 + punpckldq $t1,$t3 + movdqa $b,$t1 + movdqa $b,$t0 + pslld \$5,$t2 + pandn $d,$t1 + pand $c,$t0 + punpckldq $t3,@Xi[1] + movdqa $a,$t3 + + movdqa @Xi[0],`&Xi_off($i)` + paddd @Xi[0],$e # e+=X[i] + movd `4*$k-16*4`(@ptr[0]),@Xi[2] + psrld \$27,$t3 + pxor $t1,$t0 # Ch(b,c,d) + movdqa $b,$t1 + + por $t3,$t2 # rol(a,5) + movd `4*$k-16*4`(@ptr[1]),$t3 + pslld \$30,$t1 + paddd $t0,$e # e+=Ch(b,c,d) + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + pshufb $tx,@Xi[1] + movd `4*$k-16*4`(@ptr[2]),$t2 + por $t1,$b # b=rol(b,30) +___ +$code.=<<___ if ($i==14); # just load input + movd `4*$j-16*4`(@ptr[3]),$t1 + punpckldq $t2,@Xi[1] + movdqa $a,$t2 + paddd $K,$e # e+=K_00_19 + punpckldq $t1,$t3 + movdqa $b,$t1 + movdqa $b,$t0 + pslld \$5,$t2 + prefetcht0 63(@ptr[0]) + pandn $d,$t1 + pand $c,$t0 + punpckldq $t3,@Xi[1] + movdqa $a,$t3 + + movdqa @Xi[0],`&Xi_off($i)` + paddd @Xi[0],$e # e+=X[i] + psrld \$27,$t3 + pxor $t1,$t0 # Ch(b,c,d) + movdqa $b,$t1 + prefetcht0 63(@ptr[1]) + + por $t3,$t2 # rol(a,5) + pslld \$30,$t1 + paddd $t0,$e # e+=Ch(b,c,d) + prefetcht0 63(@ptr[2]) + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + pshufb $tx,@Xi[1] + prefetcht0 63(@ptr[3]) + por $t1,$b # b=rol(b,30) +___ +$code.=<<___ if ($i>=13 && $i<15); + movdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" +___ +$code.=<<___ if ($i>=15); # apply Xupdate + pxor @Xi[-2],@Xi[1] # "X[13]" + movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + movdqa $a,$t2 + pxor `&Xi_off($j+8)`,@Xi[1] + paddd $K,$e # e+=K_00_19 + movdqa $b,$t1 + pslld \$5,$t2 + pxor @Xi[3],@Xi[1] + movdqa $b,$t0 + pandn $d,$t1 + movdqa @Xi[1],$tx + pand $c,$t0 + movdqa $a,$t3 + psrld \$31,$tx + paddd @Xi[1],@Xi[1] + + movdqa @Xi[0],`&Xi_off($i)` + paddd @Xi[0],$e # e+=X[i] + psrld \$27,$t3 + pxor $t1,$t0 # Ch(b,c,d) + + movdqa $b,$t1 + por $t3,$t2 # rol(a,5) + pslld \$30,$t1 + paddd $t0,$e # e+=Ch(b,c,d) + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + por $tx,@Xi[1] # rol \$1,@Xi[1] + por $t1,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___ if ($i<79); + pxor @Xi[-2],@Xi[1] # "X[13]" + movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + movdqa $a,$t2 + movdqa $d,$t0 + pxor `&Xi_off($j+8)`,@Xi[1] + paddd $K,$e # e+=K_20_39 + pslld \$5,$t2 + pxor $b,$t0 + + movdqa $a,$t3 +___ +$code.=<<___ if ($i<72); + movdqa @Xi[0],`&Xi_off($i)` +___ +$code.=<<___ if ($i<79); + paddd @Xi[0],$e # e+=X[i] + pxor @Xi[3],@Xi[1] + psrld \$27,$t3 + pxor $c,$t0 # Parity(b,c,d) + movdqa $b,$t1 + + pslld \$30,$t1 + movdqa @Xi[1],$tx + por $t3,$t2 # rol(a,5) + psrld \$31,$tx + paddd $t0,$e # e+=Parity(b,c,d) + paddd @Xi[1],@Xi[1] + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + por $tx,@Xi[1] # rol(@Xi[1],1) + por $t1,$b # b=rol(b,30) +___ +$code.=<<___ if ($i==79); + movdqa $a,$t2 + paddd $K,$e # e+=K_20_39 + movdqa $d,$t0 + pslld \$5,$t2 + pxor $b,$t0 + + movdqa $a,$t3 + paddd @Xi[0],$e # e+=X[i] + psrld \$27,$t3 + 
movdqa $b,$t1 + pxor $c,$t0 # Parity(b,c,d) + + pslld \$30,$t1 + por $t3,$t2 # rol(a,5) + paddd $t0,$e # e+=Parity(b,c,d) + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + por $t1,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___; + pxor @Xi[-2],@Xi[1] # "X[13]" + movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + movdqa $a,$t2 + movdqa $d,$t1 + pxor `&Xi_off($j+8)`,@Xi[1] + pxor @Xi[3],@Xi[1] + paddd $K,$e # e+=K_40_59 + pslld \$5,$t2 + movdqa $a,$t3 + pand $c,$t1 + + movdqa $d,$t0 + movdqa @Xi[1],$tx + psrld \$27,$t3 + paddd $t1,$e + pxor $c,$t0 + + movdqa @Xi[0],`&Xi_off($i)` + paddd @Xi[0],$e # e+=X[i] + por $t3,$t2 # rol(a,5) + psrld \$31,$tx + pand $b,$t0 + movdqa $b,$t1 + + pslld \$30,$t1 + paddd @Xi[1],@Xi[1] + paddd $t0,$e # e+=Maj(b,d,c) + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + por $tx,@Xi[1] # rol(@X[1],1) + por $t1,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl sha1_multi_block +.type sha1_multi_block,\@function,3 +.align 32 +sha1_multi_block: + mov OPENSSL_ia32cap_P+4(%rip),%rcx + bt \$61,%rcx # check SHA bit + jc _shaext_shortcut +___ +$code.=<<___ if ($avx); + test \$`1<<28`,%ecx + jnz _avx_shortcut +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,-0x78(%rax) + movaps %xmm11,-0x68(%rax) + movaps %xmm12,-0x58(%rax) + movaps %xmm13,-0x48(%rax) + movaps %xmm14,-0x38(%rax) + movaps %xmm15,-0x28(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`,%rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.Lbody: + lea K_XX_XX(%rip),$Tbl + lea `$REG_SZ*16`(%rsp),%rbx + +.Loop_grande: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldone + + movdqu 0x00($ctx),$A # load context + lea 128(%rsp),%rax + movdqu 0x20($ctx),$B + movdqu 0x40($ctx),$C + movdqu 0x60($ctx),$D + movdqu 0x80($ctx),$E + movdqa 0x60($Tbl),$tx # pbswap_mask + movdqa -0x20($Tbl),$K # K_00_19 + jmp .Loop + +.align 32 +.Loop: +___ +for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +$code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39 +for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=" movdqa 0x20($Tbl),$K\n"; # K_40_59 +for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=" movdqa 0x40($Tbl),$K\n"; # K_60_79 +for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + movdqa (%rbx),@Xi[0] # pull counters + mov \$1,%ecx + cmp 4*0(%rbx),%ecx # examinte counters + pxor $t2,$t2 + cmovge $Tbl,@ptr[0] # cancel input + cmp 4*1(%rbx),%ecx + movdqa @Xi[0],@Xi[1] + cmovge $Tbl,@ptr[1] + cmp 4*2(%rbx),%ecx + pcmpgtd $t2,@Xi[1] # mask value + cmovge $Tbl,@ptr[2] + cmp 4*3(%rbx),%ecx + paddd @Xi[1],@Xi[0] # counters-- + cmovge $Tbl,@ptr[3] + + movdqu 0x00($ctx),$t0 + pand @Xi[1],$A + movdqu 0x20($ctx),$t1 + pand @Xi[1],$B + paddd $t0,$A + movdqu 0x40($ctx),$t2 + pand @Xi[1],$C + paddd $t1,$B + movdqu 0x60($ctx),$t3 + pand @Xi[1],$D + paddd $t2,$C + movdqu 0x80($ctx),$tx + pand @Xi[1],$E + movdqu 
$A,0x00($ctx) + paddd $t3,$D + movdqu $B,0x20($ctx) + paddd $tx,$E + movdqu $C,0x40($ctx) + movdqu $D,0x60($ctx) + movdqu $E,0x80($ctx) + + movdqa @Xi[0],(%rbx) # save counters + movdqa 0x60($Tbl),$tx # pbswap_mask + movdqa -0x20($Tbl),$K # K_00_19 + dec $num + jnz .Loop + + mov `$REG_SZ*17+8`(%rsp),$num + lea $REG_SZ($ctx),$ctx + lea `16*$REG_SZ/4`($inp),$inp + dec $num + jnz .Loop_grande + +.Ldone: + mov `$REG_SZ*17`(%rsp),%rax # original %rsp +___ +$code.=<<___ if ($win64); + movaps -0xb8(%rax),%xmm6 + movaps -0xa8(%rax),%xmm7 + movaps -0x98(%rax),%xmm8 + movaps -0x88(%rax),%xmm9 + movaps -0x78(%rax),%xmm10 + movaps -0x68(%rax),%xmm11 + movaps -0x58(%rax),%xmm12 + movaps -0x48(%rax),%xmm13 + movaps -0x38(%rax),%xmm14 + movaps -0x28(%rax),%xmm15 +___ +$code.=<<___; + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lepilogue: + ret +.size sha1_multi_block,.-sha1_multi_block +___ + {{{ +my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10)); +my @MSG0=map("%xmm$_",(4..7)); +my @MSG1=map("%xmm$_",(11..14)); + +$code.=<<___; +.type sha1_multi_block_shaext,\@function,3 +.align 32 +sha1_multi_block_shaext: +_shaext_shortcut: + mov %rsp,%rax + push %rbx + push %rbp +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,-0x78(%rax) + movaps %xmm11,-0x68(%rax) + movaps %xmm12,-0x58(%rax) + movaps %xmm13,-0x48(%rax) + movaps %xmm14,-0x38(%rax) + movaps %xmm15,-0x28(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`,%rsp + shl \$1,$num # we process pair at a time + and \$-256,%rsp + lea 0x40($ctx),$ctx # size optimization + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.Lbody_shaext: + lea `$REG_SZ*16`(%rsp),%rbx + movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap + +.Loop_grande_shaext: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<2;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle %rsp,@ptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldone_shaext + + movq 0x00-0x40($ctx),$ABCD0 # a1.a0 + movq 0x20-0x40($ctx),@MSG0[0]# b1.b0 + movq 0x40-0x40($ctx),@MSG0[1]# c1.c0 + movq 0x60-0x40($ctx),@MSG0[2]# d1.d0 + movq 0x80-0x40($ctx),@MSG0[3]# e1.e0 + + punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0 + punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0 + + movdqa $ABCD0,$ABCD1 + punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0 + punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1 + + pshufd \$0b00111111,@MSG0[3],$E0 + pshufd \$0b01111111,@MSG0[3],$E1 + pshufd \$0b00011011,$ABCD0,$ABCD0 + pshufd \$0b00011011,$ABCD1,$ABCD1 + jmp .Loop_shaext + +.align 32 +.Loop_shaext: + movdqu 0x00(@ptr[0]),@MSG0[0] + movdqu 0x00(@ptr[1]),@MSG1[0] + movdqu 0x10(@ptr[0]),@MSG0[1] + movdqu 0x10(@ptr[1]),@MSG1[1] + movdqu 0x20(@ptr[0]),@MSG0[2] + pshufb $BSWAP,@MSG0[0] + movdqu 0x20(@ptr[1]),@MSG1[2] + pshufb $BSWAP,@MSG1[0] + movdqu 0x30(@ptr[0]),@MSG0[3] + lea 0x40(@ptr[0]),@ptr[0] + pshufb $BSWAP,@MSG0[1] + movdqu 0x30(@ptr[1]),@MSG1[3] + lea 0x40(@ptr[1]),@ptr[1] + pshufb $BSWAP,@MSG1[1] + + movdqa $E0,0x50(%rsp) # offload + paddd @MSG0[0],$E0 + movdqa $E1,0x70(%rsp) + paddd @MSG1[0],$E1 + movdqa $ABCD0,0x40(%rsp) # offload + movdqa $ABCD0,$E0_ + movdqa $ABCD1,0x60(%rsp) + movdqa $ABCD1,$E1_ + sha1rnds4 \$0,$E0,$ABCD0 # 0-3 + sha1nexte @MSG0[1],$E0_ + sha1rnds4 \$0,$E1,$ABCD1 # 
0-3 + sha1nexte @MSG1[1],$E1_ + pshufb $BSWAP,@MSG0[2] + prefetcht0 127(@ptr[0]) + sha1msg1 @MSG0[1],@MSG0[0] + pshufb $BSWAP,@MSG1[2] + prefetcht0 127(@ptr[1]) + sha1msg1 @MSG1[1],@MSG1[0] + + pshufb $BSWAP,@MSG0[3] + movdqa $ABCD0,$E0 + pshufb $BSWAP,@MSG1[3] + movdqa $ABCD1,$E1 + sha1rnds4 \$0,$E0_,$ABCD0 # 4-7 + sha1nexte @MSG0[2],$E0 + sha1rnds4 \$0,$E1_,$ABCD1 # 4-7 + sha1nexte @MSG1[2],$E1 + pxor @MSG0[2],@MSG0[0] + sha1msg1 @MSG0[2],@MSG0[1] + pxor @MSG1[2],@MSG1[0] + sha1msg1 @MSG1[2],@MSG1[1] +___ +for($i=2;$i<20-4;$i++) { +$code.=<<___; + movdqa $ABCD0,$E0_ + movdqa $ABCD1,$E1_ + sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11 + sha1nexte @MSG0[3],$E0_ + sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11 + sha1nexte @MSG1[3],$E1_ + sha1msg2 @MSG0[3],@MSG0[0] + sha1msg2 @MSG1[3],@MSG1[0] + pxor @MSG0[3],@MSG0[1] + sha1msg1 @MSG0[3],@MSG0[2] + pxor @MSG1[3],@MSG1[1] + sha1msg1 @MSG1[3],@MSG1[2] +___ + ($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1); + push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); +} +$code.=<<___; + movdqa $ABCD0,$E0_ + movdqa $ABCD1,$E1_ + sha1rnds4 \$3,$E0,$ABCD0 # 64-67 + sha1nexte @MSG0[3],$E0_ + sha1rnds4 \$3,$E1,$ABCD1 # 64-67 + sha1nexte @MSG1[3],$E1_ + sha1msg2 @MSG0[3],@MSG0[0] + sha1msg2 @MSG1[3],@MSG1[0] + pxor @MSG0[3],@MSG0[1] + pxor @MSG1[3],@MSG1[1] + + mov \$1,%ecx + pxor @MSG0[2],@MSG0[2] # zero + cmp 4*0(%rbx),%ecx # examine counters + cmovge %rsp,@ptr[0] # cancel input + + movdqa $ABCD0,$E0 + movdqa $ABCD1,$E1 + sha1rnds4 \$3,$E0_,$ABCD0 # 68-71 + sha1nexte @MSG0[0],$E0 + sha1rnds4 \$3,$E1_,$ABCD1 # 68-71 + sha1nexte @MSG1[0],$E1 + sha1msg2 @MSG0[0],@MSG0[1] + sha1msg2 @MSG1[0],@MSG1[1] + + cmp 4*1(%rbx),%ecx + cmovge %rsp,@ptr[1] + movq (%rbx),@MSG0[0] # pull counters + + movdqa $ABCD0,$E0_ + movdqa $ABCD1,$E1_ + sha1rnds4 \$3,$E0,$ABCD0 # 72-75 + sha1nexte @MSG0[1],$E0_ + sha1rnds4 \$3,$E1,$ABCD1 # 72-75 + sha1nexte @MSG1[1],$E1_ + + pshufd \$0x00,@MSG0[0],@MSG1[2] + pshufd \$0x55,@MSG0[0],@MSG1[3] + movdqa @MSG0[0],@MSG0[1] + pcmpgtd @MSG0[2],@MSG1[2] + pcmpgtd @MSG0[2],@MSG1[3] + + movdqa $ABCD0,$E0 + movdqa $ABCD1,$E1 + sha1rnds4 \$3,$E0_,$ABCD0 # 76-79 + sha1nexte $MSG0[2],$E0 + sha1rnds4 \$3,$E1_,$ABCD1 # 76-79 + sha1nexte $MSG0[2],$E1 + + pcmpgtd @MSG0[2],@MSG0[1] # counter mask + pand @MSG1[2],$ABCD0 + pand @MSG1[2],$E0 + pand @MSG1[3],$ABCD1 + pand @MSG1[3],$E1 + paddd @MSG0[1],@MSG0[0] # counters-- + + paddd 0x40(%rsp),$ABCD0 + paddd 0x50(%rsp),$E0 + paddd 0x60(%rsp),$ABCD1 + paddd 0x70(%rsp),$E1 + + movq @MSG0[0],(%rbx) # save counters + dec $num + jnz .Loop_shaext + + mov `$REG_SZ*17+8`(%rsp),$num + + pshufd \$0b00011011,$ABCD0,$ABCD0 + pshufd \$0b00011011,$ABCD1,$ABCD1 + + movdqa $ABCD0,@MSG0[0] + punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0 + punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0 + punpckhdq $E1,$E0 # e1.e0.xx.xx + movq $ABCD0,0x00-0x40($ctx) # a1.a0 + psrldq \$8,$ABCD0 + movq @MSG0[0],0x40-0x40($ctx)# c1.c0 + psrldq \$8,@MSG0[0] + movq $ABCD0,0x20-0x40($ctx) # b1.b0 + psrldq \$8,$E0 + movq @MSG0[0],0x60-0x40($ctx)# d1.d0 + movq $E0,0x80-0x40($ctx) # e1.e0 + + lea `$REG_SZ/2`($ctx),$ctx + lea `16*2`($inp),$inp + dec $num + jnz .Loop_grande_shaext + +.Ldone_shaext: + #mov `$REG_SZ*17`(%rsp),%rax # original %rsp +___ +$code.=<<___ if ($win64); + movaps -0xb8(%rax),%xmm6 + movaps -0xa8(%rax),%xmm7 + movaps -0x98(%rax),%xmm8 + movaps -0x88(%rax),%xmm9 + movaps -0x78(%rax),%xmm10 + movaps -0x68(%rax),%xmm11 + movaps -0x58(%rax),%xmm12 + movaps -0x48(%rax),%xmm13 + movaps -0x38(%rax),%xmm14 + movaps -0x28(%rax),%xmm15 +___ +$code.=<<___; + mov 
-16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lepilogue_shaext: + ret +.size sha1_multi_block_shaext,.-sha1_multi_block_shaext +___ + }}} + + if ($avx) {{{ +sub BODY_00_19_avx { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $k=$i+2; +my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128"; +my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4]; + +$code.=<<___ if ($i==0 && $REG_SZ==16); + vmovd (@ptr[0]),@Xi[0] + lea `16*4`(@ptr[0]),@ptr[0] + vmovd (@ptr[1]),@Xi[2] # borrow Xi[2] + lea `16*4`(@ptr[1]),@ptr[1] + vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] + lea `16*4`(@ptr[2]),@ptr[2] + vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2] + lea `16*4`(@ptr[3]),@ptr[3] + vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] + vpunpckldq @Xi[2],@Xi[0],@Xi[0] + vmovd `4*$j-16*4`($ptr_n),$t3 + vpshufb $tx,@Xi[0],@Xi[0] +___ +$code.=<<___ if ($i<15 && $REG_SZ==16); # just load input + vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] + vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3 +___ +$code.=<<___ if ($i==0 && $REG_SZ==32); + vmovd (@ptr[0]),@Xi[0] + lea `16*4`(@ptr[0]),@ptr[0] + vmovd (@ptr[4]),@Xi[2] # borrow Xi[2] + lea `16*4`(@ptr[4]),@ptr[4] + vmovd (@ptr[1]),$t2 + lea `16*4`(@ptr[1]),@ptr[1] + vmovd (@ptr[5]),$t1 + lea `16*4`(@ptr[5]),@ptr[5] + vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] + lea `16*4`(@ptr[2]),@ptr[2] + vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2] + lea `16*4`(@ptr[6]),@ptr[6] + vpinsrd \$1,(@ptr[3]),$t2,$t2 + lea `16*4`(@ptr[3]),@ptr[3] + vpunpckldq $t2,@Xi[0],@Xi[0] + vpinsrd \$1,(@ptr[7]),$t1,$t1 + lea `16*4`(@ptr[7]),@ptr[7] + vpunpckldq $t1,@Xi[2],@Xi[2] + vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] + vinserti128 @Xi[2],@Xi[0],@Xi[0] + vmovd `4*$j-16*4`($ptr_n),$t3 + vpshufb $tx,@Xi[0],@Xi[0] +___ +$code.=<<___ if ($i<15 && $REG_SZ==32); # just load input + vmovd `4*$j-16*4`(@ptr[1]),$t2 + vmovd `4*$j-16*4`(@ptr[5]),$t1 + vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] + vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3 + vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2 + vpunpckldq $t2,@Xi[1],@Xi[1] + vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1 + vpunpckldq $t1,$t3,$t3 +___ +$code.=<<___ if ($i<14); + vpaddd $K,$e,$e # e+=K_00_19 + vpslld \$5,$a,$t2 + vpandn $d,$b,$t1 + vpand $c,$b,$t0 + + vmovdqa @Xi[0],`&Xi_off($i)` + vpaddd @Xi[0],$e,$e # e+=X[i] + $vpack $t3,@Xi[1],@Xi[1] + vpsrld \$27,$a,$t3 + vpxor $t1,$t0,$t0 # Ch(b,c,d) + vmovd `4*$k-16*4`(@ptr[0]),@Xi[2] + + vpslld \$30,$b,$t1 + vpor $t3,$t2,$t2 # rol(a,5) + vmovd `4*$k-16*4`($ptr_n),$t3 + vpaddd $t0,$e,$e # e+=Ch(b,c,d) + + vpsrld \$2,$b,$b + vpaddd $t2,$e,$e # e+=rol(a,5) + vpshufb $tx,@Xi[1],@Xi[1] + vpor $t1,$b,$b # b=rol(b,30) +___ +$code.=<<___ if ($i==14); + vpaddd $K,$e,$e # e+=K_00_19 + prefetcht0 63(@ptr[0]) + vpslld \$5,$a,$t2 + vpandn $d,$b,$t1 + vpand $c,$b,$t0 + + vmovdqa @Xi[0],`&Xi_off($i)` + vpaddd @Xi[0],$e,$e # e+=X[i] + $vpack $t3,@Xi[1],@Xi[1] + vpsrld \$27,$a,$t3 + prefetcht0 63(@ptr[1]) + vpxor $t1,$t0,$t0 # Ch(b,c,d) + + vpslld \$30,$b,$t1 + vpor $t3,$t2,$t2 # rol(a,5) + prefetcht0 63(@ptr[2]) + vpaddd $t0,$e,$e # e+=Ch(b,c,d) + + vpsrld \$2,$b,$b + vpaddd $t2,$e,$e # e+=rol(a,5) + prefetcht0 63(@ptr[3]) + vpshufb $tx,@Xi[1],@Xi[1] + vpor $t1,$b,$b # b=rol(b,30) +___ +$code.=<<___ if ($i>=13 && $i<15); + vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" +___ +$code.=<<___ if ($i>=15); # apply Xupdate + vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" + vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + vpaddd $K,$e,$e # e+=K_00_19 + vpslld \$5,$a,$t2 + vpandn $d,$b,$t1 + `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` + vpand $c,$b,$t0 + + vmovdqa @Xi[0],`&Xi_off($i)` + vpaddd @Xi[0],$e,$e # e+=X[i] 
+ vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] + vpsrld \$27,$a,$t3 + vpxor $t1,$t0,$t0 # Ch(b,c,d) + vpxor @Xi[3],@Xi[1],@Xi[1] + `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` + + vpslld \$30,$b,$t1 + vpor $t3,$t2,$t2 # rol(a,5) + vpaddd $t0,$e,$e # e+=Ch(b,c,d) + `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` + vpsrld \$31,@Xi[1],$tx + vpaddd @Xi[1],@Xi[1],@Xi[1] + + vpsrld \$2,$b,$b + `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` + vpaddd $t2,$e,$e # e+=rol(a,5) + vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1] + vpor $t1,$b,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +sub BODY_20_39_avx { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___ if ($i<79); + vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" + vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + vpslld \$5,$a,$t2 + vpaddd $K,$e,$e # e+=K_20_39 + vpxor $b,$d,$t0 +___ +$code.=<<___ if ($i<72); + vmovdqa @Xi[0],`&Xi_off($i)` +___ +$code.=<<___ if ($i<79); + vpaddd @Xi[0],$e,$e # e+=X[i] + vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] + vpsrld \$27,$a,$t3 + vpxor $c,$t0,$t0 # Parity(b,c,d) + vpxor @Xi[3],@Xi[1],@Xi[1] + + vpslld \$30,$b,$t1 + vpor $t3,$t2,$t2 # rol(a,5) + vpaddd $t0,$e,$e # e+=Parity(b,c,d) + vpsrld \$31,@Xi[1],$tx + vpaddd @Xi[1],@Xi[1],@Xi[1] + + vpsrld \$2,$b,$b + vpaddd $t2,$e,$e # e+=rol(a,5) + vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1) + vpor $t1,$b,$b # b=rol(b,30) +___ +$code.=<<___ if ($i==79); + vpslld \$5,$a,$t2 + vpaddd $K,$e,$e # e+=K_20_39 + vpxor $b,$d,$t0 + + vpsrld \$27,$a,$t3 + vpaddd @Xi[0],$e,$e # e+=X[i] + vpxor $c,$t0,$t0 # Parity(b,c,d) + + vpslld \$30,$b,$t1 + vpor $t3,$t2,$t2 # rol(a,5) + vpaddd $t0,$e,$e # e+=Parity(b,c,d) + + vpsrld \$2,$b,$b + vpaddd $t2,$e,$e # e+=rol(a,5) + vpor $t1,$b,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +sub BODY_40_59_avx { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___; + vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" + vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + vpaddd $K,$e,$e # e+=K_40_59 + vpslld \$5,$a,$t2 + vpand $c,$d,$t1 + vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] + + vpaddd $t1,$e,$e + vpsrld \$27,$a,$t3 + vpxor $c,$d,$t0 + vpxor @Xi[3],@Xi[1],@Xi[1] + + vmovdqu @Xi[0],`&Xi_off($i)` + vpaddd @Xi[0],$e,$e # e+=X[i] + vpor $t3,$t2,$t2 # rol(a,5) + vpsrld \$31,@Xi[1],$tx + vpand $b,$t0,$t0 + vpaddd @Xi[1],@Xi[1],@Xi[1] + + vpslld \$30,$b,$t1 + vpaddd $t0,$e,$e # e+=Maj(b,d,c) + + vpsrld \$2,$b,$b + vpaddd $t2,$e,$e # e+=rol(a,5) + vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1) + vpor $t1,$b,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +$code.=<<___; +.type sha1_multi_block_avx,\@function,3 +.align 32 +sha1_multi_block_avx: +_avx_shortcut: +___ +$code.=<<___ if ($avx>1); + shr \$32,%rcx + cmp \$2,$num + jb .Lavx + test \$`1<<5`,%ecx + jnz _avx2_shortcut + jmp .Lavx +.align 32 +.Lavx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,-0x78(%rax) + movaps %xmm11,-0x68(%rax) + movaps %xmm12,-0x58(%rax) + movaps %xmm13,-0x48(%rax) + movaps %xmm14,-0x38(%rax) + movaps %xmm15,-0x28(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`, %rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.Lbody_avx: + lea K_XX_XX(%rip),$Tbl + lea `$REG_SZ*16`(%rsp),%rbx + + vzeroupper +.Loop_grande_avx: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp 
$num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldone_avx + + vmovdqu 0x00($ctx),$A # load context + lea 128(%rsp),%rax + vmovdqu 0x20($ctx),$B + vmovdqu 0x40($ctx),$C + vmovdqu 0x60($ctx),$D + vmovdqu 0x80($ctx),$E + vmovdqu 0x60($Tbl),$tx # pbswap_mask + jmp .Loop_avx + +.align 32 +.Loop_avx: +___ +$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 +for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 +for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 +for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 +for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov \$1,%ecx +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + cmp `4*$i`(%rbx),%ecx # examine counters + cmovge $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqu (%rbx),$t0 # pull counters + vpxor $t2,$t2,$t2 + vmovdqa $t0,$t1 + vpcmpgtd $t2,$t1,$t1 # mask value + vpaddd $t1,$t0,$t0 # counters-- + + vpand $t1,$A,$A + vpand $t1,$B,$B + vpaddd 0x00($ctx),$A,$A + vpand $t1,$C,$C + vpaddd 0x20($ctx),$B,$B + vpand $t1,$D,$D + vpaddd 0x40($ctx),$C,$C + vpand $t1,$E,$E + vpaddd 0x60($ctx),$D,$D + vpaddd 0x80($ctx),$E,$E + vmovdqu $A,0x00($ctx) + vmovdqu $B,0x20($ctx) + vmovdqu $C,0x40($ctx) + vmovdqu $D,0x60($ctx) + vmovdqu $E,0x80($ctx) + + vmovdqu $t0,(%rbx) # save counters + vmovdqu 0x60($Tbl),$tx # pbswap_mask + dec $num + jnz .Loop_avx + + mov `$REG_SZ*17+8`(%rsp),$num + lea $REG_SZ($ctx),$ctx + lea `16*$REG_SZ/4`($inp),$inp + dec $num + jnz .Loop_grande_avx + +.Ldone_avx: + mov `$REG_SZ*17`(%rsp),%rax # original %rsp + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xb8(%rax),%xmm6 + movaps -0xa8(%rax),%xmm7 + movaps -0x98(%rax),%xmm8 + movaps -0x88(%rax),%xmm9 + movaps -0x78(%rax),%xmm10 + movaps -0x68(%rax),%xmm11 + movaps -0x58(%rax),%xmm12 + movaps -0x48(%rax),%xmm13 + movaps -0x38(%rax),%xmm14 + movaps -0x28(%rax),%xmm15 +___ +$code.=<<___; + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lepilogue_avx: + ret +.size sha1_multi_block_avx,.-sha1_multi_block_avx +___ + + if ($avx>1) { +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +$REG_SZ=32; + +@ptr=map("%r$_",(12..15,8..11)); + +@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4)); +($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9)); +@Xi=map("%ymm$_",(10..14)); +$K="%ymm15"; + +$code.=<<___; +.type sha1_multi_block_avx2,\@function,3 +.align 32 +sha1_multi_block_avx2: +_avx2_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`, %rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.Lbody_avx2: + lea K_XX_XX(%rip),$Tbl + shr \$1,$num + + vzeroupper +.Loop_grande_avx2: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num + lea `$REG_SZ*16`(%rsp),%rbx +___ +for($i=0;$i<8;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg 
%ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqu 0x00($ctx),$A # load context + lea 128(%rsp),%rax + vmovdqu 0x20($ctx),$B + lea 256+128(%rsp),%rbx + vmovdqu 0x40($ctx),$C + vmovdqu 0x60($ctx),$D + vmovdqu 0x80($ctx),$E + vmovdqu 0x60($Tbl),$tx # pbswap_mask + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: +___ +$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 +for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 +for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 +for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 +for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov \$1,%ecx + lea `$REG_SZ*16`(%rsp),%rbx +___ +for($i=0;$i<8;$i++) { + $code.=<<___; + cmp `4*$i`(%rbx),%ecx # examine counters + cmovge $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqu (%rbx),$t0 # pull counters + vpxor $t2,$t2,$t2 + vmovdqa $t0,$t1 + vpcmpgtd $t2,$t1,$t1 # mask value + vpaddd $t1,$t0,$t0 # counters-- + + vpand $t1,$A,$A + vpand $t1,$B,$B + vpaddd 0x00($ctx),$A,$A + vpand $t1,$C,$C + vpaddd 0x20($ctx),$B,$B + vpand $t1,$D,$D + vpaddd 0x40($ctx),$C,$C + vpand $t1,$E,$E + vpaddd 0x60($ctx),$D,$D + vpaddd 0x80($ctx),$E,$E + vmovdqu $A,0x00($ctx) + vmovdqu $B,0x20($ctx) + vmovdqu $C,0x40($ctx) + vmovdqu $D,0x60($ctx) + vmovdqu $E,0x80($ctx) + + vmovdqu $t0,(%rbx) # save counters + lea 256+128(%rsp),%rbx + vmovdqu 0x60($Tbl),$tx # pbswap_mask + dec $num + jnz .Loop_avx2 + + #mov `$REG_SZ*17+8`(%rsp),$num + #lea $REG_SZ($ctx),$ctx + #lea `16*$REG_SZ/4`($inp),$inp + #dec $num + #jnz .Loop_grande_avx2 + +.Ldone_avx2: + mov `$REG_SZ*17`(%rsp),%rax # original %rsp + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lepilogue_avx2: + ret +.size sha1_multi_block_avx2,.-sha1_multi_block_avx2 +___ + } }}} +$code.=<<___; + +.align 256 + .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 + .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 +K_XX_XX: + .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 + .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 + .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 + .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 + .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 + .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap + .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 + .asciz "SHA1 multi-block transform for x86_64, CRYPTOGAMS by " +___ + +if ($win64) { +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 
16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<.Lbody + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lin_prologue + + mov `16*17`(%rax),%rax # pull saved stack pointer + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + + lea -24-10*16(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler +___ +$code.=<<___ if ($avx>1); +.type avx2_handler,\@abi-omnipotent +.align 16 +avx2_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + mov `32*17`($context),%rax # pull saved stack pointer + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore cotnext->R12 + mov %r13,224($context) # restore cotnext->R13 + mov %r14,232($context) # restore cotnext->R14 + mov %r15,240($context) # restore cotnext->R15 + + lea -56-10*16(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lin_prologue +.size avx2_handler,.-avx2_handler +___ +$code.=<<___; +.section .pdata +.align 4 + .rva .LSEH_begin_sha1_multi_block + .rva .LSEH_end_sha1_multi_block + .rva .LSEH_info_sha1_multi_block + .rva .LSEH_begin_sha1_multi_block_shaext + .rva .LSEH_end_sha1_multi_block_shaext + .rva 
.LSEH_info_sha1_multi_block_shaext +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_sha1_multi_block_avx + .rva .LSEH_end_sha1_multi_block_avx + .rva .LSEH_info_sha1_multi_block_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_sha1_multi_block_avx2 + .rva .LSEH_end_sha1_multi_block_avx2 + .rva .LSEH_info_sha1_multi_block_avx2 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_sha1_multi_block: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbody,.Lepilogue # HandlerData[] +.LSEH_info_sha1_multi_block_shaext: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_sha1_multi_block_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] +___ +$code.=<<___ if ($avx>1); +.LSEH_info_sha1_multi_block_avx2: + .byte 9,0,0,0 + .rva avx2_handler + .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] +___ +} +#################################################################### + +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if ($dst>=8); + $rex|=0x01 if ($src>=8); + unshift @opcode,$rex|0x40 if ($rex); +} + +sub sha1rnds4 { + if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x0f,0x3a,0xcc); + rex(\@opcode,$3,$2); + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } else { + return "sha1rnds4\t".@_[0]; + } +} + +sub sha1op38 { + my $instr = shift; + my %opcodelet = ( + "sha1nexte" => 0xc8, + "sha1msg1" => 0xc9, + "sha1msg2" => 0xca ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x0f,0x38); + rex(\@opcode,$2,$1); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or + s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or + + s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or + s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or + s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; + + print $_,"\n"; +} + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-mips.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-mips.pl new file mode 100644 index 0000000..882f973 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-mips.pl @@ -0,0 +1,457 @@ +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for MIPS. + +# Performance improvement is 30% on unaligned input. 
The "secret" is +# to deploy lwl/lwr pair to load unaligned input. One could have +# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32- +# compatible subroutine. There is room for minor optimization on +# little-endian platforms... + +# September 2012. +# +# Add MIPS32r2 code (>25% less instructions). + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $PTR_SLL="dsll"; # incidentally works even on n32 + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $PTR_SLL="sll"; + $SZREG=4; +} +# +# +# +###################################################################### + +$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC}); + +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +if (!defined($big_endian)) + { $big_endian=(unpack('L',pack('N',1))==1); } + +# offsets of the Most and Least Significant Bytes +$MSB=$big_endian?0:3; +$LSB=3&~$MSB; + +@X=map("\$$_",(8..23)); # a4-a7,s0-s11 + +$ctx=$a0; +$inp=$a1; +$num=$a2; +$A="\$1"; +$B="\$2"; +$C="\$3"; +$D="\$7"; +$E="\$24"; @V=($A,$B,$C,$D,$E); +$t0="\$25"; +$t1=$num; # $num is offloaded to stack +$t2="\$30"; # fp +$K="\$31"; # ra + +sub BODY_00_14 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if (!$big_endian); +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + wsbh @X[$i],@X[$i] # byte swap($i) + rotr @X[$i],@X[$i],16 +#else + srl $t0,@X[$i],24 # byte swap($i) + srl $t1,@X[$i],8 + andi $t2,@X[$i],0xFF00 + sll @X[$i],@X[$i],24 + andi $t1,0xFF00 + sll $t2,$t2,8 + or @X[$i],$t0 + or $t1,$t2 + or @X[$i],$t1 +#endif +___ +$code.=<<___; +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + addu $e,$K # $i + xor $t0,$c,$d + rotr $t1,$a,27 + lwl @X[$j],$j*4+$MSB($inp) + and $t0,$b + addu $e,$t1 + lwr @X[$j],$j*4+$LSB($inp) + xor $t0,$d + addu $e,@X[$i] + rotr $b,$b,2 + addu $e,$t0 +#else + lwl @X[$j],$j*4+$MSB($inp) + sll $t0,$a,5 # $i + addu $e,$K + lwr @X[$j],$j*4+$LSB($inp) + srl $t1,$a,27 + addu $e,$t0 + xor $t0,$c,$d + addu $e,$t1 + sll $t2,$b,30 + and $t0,$b + srl $b,$b,2 + xor $t0,$d + addu $e,@X[$i] + or $b,$t2 + addu $e,$t0 
+#endif +___ +} + +sub BODY_15_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___ if (!$big_endian && $i==15); +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + wsbh @X[$i],@X[$i] # byte swap($i) + rotr @X[$i],@X[$i],16 +#else + srl $t0,@X[$i],24 # byte swap($i) + srl $t1,@X[$i],8 + andi $t2,@X[$i],0xFF00 + sll @X[$i],@X[$i],24 + andi $t1,0xFF00 + sll $t2,$t2,8 + or @X[$i],$t0 + or @X[$i],$t1 + or @X[$i],$t2 +#endif +___ +$code.=<<___; +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + addu $e,$K # $i + xor @X[$j%16],@X[($j+2)%16] + xor $t0,$c,$d + rotr $t1,$a,27 + xor @X[$j%16],@X[($j+8)%16] + and $t0,$b + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + xor $t0,$d + addu $e,@X[$i%16] + rotr @X[$j%16],@X[$j%16],31 + rotr $b,$b,2 + addu $e,$t0 +#else + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + xor $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + and $t0,$b + srl $t1,@X[$j%16],31 + addu @X[$j%16],@X[$j%16] + srl $b,$b,2 + xor $t0,$d + or @X[$j%16],$t1 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +#endif +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + xor @X[$j%16],@X[($j+2)%16] + addu $e,$K # $i + rotr $t1,$a,27 + xor @X[$j%16],@X[($j+8)%16] + xor $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + xor $t0,$b + addu $e,@X[$i%16] + rotr @X[$j%16],@X[$j%16],31 + rotr $b,$b,2 + addu $e,$t0 +#else + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + xor $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + xor $t0,$b + srl $t1,@X[$j%16],31 + addu @X[$j%16],@X[$j%16] + srl $b,$b,2 + addu $e,@X[$i%16] + or @X[$j%16],$t1 + or $b,$t2 + addu $e,$t0 +#endif +___ +$code.=<<___ if ($i==79); +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + lw @X[0],0($ctx) + addu $e,$K # $i + lw @X[1],4($ctx) + rotr $t1,$a,27 + lw @X[2],8($ctx) + xor $t0,$c,$d + addu $e,$t1 + lw @X[3],12($ctx) + xor $t0,$b + addu $e,@X[$i%16] + lw @X[4],16($ctx) + rotr $b,$b,2 + addu $e,$t0 +#else + lw @X[0],0($ctx) + sll $t0,$a,5 # $i + addu $e,$K + lw @X[1],4($ctx) + srl $t1,$a,27 + addu $e,$t0 + lw @X[2],8($ctx) + xor $t0,$c,$d + addu $e,$t1 + lw @X[3],12($ctx) + sll $t2,$b,30 + xor $t0,$b + lw @X[4],16($ctx) + srl $b,$b,2 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +#endif +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + addu $e,$K # $i + and $t0,$c,$d + xor @X[$j%16],@X[($j+2)%16] + rotr $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + xor $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + and $t0,$b + addu $e,@X[$i%16] + rotr @X[$j%16],@X[$j%16],31 + rotr $b,$b,2 + addu $e,$t0 +#else + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + and $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + addu $e,$t0 + srl $t1,@X[$j%16],31 + xor $t0,$c,$d + addu @X[$j%16],@X[$j%16] + and $t0,$b + srl $b,$b,2 + or @X[$j%16],$t1 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +#endif +___ +} + +$FRAMESIZE=16; # large enough to accommodate NUBI saved registers +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0xc0fff008" : "0xc0ff0000"; + +$code=<<___; +#ifdef OPENSSL_FIPSCANISTER +# include +#endif + +#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2) +#define _MIPS_ARCH_MIPS32R2 +#endif + +.text + +.set noat +.set noreorder +.align 5 +.globl sha1_block_data_order +.ent sha1_block_data_order +sha1_block_data_order: + .frame $sp,$FRAMESIZE*$SZREG,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder + $PTR_SUB $sp,$FRAMESIZE*$SZREG + $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp) + $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp) + $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp) + $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp) + $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp) + $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp) + $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp) + $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp) + $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp) + $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp) + $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp) + $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp) + $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp) + $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp) +___ +$code.=<<___; + $PTR_SLL $num,6 + $PTR_ADD $num,$inp + $REG_S $num,0($sp) + lw $A,0($ctx) + lw $B,4($ctx) + lw $C,8($ctx) + lw $D,12($ctx) + b .Loop + lw $E,16($ctx) +.align 4 +.Loop: + .set reorder + lwl @X[0],$MSB($inp) + lui $K,0x5a82 + lwr @X[0],$LSB($inp) + ori $K,0x7999 # K_00_19 +___ +for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); } +for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0x6ed9 + ori $K,0xeba1 # K_20_39 +___ +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0x8f1b + ori $K,0xbcdc # K_40_59 +___ +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0xca62 + ori $K,0xc1d6 # K_60_79 +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + $PTR_ADD $inp,64 + $REG_L $num,0($sp) + + addu $A,$X[0] + addu $B,$X[1] + sw $A,0($ctx) + addu $C,$X[2] + addu $D,$X[3] + sw $B,4($ctx) + addu $E,$X[4] + sw $C,8($ctx) + sw $D,12($ctx) + sw $E,16($ctx) + .set noreorder + bne $inp,$num,.Loop + nop + + .set noreorder + $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp) + $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp) + $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp) + $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp) + $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp) + $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp) + $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp) + $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp) + $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp) + $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp) + $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp) + $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp) + $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp) + $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE*$SZREG +.end sha1_block_data_order +.rdata +.asciiz "SHA1 for MIPS, CRYPTOGAMS by " +___ +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-parisc.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-parisc.pl new file mode 100644 index 0000000..a85d126 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-parisc.pl @@ -0,0 +1,267 @@ +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for PA-RISC. + +# June 2009. +# +# On PA-7100LC performance is >30% better than gcc 3.2 generated code +# for aligned input and >50% better for unaligned. Compared to vendor +# compiler on PA-8600 it's almost 60% faster in 64-bit build and just +# few percent faster in 32-bit one (this for aligned input, data for +# unaligned input is not available). +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker + # [+ argument transfer] +$ctx="%r26"; # arg0 +$inp="%r25"; # arg1 +$num="%r24"; # arg2 + +$t0="%r28"; +$t1="%r29"; +$K="%r31"; + +@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0); + +@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23"); + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<15); + addl $K,$e,$e ; $i + shd $a,$a,27,$t1 + addl @X[$i],$e,$e + and $c,$b,$t0 + addl $t1,$e,$e + andcm $d,$b,$t1 + shd $b,$b,2,$b + or $t1,$t0,$t0 + addl $t0,$e,$e +___ +$code.=<<___ if ($i>=15); # with forward Xupdate + addl $K,$e,$e ; $i + shd $a,$a,27,$t1 + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + addl @X[$i%16],$e,$e + and $c,$b,$t0 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + addl $t1,$e,$e + andcm $d,$b,$t1 + shd $b,$b,2,$b + or $t1,$t0,$t0 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + add $t0,$e,$e + shd @X[$j%16],@X[$j%16],31,@X[$j%16] +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i + addl $K,$e,$e + shd $a,$a,27,$t1 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + addl @X[$i%16],$e,$e + xor $b,$c,$t0 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + addl $t1,$e,$e + shd $b,$b,2,$b + xor $d,$t0,$t0 + shd @X[$j%16],@X[$j%16],31,@X[$j%16] + addl $t0,$e,$e +___ +$code.=<<___ if ($i==79); # with context load + ldw 0($ctx),@X[0] ; $i + addl $K,$e,$e + shd $a,$a,27,$t1 + ldw 4($ctx),@X[1] + addl @X[$i%16],$e,$e + xor $b,$c,$t0 + ldw 8($ctx),@X[2] + addl $t1,$e,$e + shd $b,$b,2,$b + xor $d,$t0,$t0 + ldw 12($ctx),@X[3] + addl $t0,$e,$e + ldw 16($ctx),@X[4] +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___; + shd $a,$a,27,$t1 ; $i + addl $K,$e,$e + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + xor $d,$c,$t0 + addl @X[$i%16],$e,$e + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + and $b,$t0,$t0 + addl $t1,$e,$e + shd $b,$b,2,$b + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + addl $t0,$e,$e + and $d,$c,$t1 + shd @X[$j%16],@X[$j%16],31,@X[$j%16] + addl $t1,$e,$e +___ +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA 
\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR +sha1_block_data_order + .PROC + .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + + ldw 0($ctx),$A + ldw 4($ctx),$B + ldw 8($ctx),$C + ldw 12($ctx),$D + ldw 16($ctx),$E + + extru $inp,31,2,$t0 ; t0=inp&3; + sh3addl $t0,%r0,$t0 ; t0*=8; + subi 32,$t0,$t0 ; t0=32-t0; + mtctl $t0,%cr11 ; %sar=t0; + +L\$oop + ldi 3,$t0 + andcm $inp,$t0,$t0 ; 64-bit neutral +___ + for ($i=0;$i<15;$i++) { # load input block + $code.="\tldw `4*$i`($t0),@X[$i]\n"; } +$code.=<<___; + cmpb,*= $inp,$t0,L\$aligned + ldw 60($t0),@X[15] + ldw 64($t0),@X[16] +___ + for ($i=0;$i<16;$i++) { # align input + $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; } +$code.=<<___; +L\$aligned + ldil L'0x5a827000,$K ; K_00_19 + ldo 0x999($K),$K +___ +for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0x6ed9e000,$K ; K_20_39 + ldo 0xba1($K),$K +___ + +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0x8f1bb000,$K ; K_40_59 + ldo 0xcdc($K),$K +___ + +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0xca62c000,$K ; K_60_79 + ldo 0x1d6($K),$K +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + addl @X[0],$A,$A + addl @X[1],$B,$B + addl @X[2],$C,$C + addl @X[3],$D,$D + addl @X[4],$E,$E + stw $A,0($ctx) + stw $B,4($ctx) + stw $C,8($ctx) + stw $D,12($ctx) + stw $E,16($ctx) + addib,*<> -1,$num,L\$oop + ldo 64($inp),$inp + + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/,\*/,/gm if ($SIZE_T==4); +$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8); +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-ppc.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-ppc.pl new file mode 100755 index 0000000..add5a9e --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-ppc.pl @@ -0,0 +1,351 @@ +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# I let hardware handle unaligned input(*), except on page boundaries +# (see below for details). Otherwise straightforward implementation +# with X vector in register bank. +# +# (*) this means that this module is inappropriate for PPC403? Does +# anybody know if pre-POWER3 can sustain unaligned load? + +# -m64 -m32 +# ---------------------------------- +# PPC970,gcc-4.0.0 +76% +59% +# Power6,xlc-7 +68% +33% + +$flavour = shift; + +if ($flavour =~ /64/) { + $SIZE_T =8; + $LRSAVE =2*$SIZE_T; + $UCMP ="cmpld"; + $STU ="stdu"; + $POP ="ld"; + $PUSH ="std"; +} elsif ($flavour =~ /32/) { + $SIZE_T =4; + $LRSAVE =$SIZE_T; + $UCMP ="cmplw"; + $STU ="stwu"; + $POP ="lwz"; + $PUSH ="stw"; +} else { die "nonsense $flavour"; } + +# Define endianness based on flavour +# i.e.: linux64le +$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$FRAME=24*$SIZE_T+64; +$LOCALS=6*$SIZE_T; + +$K ="r0"; +$sp ="r1"; +$toc="r2"; +$ctx="r3"; +$inp="r4"; +$num="r5"; +$t0 ="r15"; +$t1 ="r6"; + +$A ="r7"; +$B ="r8"; +$C ="r9"; +$D ="r10"; +$E ="r11"; +$T ="r12"; + +@V=($A,$B,$C,$D,$E,$T); +@X=("r16","r17","r18","r19","r20","r21","r22","r23", + "r24","r25","r26","r27","r28","r29","r30","r31"); + +sub loadbe { +my ($dst, $src, $temp_reg) = @_; +$code.=<<___ if (!$LITTLE_ENDIAN); + lwz $dst,$src +___ +$code.=<<___ if ($LITTLE_ENDIAN); + lwz $temp_reg,$src + rotlwi $dst,$temp_reg,8 + rlwimi $dst,$temp_reg,24,0,7 + rlwimi $dst,$temp_reg,24,16,23 +___ +} + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e,$f)=@_; +my $j=$i+1; + + # Since the last value of $f is discarded, we can use + # it as a temp reg to swap byte-order when needed. 
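# A minimal worked example of the little-endian path of loadbe() above
# (the sample value 0x11223344 is illustrative only, not taken from the
# module): rotlwi by 8 plus two rlwimi inserts of the word rotated by 24
# under byte masks 0-7 and 16-23 amount to a full 32-bit byte reversal,
# so the big-endian message word is recovered from a plain lwz.
#
#   my $w   = 0x11223344;                                   # as fetched by lwz
#   my $dst = (($w << 8)  | ($w >> 24)) & 0xffffffff;       # rotlwi $dst,$temp,8   -> 0x22334411
#   my $r24 = (($w << 24) | ($w >> 8))  & 0xffffffff;       # $temp rotated by 24   -> 0x44112233
#   $dst = ($dst & 0x00ffffff) | ($r24 & 0xff000000);       # rlwimi $dst,$temp,24,0,7
#   $dst = ($dst & 0xffff00ff) | ($r24 & 0x0000ff00);       # rlwimi $dst,$temp,24,16,23
#   printf "%08x\n", $dst;                                  # 44332211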
+ loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0); + loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15); +$code.=<<___ if ($i<15); + add $f,$K,$e + rotlwi $e,$a,5 + add $f,$f,@X[$i] + and $t0,$c,$b + add $f,$f,$e + andc $t1,$d,$b + rotlwi $b,$b,30 + or $t0,$t0,$t1 + add $f,$f,$t0 +___ +$code.=<<___ if ($i>=15); + add $f,$K,$e + rotlwi $e,$a,5 + xor @X[$j%16],@X[$j%16],@X[($j+2)%16] + add $f,$f,@X[$i%16] + and $t0,$c,$b + xor @X[$j%16],@X[$j%16],@X[($j+8)%16] + add $f,$f,$e + andc $t1,$d,$b + rotlwi $b,$b,30 + or $t0,$t0,$t1 + xor @X[$j%16],@X[$j%16],@X[($j+13)%16] + add $f,$f,$t0 + rotlwi @X[$j%16],@X[$j%16],1 +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e,$f)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); + add $f,$K,$e + xor $t0,$b,$d + rotlwi $e,$a,5 + xor @X[$j%16],@X[$j%16],@X[($j+2)%16] + add $f,$f,@X[$i%16] + xor $t0,$t0,$c + xor @X[$j%16],@X[$j%16],@X[($j+8)%16] + add $f,$f,$t0 + rotlwi $b,$b,30 + xor @X[$j%16],@X[$j%16],@X[($j+13)%16] + add $f,$f,$e + rotlwi @X[$j%16],@X[$j%16],1 +___ +$code.=<<___ if ($i==79); + add $f,$K,$e + xor $t0,$b,$d + rotlwi $e,$a,5 + lwz r16,0($ctx) + add $f,$f,@X[$i%16] + xor $t0,$t0,$c + lwz r17,4($ctx) + add $f,$f,$t0 + rotlwi $b,$b,30 + lwz r18,8($ctx) + lwz r19,12($ctx) + add $f,$f,$e + lwz r20,16($ctx) +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e,$f)=@_; +my $j=$i+1; +$code.=<<___; + add $f,$K,$e + rotlwi $e,$a,5 + xor @X[$j%16],@X[$j%16],@X[($j+2)%16] + add $f,$f,@X[$i%16] + and $t0,$b,$c + xor @X[$j%16],@X[$j%16],@X[($j+8)%16] + add $f,$f,$e + or $t1,$b,$c + rotlwi $b,$b,30 + xor @X[$j%16],@X[$j%16],@X[($j+13)%16] + and $t1,$t1,$d + or $t0,$t0,$t1 + rotlwi @X[$j%16],@X[$j%16],1 + add $f,$f,$t0 +___ +} + +$code=<<___; +.machine "any" +.text + +.globl .sha1_block_data_order +.align 4 +.sha1_block_data_order: + $STU $sp,-$FRAME($sp) + mflr r0 + $PUSH r15,`$FRAME-$SIZE_T*17`($sp) + $PUSH r16,`$FRAME-$SIZE_T*16`($sp) + $PUSH r17,`$FRAME-$SIZE_T*15`($sp) + $PUSH r18,`$FRAME-$SIZE_T*14`($sp) + $PUSH r19,`$FRAME-$SIZE_T*13`($sp) + $PUSH r20,`$FRAME-$SIZE_T*12`($sp) + $PUSH r21,`$FRAME-$SIZE_T*11`($sp) + $PUSH r22,`$FRAME-$SIZE_T*10`($sp) + $PUSH r23,`$FRAME-$SIZE_T*9`($sp) + $PUSH r24,`$FRAME-$SIZE_T*8`($sp) + $PUSH r25,`$FRAME-$SIZE_T*7`($sp) + $PUSH r26,`$FRAME-$SIZE_T*6`($sp) + $PUSH r27,`$FRAME-$SIZE_T*5`($sp) + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + lwz $A,0($ctx) + lwz $B,4($ctx) + lwz $C,8($ctx) + lwz $D,12($ctx) + lwz $E,16($ctx) + andi. r0,$inp,3 + bne Lunaligned +Laligned: + mtctr $num + bl Lsha1_block_private + b Ldone + +; PowerPC specification allows an implementation to be ill-behaved +; upon unaligned access which crosses page boundary. "Better safe +; than sorry" principle makes me treat it specially. But I don't +; look for particular offending word, but rather for 64-byte input +; block which crosses the boundary. Once found that block is aligned +; and hashed separately... +.align 4 +Lunaligned: + subfic $t1,$inp,4096 + andi. $t1,$t1,4095 ; distance to closest page boundary + srwi. 
$t1,$t1,6 ; t1/=64 + beq Lcross_page + $UCMP $num,$t1 + ble Laligned ; didn't cross the page boundary + mtctr $t1 + subfc $num,$t1,$num + bl Lsha1_block_private +Lcross_page: + li $t1,16 + mtctr $t1 + addi r20,$sp,$LOCALS ; spot within the frame +Lmemcpy: + lbz r16,0($inp) + lbz r17,1($inp) + lbz r18,2($inp) + lbz r19,3($inp) + addi $inp,$inp,4 + stb r16,0(r20) + stb r17,1(r20) + stb r18,2(r20) + stb r19,3(r20) + addi r20,r20,4 + bdnz Lmemcpy + + $PUSH $inp,`$FRAME-$SIZE_T*18`($sp) + li $t1,1 + addi $inp,$sp,$LOCALS + mtctr $t1 + bl Lsha1_block_private + $POP $inp,`$FRAME-$SIZE_T*18`($sp) + addic. $num,$num,-1 + bne Lunaligned + +Ldone: + $POP r0,`$FRAME+$LRSAVE`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 +___ + +# This is private block function, which uses tailored calling +# interface, namely upon entry SHA_CTX is pre-loaded to given +# registers and counter register contains amount of chunks to +# digest... +$code.=<<___; +.align 4 +Lsha1_block_private: +___ +$code.=<<___; # load K_00_19 + lis $K,0x5a82 + ori $K,$K,0x7999 +___ +for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; # load K_20_39 + lis $K,0x6ed9 + ori $K,$K,0xeba1 +___ +for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; # load K_40_59 + lis $K,0x8f1b + ori $K,$K,0xbcdc +___ +for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; # load K_60_79 + lis $K,0xca62 + ori $K,$K,0xc1d6 +___ +for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + add r16,r16,$E + add r17,r17,$T + add r18,r18,$A + add r19,r19,$B + add r20,r20,$C + stw r16,0($ctx) + mr $A,r16 + stw r17,4($ctx) + mr $B,r17 + stw r18,8($ctx) + mr $C,r18 + stw r19,12($ctx) + mr $D,r19 + stw r20,16($ctx) + mr $E,r20 + addi $inp,$inp,`16*4` + bdnz Lsha1_block_private + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.size .sha1_block_data_order,.-.sha1_block_data_order +___ +$code.=<<___; +.asciz "SHA1 block transform for PPC, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-s390x.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-s390x.pl new file mode 100644 index 0000000..79df1ff --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-s390x.pl @@ -0,0 +1,247 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. 
For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for s390x. + +# April 2007. +# +# Performance is >30% better than gcc 3.3 generated code. But the real +# twist is that SHA1 hardware support is detected and utilized. In +# which case performance can reach further >4.5x for larger chunks. + +# January 2009. +# +# Optimize Xupdate for amount of memory references and reschedule +# instructions to favour dual-issue z10 pipeline. On z10 hardware is +# "only" ~2.3x faster than software. + +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 23% better than code generated by gcc 4.3. + +$kimdfunc=1; # magic function code for kimd instruction + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$K_00_39="%r0"; $K=$K_00_39; +$K_40_79="%r1"; +$ctx="%r2"; $prefetch="%r2"; +$inp="%r3"; +$len="%r4"; + +$A="%r5"; +$B="%r6"; +$C="%r7"; +$D="%r8"; +$E="%r9"; @V=($A,$B,$C,$D,$E); +$t0="%r10"; +$t1="%r11"; +@X=("%r12","%r13","%r14"); +$sp="%r15"; + +$stdframe=16*$SIZE_T+4*8; +$frame=$stdframe+16*4; + +sub Xupdate { +my $i=shift; + +$code.=<<___ if ($i==15); + lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up + lr $X[0],$X[2] +___ +return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle +$code.=<<___ if ($i<16); + lg $X[0],`$i*4`($inp) ### Xload($i) + rllg $X[1],$X[0],32 +___ +$code.=<<___ if ($i>=16); + xgr $X[0],$prefetch ### Xupdate($i) + lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp) + xg $X[0],`$stdframe+4*(($i+8)%16)`($sp) + xgr $X[0],$prefetch + rll $X[0],$X[0],1 + rllg $X[1],$X[0],32 + rll $X[1],$X[1],1 + rllg $X[0],$X[1],32 + lr $X[2],$X[1] # feedback +___ +$code.=<<___ if ($i<=70); + stg $X[0],`$stdframe+4*($i%16)`($sp) +___ +unshift(@X,pop(@X)); +} + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $xi=$X[1]; + + &Xupdate($i); +$code.=<<___; + alr $e,$K ### $i + rll $t1,$a,5 + lr $t0,$d + xr $t0,$c + alr $e,$t1 + nr $t0,$b + alr $e,$xi + xr $t0,$d + rll $b,$b,30 + alr $e,$t0 +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $xi=$X[1]; + + &Xupdate($i); +$code.=<<___; + alr $e,$K ### $i + rll $t1,$a,5 + lr $t0,$b + alr $e,$t1 + xr $t0,$c + alr $e,$xi + xr $t0,$d + rll $b,$b,30 + alr $e,$t0 +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $xi=$X[1]; + + &Xupdate($i); +$code.=<<___; + alr $e,$K ### $i + rll $t1,$a,5 + lr $t0,$b + alr $e,$t1 + or $t0,$c + lr $t1,$b + nr $t0,$d + nr $t1,$c + alr $e,$xi + or $t0,$t1 + rll $b,$b,30 + alr $e,$t0 +___ +} + +$code.=<<___; +.text +.align 64 +.type Ktable,\@object +Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6 + .skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0 +.size Ktable,.-Ktable +.globl sha1_block_data_order +.type sha1_block_data_order,\@function +sha1_block_data_order: +___ +$code.=<<___ if ($kimdfunc); + larl %r1,OPENSSL_s390xcap_P + lg %r0,16(%r1) # check kimd capabilities + tmhh %r0,`0x8000>>$kimdfunc` + jz .Lsoftware + lghi %r0,$kimdfunc + lgr %r1,$ctx + lgr %r2,$inp + 
sllg %r3,$len,6 + .long 0xb93e0002 # kimd %r0,%r2 + brc 1,.-4 # pay attention to "partial completion" + br %r14 +.align 16 +.Lsoftware: +___ +$code.=<<___; + lghi %r1,-$frame + st${g} $ctx,`2*$SIZE_T`($sp) + stm${g} %r6,%r15,`6*$SIZE_T`($sp) + lgr %r0,$sp + la $sp,0(%r1,$sp) + st${g} %r0,0($sp) + + larl $t0,Ktable + llgf $A,0($ctx) + llgf $B,4($ctx) + llgf $C,8($ctx) + llgf $D,12($ctx) + llgf $E,16($ctx) + + lg $K_00_39,0($t0) + lg $K_40_79,8($t0) + +.Lloop: + rllg $K_00_39,$K_00_39,32 +___ +for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + rllg $K_00_39,$K_00_39,32 +___ +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; $K=$K_40_79; + rllg $K_40_79,$K_40_79,32 +___ +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + rllg $K_40_79,$K_40_79,32 +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + + l${g} $ctx,`$frame+2*$SIZE_T`($sp) + la $inp,64($inp) + al $A,0($ctx) + al $B,4($ctx) + al $C,8($ctx) + al $D,12($ctx) + al $E,16($ctx) + st $A,0($ctx) + st $B,4($ctx) + st $C,8($ctx) + st $D,12($ctx) + st $E,16($ctx) + brct${g} $len,.Lloop + + lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) + br %r14 +.size sha1_block_data_order,.-sha1_block_data_order +.string "SHA1 block transform for s390x, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9.pl new file mode 100644 index 0000000..7437ff4 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9.pl @@ -0,0 +1,434 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# +# Hardware SPARC T4 support by David S. Miller . +# ==================================================================== + +# Performance improvement is not really impressive on pre-T1 CPU: +8% +# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it +# turned to be 40% faster than 64-bit code generated by Sun C 5.8 and +# >2x than 64-bit code generated by gcc 3.4. And there is a gimmick. +# X[16] vector is packed to 8 64-bit registers and as result nothing +# is spilled on stack. In addition input data is loaded in compact +# instruction sequence, thus minimizing the window when the code is +# subject to [inter-thread] cache-thrashing hazard. The goal is to +# ensure scalability on UltraSPARC T1, or rather to avoid decay when +# amount of active threads exceeds the number of physical cores. + +# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x +# faster than software. Multi-process benchmark saturates at 11x +# single-process result on 8-core processor, or ~9GBps per 2.85GHz +# socket. 
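# The header above notes that the X[16] vector is packed into eight 64-bit
# registers; the quantity being maintained is the standard SHA-1 message
# schedule, processed two 32-bit words per register. A scalar Perl model of
# one schedule step is sketched below purely for reference (the sub name and
# argument layout are assumptions; nothing in the generated code calls it):
sub Xupdate_scalar_model {
	my @W = @_;				# the 16 most recent 32-bit words, W[t-16]..W[t-1]
	my $x = $W[13] ^ $W[8] ^ $W[2] ^ $W[0];	# W[t-3]^W[t-8]^W[t-14]^W[t-16]
	return (($x << 1) | ($x >> 31)) & 0xffffffff;	# W[t] = rol(x,1)
}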
+ +$output=pop; +open STDOUT,">$output"; + +@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7"); +$rot1m="%g2"; +$tmp64="%g3"; +$Xi="%g4"; +$A="%l0"; +$B="%l1"; +$C="%l2"; +$D="%l3"; +$E="%l4"; +@V=($A,$B,$C,$D,$E); +$K_00_19="%l5"; +$K_20_39="%l6"; +$K_40_59="%l7"; +$K_60_79="%g5"; +@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79); + +$ctx="%i0"; +$inp="%i1"; +$len="%i2"; +$tmp0="%i3"; +$tmp1="%i4"; +$tmp2="%i5"; + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $xi=($i&1)?@X[($i/2)%8]:$Xi; + +$code.=<<___; + sll $a,5,$tmp0 !! $i + add @K[$i/20],$e,$e + srl $a,27,$tmp1 + add $tmp0,$e,$e + and $c,$b,$tmp0 + add $tmp1,$e,$e + sll $b,30,$tmp2 + andn $d,$b,$tmp1 + srl $b,2,$b + or $tmp1,$tmp0,$tmp1 + or $tmp2,$b,$b + add $xi,$e,$e +___ +if ($i&1 && $i<15) { + $code.= + " srlx @X[(($i+1)/2)%8],32,$Xi\n"; +} +$code.=<<___; + add $tmp1,$e,$e +___ +} + +sub Xupdate { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i/2; + +if ($i&1) { +$code.=<<___; + sll $a,5,$tmp0 !! $i + add @K[$i/20],$e,$e + srl $a,27,$tmp1 +___ +} else { +$code.=<<___; + sllx @X[($j+6)%8],32,$Xi ! Xupdate($i) + xor @X[($j+1)%8],@X[$j%8],@X[$j%8] + srlx @X[($j+7)%8],32,$tmp1 + xor @X[($j+4)%8],@X[$j%8],@X[$j%8] + sll $a,5,$tmp0 !! $i + or $tmp1,$Xi,$Xi + add @K[$i/20],$e,$e !! + xor $Xi,@X[$j%8],@X[$j%8] + srlx @X[$j%8],31,$Xi + add @X[$j%8],@X[$j%8],@X[$j%8] + and $Xi,$rot1m,$Xi + andn @X[$j%8],$rot1m,@X[$j%8] + srl $a,27,$tmp1 !! + or $Xi,@X[$j%8],@X[$j%8] +___ +} +} + +sub BODY_16_19 { +my ($i,$a,$b,$c,$d,$e)=@_; + + &Xupdate(@_); + if ($i&1) { + $xi=@X[($i/2)%8]; + } else { + $xi=$Xi; + $code.="\tsrlx @X[($i/2)%8],32,$xi\n"; + } +$code.=<<___; + add $tmp0,$e,$e !! + and $c,$b,$tmp0 + add $tmp1,$e,$e + sll $b,30,$tmp2 + add $xi,$e,$e + andn $d,$b,$tmp1 + srl $b,2,$b + or $tmp1,$tmp0,$tmp1 + or $tmp2,$b,$b + add $tmp1,$e,$e +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $xi; + &Xupdate(@_); + if ($i&1) { + $xi=@X[($i/2)%8]; + } else { + $xi=$Xi; + $code.="\tsrlx @X[($i/2)%8],32,$xi\n"; + } +$code.=<<___; + add $tmp0,$e,$e !! + xor $c,$b,$tmp0 + add $tmp1,$e,$e + sll $b,30,$tmp2 + xor $d,$tmp0,$tmp1 + srl $b,2,$b + add $tmp1,$e,$e + or $tmp2,$b,$b + add $xi,$e,$e +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $xi; + &Xupdate(@_); + if ($i&1) { + $xi=@X[($i/2)%8]; + } else { + $xi=$Xi; + $code.="\tsrlx @X[($i/2)%8],32,$xi\n"; + } +$code.=<<___; + add $tmp0,$e,$e !! + and $c,$b,$tmp0 + add $tmp1,$e,$e + sll $b,30,$tmp2 + or $c,$b,$tmp1 + srl $b,2,$b + and $d,$tmp1,$tmp1 + add $xi,$e,$e + or $tmp1,$tmp0,$tmp1 + or $tmp2,$b,$b + add $tmp1,$e,$e +___ +} + +$code.=<<___; +#include "sparc_arch.h" + +#ifdef __arch64__ +.register %g2,#scratch +.register %g3,#scratch +#endif + +.section ".text",#alloc,#execinstr + +#ifdef __PIC__ +SPARC_PIC_THUNK(%g1) +#endif + +.align 32 +.globl sha1_block_data_order +sha1_block_data_order: + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1] + + andcc %g1, CFR_SHA1, %g0 + be .Lsoftware + nop + + ld [%o0 + 0x00], %f0 ! load context + ld [%o0 + 0x04], %f1 + ld [%o0 + 0x08], %f2 + andcc %o1, 0x7, %g0 + ld [%o0 + 0x0c], %f3 + bne,pn %icc, .Lhwunaligned + ld [%o0 + 0x10], %f4 + +.Lhw_loop: + ldd [%o1 + 0x00], %f8 + ldd [%o1 + 0x08], %f10 + ldd [%o1 + 0x10], %f12 + ldd [%o1 + 0x18], %f14 + ldd [%o1 + 0x20], %f16 + ldd [%o1 + 0x28], %f18 + ldd [%o1 + 0x30], %f20 + subcc %o2, 1, %o2 ! done yet? + ldd [%o1 + 0x38], %f22 + add %o1, 0x40, %o1 + prefetch [%o1 + 63], 20 + + .word 0x81b02820 ! 
SHA1 + + bne,pt SIZE_T_CC, .Lhw_loop + nop + +.Lhwfinish: + st %f0, [%o0 + 0x00] ! store context + st %f1, [%o0 + 0x04] + st %f2, [%o0 + 0x08] + st %f3, [%o0 + 0x0c] + retl + st %f4, [%o0 + 0x10] + +.align 8 +.Lhwunaligned: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f10 +.Lhwunaligned_loop: + ldd [%o1 + 0x08], %f12 + ldd [%o1 + 0x10], %f14 + ldd [%o1 + 0x18], %f16 + ldd [%o1 + 0x20], %f18 + ldd [%o1 + 0x28], %f20 + ldd [%o1 + 0x30], %f22 + ldd [%o1 + 0x38], %f24 + subcc %o2, 1, %o2 ! done yet? + ldd [%o1 + 0x40], %f26 + add %o1, 0x40, %o1 + prefetch [%o1 + 63], 20 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + .word 0x81b02820 ! SHA1 + + bne,pt SIZE_T_CC, .Lhwunaligned_loop + for %f26, %f26, %f10 ! %f10=%f26 + + ba .Lhwfinish + nop + +.align 16 +.Lsoftware: + save %sp,-STACK_FRAME,%sp + sllx $len,6,$len + add $inp,$len,$len + + or %g0,1,$rot1m + sllx $rot1m,32,$rot1m + or $rot1m,1,$rot1m + + ld [$ctx+0],$A + ld [$ctx+4],$B + ld [$ctx+8],$C + ld [$ctx+12],$D + ld [$ctx+16],$E + andn $inp,7,$tmp0 + + sethi %hi(0x5a827999),$K_00_19 + or $K_00_19,%lo(0x5a827999),$K_00_19 + sethi %hi(0x6ed9eba1),$K_20_39 + or $K_20_39,%lo(0x6ed9eba1),$K_20_39 + sethi %hi(0x8f1bbcdc),$K_40_59 + or $K_40_59,%lo(0x8f1bbcdc),$K_40_59 + sethi %hi(0xca62c1d6),$K_60_79 + or $K_60_79,%lo(0xca62c1d6),$K_60_79 + +.Lloop: + ldx [$tmp0+0],@X[0] + ldx [$tmp0+16],@X[2] + ldx [$tmp0+32],@X[4] + ldx [$tmp0+48],@X[6] + and $inp,7,$tmp1 + ldx [$tmp0+8],@X[1] + sll $tmp1,3,$tmp1 + ldx [$tmp0+24],@X[3] + subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too + ldx [$tmp0+40],@X[5] + bz,pt %icc,.Laligned + ldx [$tmp0+56],@X[7] + + sllx @X[0],$tmp1,@X[0] + ldx [$tmp0+64],$tmp64 +___ +for($i=0;$i<7;$i++) +{ $code.=<<___; + srlx @X[$i+1],$tmp2,$Xi + sllx @X[$i+1],$tmp1,@X[$i+1] + or $Xi,@X[$i],@X[$i] +___ +} +$code.=<<___; + srlx $tmp64,$tmp2,$tmp64 + or $tmp64,@X[7],@X[7] +.Laligned: + srlx @X[0],32,$Xi +___ +for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } +for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); } +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + + ld [$ctx+0],@X[0] + ld [$ctx+4],@X[1] + ld [$ctx+8],@X[2] + ld [$ctx+12],@X[3] + add $inp,64,$inp + ld [$ctx+16],@X[4] + cmp $inp,$len + + add $A,@X[0],$A + st $A,[$ctx+0] + add $B,@X[1],$B + st $B,[$ctx+4] + add $C,@X[2],$C + st $C,[$ctx+8] + add $D,@X[3],$D + st $D,[$ctx+12] + add $E,@X[4],$E + st $E,[$ctx+16] + + bne SIZE_T_CC,.Lloop + andn $inp,7,$tmp0 + + ret + restore +.type sha1_block_data_order,#function +.size sha1_block_data_order,(.-sha1_block_data_order) +.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by " +.align 4 +___ + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. 
+sub unvis { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my $ref,$opf; +my %visopf = ( "faligndata" => 0x048, + "for" => 0x07c ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} +sub unalignaddr { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my $ref="$mnemonic\t$rs1,$rs2,$rd"; + + foreach ($rs1,$rs2,$rd) { + if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; } + else { return $ref; } + } + return sprintf ".word\t0x%08x !%s", + 0x81b00300|$rd<<25|$rs1<<14|$rs2, + $ref; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/ + &unvis($1,$2,$3,$4) + /ge; + s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unalignaddr($1,$2,$3,$4) + /ge; + + print $_,"\n"; +} + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9a.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9a.pl new file mode 100644 index 0000000..f9ed563 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9a.pl @@ -0,0 +1,608 @@ +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# January 2009 +# +# Provided that UltraSPARC VIS instructions are pipe-lined(*) and +# pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC +# Graphic Unit would make it possible to achieve higher instruction- +# level parallelism, ILP, and thus higher performance. It should be +# explicitly noted that ILP is the keyword, and it means that this +# code would be unsuitable for cores like UltraSPARC-Tx. The idea is +# not really novel, Sun had VIS-powered implementation for a while. +# Unlike Sun's implementation this one can process multiple unaligned +# input blocks, and as such works as drop-in replacement for OpenSSL +# sha1_block_data_order. Performance improvement was measured to be +# 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on +# UltraSPARC-III. See below for discussion... +# +# The module does not present direct interest for OpenSSL, because +# it doesn't provide better performance on contemporary SPARCv9 CPUs, +# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they +# absolutely must score on UltraSPARC-I-IV can simply replace +# crypto/sha/asm/sha1-sparcv9.pl with this module. +# +# (*) "Pipe-lined" means that even if it takes several cycles to +# complete, next instruction using same functional unit [but not +# depending on the result of the current instruction] can start +# execution without having to wait for the unit. 
"Pairable" +# means that two [or more] independent instructions can be +# issued at the very same time. + +$bits=32; +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } +if ($bits==64) { $bias=2047; $frame=192; } +else { $bias=0; $frame=112; } + +$output=shift; +open STDOUT,">$output"; + +$ctx="%i0"; +$inp="%i1"; +$len="%i2"; +$tmp0="%i3"; +$tmp1="%i4"; +$tmp2="%i5"; +$tmp3="%g5"; + +$base="%g1"; +$align="%g4"; +$Xfer="%o5"; +$nXfer=$tmp3; +$Xi="%o7"; + +$A="%l0"; +$B="%l1"; +$C="%l2"; +$D="%l3"; +$E="%l4"; +@V=($A,$B,$C,$D,$E); + +$Actx="%o0"; +$Bctx="%o1"; +$Cctx="%o2"; +$Dctx="%o3"; +$Ectx="%o4"; + +$fmul="%f32"; +$VK_00_19="%f34"; +$VK_20_39="%f36"; +$VK_40_59="%f38"; +$VK_60_79="%f40"; +@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79); +@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7", + "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16"); + +# This is reference 2x-parallelized VIS-powered Xupdate procedure. It +# covers even K_NN_MM addition... +sub Xupdate { +my ($i)=@_; +my $K=@VK[($i+16)/20]; +my $j=($i+16)%16; + +# [ provided that GSR.alignaddr_offset is 5, $mul contains +# 0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to +# chosen registers... ] +$code.=<<___; + fxors @X[($j+13)%16],@X[$j],@X[$j] !-1/-1/-1:X[0]^=X[13] + fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14] + fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9] + fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9] + faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24 + fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1 + fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1 + ![fxors %f15,%f2,%f2] + for %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp + ![fxors %f0,%f3,%f3] !10/17/12:X[0] dependency + fpadd32 $K,@X[$j],%f20 + std %f20,[$Xfer+`4*$j`] +___ +# The numbers delimited with slash are the earliest possible dispatch +# cycles for given instruction assuming 1 cycle latency for simple VIS +# instructions, such as on UltraSPARC-I&II, 3 cycles latency, such as +# on UltraSPARC-III&IV, and 2 cycles latency(*), respectively. Being +# 2x-parallelized the procedure is "worth" 5, 8.5 or 6 ticks per SHA1 +# round. As [long as] FPU/VIS instructions are perfectly pairable with +# IALU ones, the round timing is defined by the maximum between VIS +# and IALU timings. The latter varies from round to round and averages +# out at 6.25 ticks. This means that USI&II should operate at IALU +# rate, while USIII&IV - at VIS rate. This explains why performance +# improvement varies among processors. Well, given that pure IALU +# sha1-sparcv9.pl module exhibits virtually uniform performance of +# ~9.3 cycles per SHA1 round. Timings mentioned above are theoretical +# lower limits. Real-life performance was measured to be 6.6 cycles +# per SHA1 round on USIIi and 8.3 on USIII. The latter is lower than +# half-round VIS timing, because there are 16 Xupdate-free rounds, +# which "push down" average theoretical timing to 8 cycles... + +# (*) SPARC64-V[II] was originally believed to have 2 cycles VIS +# latency. Well, it might have, but it doesn't have dedicated +# VIS-unit. Instead, VIS instructions are executed by other +# functional units, ones used here - by IALU. This doesn't +# improve effective ILP... +} + +# The reference Xupdate procedure is then "strained" over *pairs* of +# BODY_NN_MM and kind of modulo-scheduled in respect to X[n]^=X[n+13] +# and K_NN_MM addition. 
It's "running" 15 rounds ahead, which leaves +# plenty of room to amortize for read-after-write hazard, as well as +# to fetch and align input for the next spin. The VIS instructions are +# scheduled for latency of 2 cycles, because there are not enough IALU +# instructions to schedule for latency of 3, while scheduling for 1 +# would give no gain on USI&II anyway. + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i&~1; +my $k=($j+16+2)%16; # ahead reference +my $l=($j+16-2)%16; # behind reference +my $K=@VK[($j+16-2)/20]; + +$j=($j+16)%16; + +$code.=<<___ if (!($i&1)); + sll $a,5,$tmp0 !! $i + and $c,$b,$tmp3 + ld [$Xfer+`4*($i%16)`],$Xi + fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14] + srl $a,27,$tmp1 + add $tmp0,$e,$e + fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9] + sll $b,30,$tmp2 + add $tmp1,$e,$e + andn $d,$b,$tmp1 + add $Xi,$e,$e + fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9] + srl $b,2,$b + or $tmp1,$tmp3,$tmp1 + or $tmp2,$b,$b + add $tmp1,$e,$e + faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24 +___ +$code.=<<___ if ($i&1); + sll $a,5,$tmp0 !! $i + and $c,$b,$tmp3 + ld [$Xfer+`4*($i%16)`],$Xi + fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1 + srl $a,27,$tmp1 + add $tmp0,$e,$e + fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1 + sll $b,30,$tmp2 + add $tmp1,$e,$e + fpadd32 $K,@X[$l],%f20 ! + andn $d,$b,$tmp1 + add $Xi,$e,$e + fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13] + srl $b,2,$b + or $tmp1,$tmp3,$tmp1 + fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp + or $tmp2,$b,$b + add $tmp1,$e,$e +___ +$code.=<<___ if ($i&1 && $i>=2); + std %f20,[$Xfer+`4*$l`] ! +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i&~1; +my $k=($j+16+2)%16; # ahead reference +my $l=($j+16-2)%16; # behind reference +my $K=@VK[($j+16-2)/20]; + +$j=($j+16)%16; + +$code.=<<___ if (!($i&1) && $i<64); + sll $a,5,$tmp0 !! $i + ld [$Xfer+`4*($i%16)`],$Xi + fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14] + srl $a,27,$tmp1 + add $tmp0,$e,$e + fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9] + xor $c,$b,$tmp0 + add $tmp1,$e,$e + sll $b,30,$tmp2 + xor $d,$tmp0,$tmp1 + fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9] + srl $b,2,$b + add $tmp1,$e,$e + or $tmp2,$b,$b + add $Xi,$e,$e + faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24 +___ +$code.=<<___ if ($i&1 && $i<64); + sll $a,5,$tmp0 !! $i + ld [$Xfer+`4*($i%16)`],$Xi + fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1 + srl $a,27,$tmp1 + add $tmp0,$e,$e + fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1 + xor $c,$b,$tmp0 + add $tmp1,$e,$e + fpadd32 $K,@X[$l],%f20 ! + sll $b,30,$tmp2 + xor $d,$tmp0,$tmp1 + fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13] + srl $b,2,$b + add $tmp1,$e,$e + fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp + or $tmp2,$b,$b + add $Xi,$e,$e + std %f20,[$Xfer+`4*$l`] ! +___ +$code.=<<___ if ($i==64); + sll $a,5,$tmp0 !! $i + ld [$Xfer+`4*($i%16)`],$Xi + fpadd32 $K,@X[$l],%f20 + srl $a,27,$tmp1 + add $tmp0,$e,$e + xor $c,$b,$tmp0 + add $tmp1,$e,$e + sll $b,30,$tmp2 + xor $d,$tmp0,$tmp1 + std %f20,[$Xfer+`4*$l`] + srl $b,2,$b + add $tmp1,$e,$e + or $tmp2,$b,$b + add $Xi,$e,$e +___ +$code.=<<___ if ($i>64); + sll $a,5,$tmp0 !! 
$i + ld [$Xfer+`4*($i%16)`],$Xi + srl $a,27,$tmp1 + add $tmp0,$e,$e + xor $c,$b,$tmp0 + add $tmp1,$e,$e + sll $b,30,$tmp2 + xor $d,$tmp0,$tmp1 + srl $b,2,$b + add $tmp1,$e,$e + or $tmp2,$b,$b + add $Xi,$e,$e +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i&~1; +my $k=($j+16+2)%16; # ahead reference +my $l=($j+16-2)%16; # behind reference +my $K=@VK[($j+16-2)/20]; + +$j=($j+16)%16; + +$code.=<<___ if (!($i&1)); + sll $a,5,$tmp0 !! $i + ld [$Xfer+`4*($i%16)`],$Xi + fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14] + srl $a,27,$tmp1 + add $tmp0,$e,$e + fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9] + and $c,$b,$tmp0 + add $tmp1,$e,$e + sll $b,30,$tmp2 + or $c,$b,$tmp1 + fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9] + srl $b,2,$b + and $d,$tmp1,$tmp1 + add $Xi,$e,$e + or $tmp1,$tmp0,$tmp1 + faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24 + or $tmp2,$b,$b + add $tmp1,$e,$e + fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1 +___ +$code.=<<___ if ($i&1); + sll $a,5,$tmp0 !! $i + ld [$Xfer+`4*($i%16)`],$Xi + srl $a,27,$tmp1 + add $tmp0,$e,$e + fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1 + and $c,$b,$tmp0 + add $tmp1,$e,$e + fpadd32 $K,@X[$l],%f20 ! + sll $b,30,$tmp2 + or $c,$b,$tmp1 + fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13] + srl $b,2,$b + and $d,$tmp1,$tmp1 + fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp + add $Xi,$e,$e + or $tmp1,$tmp0,$tmp1 + or $tmp2,$b,$b + add $tmp1,$e,$e + std %f20,[$Xfer+`4*$l`] ! +___ +} + +# If there is more data to process, then we pre-fetch the data for +# next iteration in last ten rounds... +sub BODY_70_79 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i&~1; +my $m=($i%8)*2; + +$j=($j+16)%16; + +$code.=<<___ if ($i==70); + sll $a,5,$tmp0 !! $i + ld [$Xfer+`4*($i%16)`],$Xi + srl $a,27,$tmp1 + add $tmp0,$e,$e + ldd [$inp+64],@X[0] + xor $c,$b,$tmp0 + add $tmp1,$e,$e + sll $b,30,$tmp2 + xor $d,$tmp0,$tmp1 + srl $b,2,$b + add $tmp1,$e,$e + or $tmp2,$b,$b + add $Xi,$e,$e + + and $inp,-64,$nXfer + inc 64,$inp + and $nXfer,255,$nXfer + alignaddr %g0,$align,%g0 + add $base,$nXfer,$nXfer +___ +$code.=<<___ if ($i==71); + sll $a,5,$tmp0 !! $i + ld [$Xfer+`4*($i%16)`],$Xi + srl $a,27,$tmp1 + add $tmp0,$e,$e + xor $c,$b,$tmp0 + add $tmp1,$e,$e + sll $b,30,$tmp2 + xor $d,$tmp0,$tmp1 + srl $b,2,$b + add $tmp1,$e,$e + or $tmp2,$b,$b + add $Xi,$e,$e +___ +$code.=<<___ if ($i>=72); + faligndata @X[$m],@X[$m+2],@X[$m] + sll $a,5,$tmp0 !! $i + ld [$Xfer+`4*($i%16)`],$Xi + srl $a,27,$tmp1 + add $tmp0,$e,$e + xor $c,$b,$tmp0 + add $tmp1,$e,$e + fpadd32 $VK_00_19,@X[$m],%f20 + sll $b,30,$tmp2 + xor $d,$tmp0,$tmp1 + srl $b,2,$b + add $tmp1,$e,$e + or $tmp2,$b,$b + add $Xi,$e,$e +___ +$code.=<<___ if ($i<77); + ldd [$inp+`8*($i+1-70)`],@X[2*($i+1-70)] +___ +$code.=<<___ if ($i==77); # redundant if $inp was aligned + add $align,63,$tmp0 + and $tmp0,-8,$tmp0 + ldd [$inp+$tmp0],@X[16] +___ +$code.=<<___ if ($i>=72); + std %f20,[$nXfer+`4*$m`] +___ +} + +$code.=<<___; +.section ".text",#alloc,#execinstr + +.align 64 +vis_const: +.long 0x5a827999,0x5a827999 ! K_00_19 +.long 0x6ed9eba1,0x6ed9eba1 ! K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc ! K_40_59 +.long 0xca62c1d6,0xca62c1d6 ! 
K_60_79 +.long 0x00000100,0x00000100 +.align 64 +.type vis_const,#object +.size vis_const,(.-vis_const) + +.globl sha1_block_data_order +sha1_block_data_order: + save %sp,-$frame,%sp + add %fp,$bias-256,$base + +1: call .+8 + add %o7,vis_const-1b,$tmp0 + + ldd [$tmp0+0],$VK_00_19 + ldd [$tmp0+8],$VK_20_39 + ldd [$tmp0+16],$VK_40_59 + ldd [$tmp0+24],$VK_60_79 + ldd [$tmp0+32],$fmul + + ld [$ctx+0],$Actx + and $base,-256,$base + ld [$ctx+4],$Bctx + sub $base,$bias+$frame,%sp + ld [$ctx+8],$Cctx + and $inp,7,$align + ld [$ctx+12],$Dctx + and $inp,-8,$inp + ld [$ctx+16],$Ectx + + ! X[16] is maintained in FP register bank + alignaddr %g0,$align,%g0 + ldd [$inp+0],@X[0] + sub $inp,-64,$Xfer + ldd [$inp+8],@X[2] + and $Xfer,-64,$Xfer + ldd [$inp+16],@X[4] + and $Xfer,255,$Xfer + ldd [$inp+24],@X[6] + add $base,$Xfer,$Xfer + ldd [$inp+32],@X[8] + ldd [$inp+40],@X[10] + ldd [$inp+48],@X[12] + brz,pt $align,.Laligned + ldd [$inp+56],@X[14] + + ldd [$inp+64],@X[16] + faligndata @X[0],@X[2],@X[0] + faligndata @X[2],@X[4],@X[2] + faligndata @X[4],@X[6],@X[4] + faligndata @X[6],@X[8],@X[6] + faligndata @X[8],@X[10],@X[8] + faligndata @X[10],@X[12],@X[10] + faligndata @X[12],@X[14],@X[12] + faligndata @X[14],@X[16],@X[14] + +.Laligned: + mov 5,$tmp0 + dec 1,$len + alignaddr %g0,$tmp0,%g0 + fpadd32 $VK_00_19,@X[0],%f16 + fpadd32 $VK_00_19,@X[2],%f18 + fpadd32 $VK_00_19,@X[4],%f20 + fpadd32 $VK_00_19,@X[6],%f22 + fpadd32 $VK_00_19,@X[8],%f24 + fpadd32 $VK_00_19,@X[10],%f26 + fpadd32 $VK_00_19,@X[12],%f28 + fpadd32 $VK_00_19,@X[14],%f30 + std %f16,[$Xfer+0] + mov $Actx,$A + std %f18,[$Xfer+8] + mov $Bctx,$B + std %f20,[$Xfer+16] + mov $Cctx,$C + std %f22,[$Xfer+24] + mov $Dctx,$D + std %f24,[$Xfer+32] + mov $Ectx,$E + std %f26,[$Xfer+40] + fxors @X[13],@X[0],@X[0] + std %f28,[$Xfer+48] + ba .Loop + std %f30,[$Xfer+56] +.align 32 +.Loop: +___ +for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +for (;$i<70;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + tst $len + bz,pn `$bits==32?"%icc":"%xcc"`,.Ltail + nop +___ +for (;$i<80;$i++) { &BODY_70_79($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + add $A,$Actx,$Actx + add $B,$Bctx,$Bctx + add $C,$Cctx,$Cctx + add $D,$Dctx,$Dctx + add $E,$Ectx,$Ectx + mov 5,$tmp0 + fxors @X[13],@X[0],@X[0] + mov $Actx,$A + mov $Bctx,$B + mov $Cctx,$C + mov $Dctx,$D + mov $Ectx,$E + alignaddr %g0,$tmp0,%g0 + dec 1,$len + ba .Loop + mov $nXfer,$Xfer + +.align 32 +.Ltail: +___ +for($i=70;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + add $A,$Actx,$Actx + add $B,$Bctx,$Bctx + add $C,$Cctx,$Cctx + add $D,$Dctx,$Dctx + add $E,$Ectx,$Ectx + + st $Actx,[$ctx+0] + st $Bctx,[$ctx+4] + st $Cctx,[$ctx+8] + st $Dctx,[$ctx+12] + st $Ectx,[$ctx+16] + + ret + restore +.type sha1_block_data_order,#function +.size sha1_block_data_order,(.-sha1_block_data_order) +.asciz "SHA1 block transform for SPARCv9a, CRYPTOGAMS by " +.align 4 +___ + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. 
+sub unvis { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my ($ref,$opf); +my %visopf = ( "fmul8ulx16" => 0x037, + "faligndata" => 0x048, + "fpadd32" => 0x052, + "fxor" => 0x06c, + "fxors" => 0x06d ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} +sub unalignaddr { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my $ref="$mnemonic\t$rs1,$rs2,$rd"; + + foreach ($rs1,$rs2,$rd) { + if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; } + else { return $ref; } + } + return sprintf ".word\t0x%08x !%s", + 0x81b00300|$rd<<25|$rs1<<14|$rs2, + $ref; +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/ + &unvis($1,$2,$3,$4) + /gem; +$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/ + &unalignaddr($1,$2,$3,$4) + /gem; +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-thumb.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-thumb.pl new file mode 100644 index 0000000..661fd9f --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-thumb.pl @@ -0,0 +1,266 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# sha1_block for Thumb. +# +# January 2007. +# +# The code does not present direct interest to OpenSSL, because of low +# performance. Its purpose is to establish _size_ benchmark. Pretty +# useless one I must say, because 30% or 88 bytes larger ARMv4 code +# [avialable on demand] is almost _twice_ as fast. It should also be +# noted that in-lining of .Lcommon and .Lrotate improves performance +# by over 40%, while code increases by only 10% or 32 bytes. But once +# again, the goal was to establish _size_ benchmark, not performance. 
+ +$output=shift; +open STDOUT,">$output"; + +$inline=0; +#$cheat_on_binutils=1; + +$t0="r0"; +$t1="r1"; +$t2="r2"; +$a="r3"; +$b="r4"; +$c="r5"; +$d="r6"; +$e="r7"; +$K="r8"; # "upper" registers can be used in add/sub and mov insns +$ctx="r9"; +$inp="r10"; +$len="r11"; +$Xi="r12"; + +sub common { +<<___; + sub $t0,#4 + ldr $t1,[$t0] + add $e,$K @ E+=K_xx_xx + lsl $t2,$a,#5 + add $t2,$e + lsr $e,$a,#27 + add $t2,$e @ E+=ROR(A,27) + add $t2,$t1 @ E+=X[i] +___ +} +sub rotate { +<<___; + mov $e,$d @ E=D + mov $d,$c @ D=C + lsl $c,$b,#30 + lsr $b,$b,#2 + orr $c,$b @ C=ROR(B,2) + mov $b,$a @ B=A + add $a,$t2,$t1 @ A=E+F_xx_xx(B,C,D) +___ +} + +sub BODY_00_19 { +$code.=$inline?&common():"\tbl .Lcommon\n"; +$code.=<<___; + mov $t1,$c + eor $t1,$d + and $t1,$b + eor $t1,$d @ F_00_19(B,C,D) +___ +$code.=$inline?&rotate():"\tbl .Lrotate\n"; +} + +sub BODY_20_39 { +$code.=$inline?&common():"\tbl .Lcommon\n"; +$code.=<<___; + mov $t1,$b + eor $t1,$c + eor $t1,$d @ F_20_39(B,C,D) +___ +$code.=$inline?&rotate():"\tbl .Lrotate\n"; +} + +sub BODY_40_59 { +$code.=$inline?&common():"\tbl .Lcommon\n"; +$code.=<<___; + mov $t1,$b + and $t1,$c + mov $e,$b + orr $e,$c + and $e,$d + orr $t1,$e @ F_40_59(B,C,D) +___ +$code.=$inline?&rotate():"\tbl .Lrotate\n"; +} + +$code=<<___; +.text +.code 16 + +.global sha1_block_data_order +.type sha1_block_data_order,%function + +.align 2 +sha1_block_data_order: +___ +if ($cheat_on_binutils) { +$code.=<<___; +.code 32 + add r3,pc,#1 + bx r3 @ switch to Thumb ISA +.code 16 +___ +} +$code.=<<___; + push {r4-r7} + mov r3,r8 + mov r4,r9 + mov r5,r10 + mov r6,r11 + mov r7,r12 + push {r3-r7,lr} + lsl r2,#6 + mov $ctx,r0 @ save context + mov $inp,r1 @ save inp + mov $len,r2 @ save len + add $len,$inp @ $len to point at inp end + +.Lloop: + mov $Xi,sp + mov $t2,sp + sub $t2,#16*4 @ [3] +.LXload: + ldrb $a,[$t1,#0] @ $t1 is r1 and holds inp + ldrb $b,[$t1,#1] + ldrb $c,[$t1,#2] + ldrb $d,[$t1,#3] + lsl $a,#24 + lsl $b,#16 + lsl $c,#8 + orr $a,$b + orr $a,$c + orr $a,$d + add $t1,#4 + push {$a} + cmp sp,$t2 + bne .LXload @ [+14*16] + + mov $inp,$t1 @ update $inp + sub $t2,#32*4 + sub $t2,#32*4 + mov $e,#31 @ [+4] +.LXupdate: + ldr $a,[sp,#15*4] + ldr $b,[sp,#13*4] + ldr $c,[sp,#7*4] + ldr $d,[sp,#2*4] + eor $a,$b + eor $a,$c + eor $a,$d + ror $a,$e + push {$a} + cmp sp,$t2 + bne .LXupdate @ [+(11+1)*64] + + ldmia $t0!,{$a,$b,$c,$d,$e} @ $t0 is r0 and holds ctx + mov $t0,$Xi + + ldr $t2,.LK_00_19 + mov $t1,$t0 + sub $t1,#20*4 + mov $Xi,$t1 + mov $K,$t2 @ [+7+4] +.L_00_19: +___ + &BODY_00_19(); +$code.=<<___; + cmp $Xi,$t0 + bne .L_00_19 @ [+(2+9+4+2+8+2)*20] + + ldr $t2,.LK_20_39 + mov $t1,$t0 + sub $t1,#20*4 + mov $Xi,$t1 + mov $K,$t2 @ [+5] +.L_20_39_or_60_79: +___ + &BODY_20_39(); +$code.=<<___; + cmp $Xi,$t0 + bne .L_20_39_or_60_79 @ [+(2+9+3+2+8+2)*20*2] + cmp sp,$t0 + beq .Ldone @ [+2] + + ldr $t2,.LK_40_59 + mov $t1,$t0 + sub $t1,#20*4 + mov $Xi,$t1 + mov $K,$t2 @ [+5] +.L_40_59: +___ + &BODY_40_59(); +$code.=<<___; + cmp $Xi,$t0 + bne .L_40_59 @ [+(2+9+6+2+8+2)*20] + + ldr $t2,.LK_60_79 + mov $Xi,sp + mov $K,$t2 + b .L_20_39_or_60_79 @ [+4] +.Ldone: + mov $t0,$ctx + ldr $t1,[$t0,#0] + ldr $t2,[$t0,#4] + add $a,$t1 + ldr $t1,[$t0,#8] + add $b,$t2 + ldr $t2,[$t0,#12] + add $c,$t1 + ldr $t1,[$t0,#16] + add $d,$t2 + add $e,$t1 + stmia $t0!,{$a,$b,$c,$d,$e} @ [+20] + + add sp,#80*4 @ deallocate stack frame + mov $t0,$ctx @ restore ctx + mov $t1,$inp @ restore inp + cmp $t1,$len + beq .Lexit + b .Lloop @ [+6] total 3212 cycles +.Lexit: + pop {r2-r7} + mov r8,r2 + mov r9,r3 + mov 
r10,r4 + mov r11,r5 + mov r12,r6 + mov lr,r7 + pop {r4-r7} + bx lr +.align 2 +___ +$code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline); +$code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline); +$code.=<<___; +.align 2 +.LK_00_19: .word 0x5a827999 +.LK_20_39: .word 0x6ed9eba1 +.LK_40_59: .word 0x8f1bbcdc +.LK_60_79: .word 0xca62c1d6 +.size sha1_block_data_order,.-sha1_block_data_order +.asciz "SHA1 block transform for Thumb, CRYPTOGAMS by " +___ + +print $code; +close STDOUT; # enforce flush diff --git a/openssl-1.1.0h/crypto/sha/asm/sha1-x86_64.pl b/openssl-1.1.0h/crypto/sha/asm/sha1-x86_64.pl new file mode 100755 index 0000000..6a3378b --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha1-x86_64.pl @@ -0,0 +1,2077 @@ +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# sha1_block procedure for x86_64. +# +# It was brought to my attention that on EM64T compiler-generated code +# was far behind 32-bit assembler implementation. This is unlike on +# Opteron where compiler-generated code was only 15% behind 32-bit +# assembler, which originally made it hard to motivate the effort. +# There was suggestion to mechanically translate 32-bit code, but I +# dismissed it, reasoning that x86_64 offers enough register bank +# capacity to fully utilize SHA-1 parallelism. Therefore this fresh +# implementation:-) However! While 64-bit code does perform better +# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, +# x86_64 does offer larger *addressable* bank, but out-of-order core +# reaches for even more registers through dynamic aliasing, and EM64T +# core must have managed to run-time optimize even 32-bit code just as +# good as 64-bit one. Performance improvement is summarized in the +# following table: +# +# gcc 3.4 32-bit asm cycles/byte +# Opteron +45% +20% 6.8 +# Xeon P4 +65% +0% 9.9 +# Core2 +60% +10% 7.0 + +# August 2009. +# +# The code was revised to minimize code size and to maximize +# "distance" between instructions producing input to 'lea' +# instruction and the 'lea' instruction itself, which is essential +# for Intel Atom core. + +# October 2010. +# +# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it +# is to offload message schedule denoted by Wt in NIST specification, +# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module +# for background and implementation details. The only difference from +# 32-bit code is that 64-bit code doesn't have to spill @X[] elements +# to free temporary registers. + +# April 2011. +# +# Add AVX code path. See sha1-586.pl for further information. + +# May 2013. +# +# Add AVX2+BMI code path. 
Initial attempt (utilizing BMI instructions +# and loading pair of consecutive blocks to 256-bit %ymm registers) +# did not provide impressive performance improvement till a crucial +# hint regarding the number of Xupdate iterations to pre-compute in +# advance was provided by Ilya Albrekht of Intel Corp. + +# March 2014. +# +# Add support for Intel SHA Extensions. + +###################################################################### +# Current performance is summarized in following table. Numbers are +# CPU clock cycles spent to process single byte (less is better). +# +# x86_64 SSSE3 AVX[2] +# P4 9.05 - +# Opteron 6.26 - +# Core2 6.55 6.05/+8% - +# Westmere 6.73 5.30/+27% - +# Sandy Bridge 7.70 6.10/+26% 4.99/+54% +# Ivy Bridge 6.06 4.67/+30% 4.60/+32% +# Haswell 5.45 4.15/+31% 3.57/+53% +# Skylake 5.18 4.06/+28% 3.54/+46% +# Bulldozer 9.11 5.95/+53% +# VIA Nano 9.32 7.15/+30% +# Atom 10.3 9.17/+12% +# Silvermont 13.1(*) 9.37/+40% +# Goldmont 8.13 6.42/+27% 1.70/+380%(**) +# +# (*) obviously suboptimal result, nothing was done about it, +# because SSSE3 code is compiled unconditionally; +# (**) SHAEXT result + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([2-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +$shaext=1; ### set to zero if compiling for 1.0.1 +$avx=1 if (!$shaext && $avx); + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$ctx="%rdi"; # 1st arg +$inp="%rsi"; # 2nd arg +$num="%rdx"; # 3rd arg + +# reassign arguments in order to produce more compact code +$ctx="%r8"; +$inp="%r9"; +$num="%r10"; + +$t0="%eax"; +$t1="%ebx"; +$t2="%ecx"; +@xi=("%edx","%ebp","%r14d"); +$A="%esi"; +$B="%edi"; +$C="%r11d"; +$D="%r12d"; +$E="%r13d"; + +@V=($A,$B,$C,$D,$E); + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i==0); + mov `4*$i`($inp),$xi[0] + bswap $xi[0] +___ +$code.=<<___ if ($i<15); + mov `4*$j`($inp),$xi[1] + mov $d,$t0 + mov $xi[0],`4*$i`(%rsp) + mov $a,$t2 + bswap $xi[1] + xor $c,$t0 + rol \$5,$t2 + and $b,$t0 + lea 0x5a827999($xi[0],$e),$e + add $t2,$e + xor $d,$t0 + rol \$30,$b + add $t0,$e +___ +$code.=<<___ if ($i>=15); + xor `4*($j%16)`(%rsp),$xi[1] + mov $d,$t0 + mov $xi[0],`4*($i%16)`(%rsp) + mov $a,$t2 + xor `4*(($j+2)%16)`(%rsp),$xi[1] + xor $c,$t0 + rol \$5,$t2 + xor `4*(($j+8)%16)`(%rsp),$xi[1] + and $b,$t0 + lea 0x5a827999($xi[0],$e),$e + rol \$30,$b + xor $d,$t0 + add $t2,$e + rol \$1,$xi[1] + add $t0,$e +___ +push(@xi,shift(@xi)); +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $K=($i<40)?0x6ed9eba1:0xca62c1d6; +$code.=<<___ if ($i<79); + xor `4*($j%16)`(%rsp),$xi[1] + mov $b,$t0 + `"mov $xi[0],".4*($i%16)."(%rsp)" if 
($i<72)` + mov $a,$t2 + xor `4*(($j+2)%16)`(%rsp),$xi[1] + xor $d,$t0 + rol \$5,$t2 + xor `4*(($j+8)%16)`(%rsp),$xi[1] + lea $K($xi[0],$e),$e + xor $c,$t0 + add $t2,$e + rol \$30,$b + add $t0,$e + rol \$1,$xi[1] +___ +$code.=<<___ if ($i==79); + mov $b,$t0 + mov $a,$t2 + xor $d,$t0 + lea $K($xi[0],$e),$e + rol \$5,$t2 + xor $c,$t0 + add $t2,$e + rol \$30,$b + add $t0,$e +___ +push(@xi,shift(@xi)); +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___; + xor `4*($j%16)`(%rsp),$xi[1] + mov $d,$t0 + mov $xi[0],`4*($i%16)`(%rsp) + mov $d,$t1 + xor `4*(($j+2)%16)`(%rsp),$xi[1] + and $c,$t0 + mov $a,$t2 + xor `4*(($j+8)%16)`(%rsp),$xi[1] + lea 0x8f1bbcdc($xi[0],$e),$e + xor $c,$t1 + rol \$5,$t2 + add $t0,$e + rol \$1,$xi[1] + and $b,$t1 + add $t2,$e + rol \$30,$b + add $t1,$e +___ +push(@xi,shift(@xi)); +} + +$code.=<<___; +.text +.extern OPENSSL_ia32cap_P + +.globl sha1_block_data_order +.type sha1_block_data_order,\@function,3 +.align 16 +sha1_block_data_order: + mov OPENSSL_ia32cap_P+0(%rip),%r9d + mov OPENSSL_ia32cap_P+4(%rip),%r8d + mov OPENSSL_ia32cap_P+8(%rip),%r10d + test \$`1<<9`,%r8d # check SSSE3 bit + jz .Lialu +___ +$code.=<<___ if ($shaext); + test \$`1<<29`,%r10d # check SHA bit + jnz _shaext_shortcut +___ +$code.=<<___ if ($avx>1); + and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2 + cmp \$`1<<3|1<<5|1<<8`,%r10d + je _avx2_shortcut +___ +$code.=<<___ if ($avx); + and \$`1<<28`,%r8d # mask AVX bit + and \$`1<<30`,%r9d # mask "Intel CPU" bit + or %r9d,%r8d + cmp \$`1<<28|1<<30`,%r8d + je _avx_shortcut +___ +$code.=<<___; + jmp _ssse3_shortcut + +.align 16 +.Lialu: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + mov %rdi,$ctx # reassigned argument + sub \$`8+16*4`,%rsp + mov %rsi,$inp # reassigned argument + and \$-64,%rsp + mov %rdx,$num # reassigned argument + mov %rax,`16*4`(%rsp) +.Lprologue: + + mov 0($ctx),$A + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov 16($ctx),$E + jmp .Lloop + +.align 16 +.Lloop: +___ +for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + add 0($ctx),$A + add 4($ctx),$B + add 8($ctx),$C + add 12($ctx),$D + add 16($ctx),$E + mov $A,0($ctx) + mov $B,4($ctx) + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + + sub \$1,$num + lea `16*4`($inp),$inp + jnz .Lloop + + mov `16*4`(%rsp),%rsi + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lepilogue: + ret +.size sha1_block_data_order,.-sha1_block_data_order +___ +if ($shaext) {{{ +###################################################################### +# Intel SHA Extensions implementation of SHA1 update function. 
+# +my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx"); +my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9)); +my @MSG=map("%xmm$_",(4..7)); + +$code.=<<___; +.type sha1_block_data_order_shaext,\@function,3 +.align 32 +sha1_block_data_order_shaext: +_shaext_shortcut: +___ +$code.=<<___ if ($win64); + lea `-8-4*16`(%rsp),%rsp + movaps %xmm6,-8-4*16(%rax) + movaps %xmm7,-8-3*16(%rax) + movaps %xmm8,-8-2*16(%rax) + movaps %xmm9,-8-1*16(%rax) +.Lprologue_shaext: +___ +$code.=<<___; + movdqu ($ctx),$ABCD + movd 16($ctx),$E + movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap + + movdqu ($inp),@MSG[0] + pshufd \$0b00011011,$ABCD,$ABCD # flip word order + movdqu 0x10($inp),@MSG[1] + pshufd \$0b00011011,$E,$E # flip word order + movdqu 0x20($inp),@MSG[2] + pshufb $BSWAP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + pshufb $BSWAP,@MSG[1] + pshufb $BSWAP,@MSG[2] + movdqa $E,$E_SAVE # offload $E + pshufb $BSWAP,@MSG[3] + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + dec $num + lea 0x40($inp),%r8 # next input block + paddd @MSG[0],$E + cmovne %r8,$inp + movdqa $ABCD,$ABCD_SAVE # offload $ABCD +___ +for($i=0;$i<20-4;$i+=2) { +$code.=<<___; + sha1msg1 @MSG[1],@MSG[0] + movdqa $ABCD,$E_ + sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3... + sha1nexte @MSG[1],$E_ + pxor @MSG[2],@MSG[0] + sha1msg1 @MSG[2],@MSG[1] + sha1msg2 @MSG[3],@MSG[0] + + movdqa $ABCD,$E + sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD + sha1nexte @MSG[2],$E + pxor @MSG[3],@MSG[1] + sha1msg2 @MSG[0],@MSG[1] +___ + push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqu ($inp),@MSG[0] + movdqa $ABCD,$E_ + sha1rnds4 \$3,$E,$ABCD # 64-67 + sha1nexte @MSG[1],$E_ + movdqu 0x10($inp),@MSG[1] + pshufb $BSWAP,@MSG[0] + + movdqa $ABCD,$E + sha1rnds4 \$3,$E_,$ABCD # 68-71 + sha1nexte @MSG[2],$E + movdqu 0x20($inp),@MSG[2] + pshufb $BSWAP,@MSG[1] + + movdqa $ABCD,$E_ + sha1rnds4 \$3,$E,$ABCD # 72-75 + sha1nexte @MSG[3],$E_ + movdqu 0x30($inp),@MSG[3] + pshufb $BSWAP,@MSG[2] + + movdqa $ABCD,$E + sha1rnds4 \$3,$E_,$ABCD # 76-79 + sha1nexte $E_SAVE,$E + pshufb $BSWAP,@MSG[3] + + paddd $ABCD_SAVE,$ABCD + movdqa $E,$E_SAVE # offload $E + + jnz .Loop_shaext + + pshufd \$0b00011011,$ABCD,$ABCD + pshufd \$0b00011011,$E,$E + movdqu $ABCD,($ctx) + movd $E,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -8-4*16(%rax),%xmm6 + movaps -8-3*16(%rax),%xmm7 + movaps -8-2*16(%rax),%xmm8 + movaps -8-1*16(%rax),%xmm9 + mov %rax,%rsp +.Lepilogue_shaext: +___ +$code.=<<___; + ret +.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext +___ +}}} +{{{ +my $Xi=4; +my @X=map("%xmm$_",(4..7,0..3)); +my @Tx=map("%xmm$_",(8..10)); +my $Kx="%xmm11"; +my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization +my @T=("%esi","%edi"); +my $j=0; +my $rx=0; +my $K_XX_XX="%r11"; + +my $_rol=sub { &rol(@_) }; +my $_ror=sub { &ror(@_) }; + +{ my $sn; +sub align32() { + ++$sn; +$code.=<<___; + jmp .Lalign32_$sn # see "Decoded ICache" in manual +.align 32 +.Lalign32_$sn: +___ +} +} + +$code.=<<___; +.type sha1_block_data_order_ssse3,\@function,3 +.align 16 +sha1_block_data_order_ssse3: +_ssse3_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 # redundant, done to share Win64 SE handler + push %r14 + lea `-64-($win64?6*16:0)`(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-40-6*16(%rax) + movaps %xmm7,-40-5*16(%rax) + movaps %xmm8,-40-4*16(%rax) + movaps %xmm9,-40-3*16(%rax) + movaps %xmm10,-40-2*16(%rax) + movaps %xmm11,-40-1*16(%rax) +.Lprologue_ssse3: +___ +$code.=<<___; + mov %rax,%r14 # original %rsp + and 
\$-64,%rsp + mov %rdi,$ctx # reassigned argument + mov %rsi,$inp # reassigned argument + mov %rdx,$num # reassigned argument + + shl \$6,$num + add $inp,$num + lea K_XX_XX+64(%rip),$K_XX_XX + + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + mov $C,@T[1] + xor $D,@T[1] + and @T[1],@T[0] + + movdqa 64($K_XX_XX),@X[2] # pbswap mask + movdqa -64($K_XX_XX),@Tx[1] # K_00_19 + movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + movdqu 16($inp),@X[-3&7] + movdqu 32($inp),@X[-2&7] + movdqu 48($inp),@X[-1&7] + pshufb @X[2],@X[-4&7] # byte swap + pshufb @X[2],@X[-3&7] + pshufb @X[2],@X[-2&7] + add \$64,$inp + paddd @Tx[1],@X[-4&7] # add K_00_19 + pshufb @X[2],@X[-1&7] + paddd @Tx[1],@X[-3&7] + paddd @Tx[1],@X[-2&7] + movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU + psubd @Tx[1],@X[-4&7] # restore X[] + movdqa @X[-3&7],16(%rsp) + psubd @Tx[1],@X[-3&7] + movdqa @X[-2&7],32(%rsp) + psubd @Tx[1],@X[-2&7] + jmp .Loop_ssse3 +___ + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); # ror + &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); + eval(shift(@insns)); + &movdqa (@Tx[0],@X[-1&7]); + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + + &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + &psrldq (@Tx[0],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); # ror + &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); # rol + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (@Tx[2],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + &movdqa (@Tx[0],@X[0]); + eval(shift(@insns)); + + &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword + &paddd (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@Tx[0],31); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + &movdqa (@Tx[1],@Tx[2]); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@Tx[2],30); + eval(shift(@insns)); + eval(shift(@insns)); # ror + &por (@X[0],@Tx[0]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pslld (@Tx[1],2); + &pxor (@X[0],@Tx[2]); + eval(shift(@insns)); + &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 + &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xupdate_ssse3_32_79() +{ use integer; + my $body = shift; + my @insns = 
(&$body,&$body,&$body,&$body); # 32 to 44 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)) if ($Xi==8); + &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + eval(shift(@insns)) if ($Xi==8); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[1] =~ /_ror/); + eval(shift(@insns)) if (@insns[0] =~ /_ror/); + &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)); + if ($Xi%5) { + &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... + } else { # ... or load next one + &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)"); + } + eval(shift(@insns)); # ror + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); + + &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)) if (@insns[0] =~ /_ror/); + + &movdqa (@Tx[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); # ror + eval(shift(@insns)); + eval(shift(@insns)); # body_20_39 + + &pslld (@X[0],2); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld (@Tx[0],30); + eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + + &por (@X[0],@Tx[0]); # "X[0]"<<<=2 + eval(shift(@insns)); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)) if (@insns[1] =~ /_rol/); + eval(shift(@insns)) if (@insns[0] =~ /_rol/); + &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xuplast_ssse3_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &cmp ($inp,$num); + &je (".Ldone_ssse3"); + + unshift(@Tx,pop(@Tx)); + + &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask + &movdqa (@Tx[1],"-64($K_XX_XX)"); # K_00_19 + &movdqu (@X[-4&7],"0($inp)"); # load input + &movdqu (@X[-3&7],"16($inp)"); + &movdqu (@X[-2&7],"32($inp)"); + &movdqu (@X[-1&7],"48($inp)"); + &pshufb (@X[-4&7],@X[2]); # byte swap + &add ($inp,64); + + $Xi=0; +} + +sub Xloop_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufb (@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[($Xi-4)&7],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &psubd (@X[($Xi-4)&7],@Tx[1]); + + foreach (@insns) { eval; } + $Xi++; 
+} + +sub Xtail_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +sub body_00_19 () { # ((c^d)&b)^d + # on start @T[0]=(c^d)&b + return &body_20_39() if ($rx==19); $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. + '&$_ror ($b,$j?7:2)', # $b>>>2 + '&xor (@T[0],$d)', + '&mov (@T[1],$a)', # $b for next round + + '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer + '&xor ($b,$c)', # $c^$d for next round + + '&$_rol ($a,5)', + '&add ($e,@T[0])', + '&and (@T[1],$b)', # ($b&($c^$d)) for next round + + '&xor ($b,$c)', # restore $b + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub body_20_39 () { # b^d^c + # on entry @T[0]=b^d + return &body_40_59() if ($rx==39); $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer + '&xor (@T[0],$d) if($j==19);'. + '&xor (@T[0],$c) if($j> 19)', # ($b^$d^$c) + '&mov (@T[1],$a)', # $b for next round + + '&$_rol ($a,5)', + '&add ($e,@T[0])', + '&xor (@T[1],$c) if ($j< 79)', # $b^$d for next round + + '&$_ror ($b,7)', # $b>>>2 + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub body_40_59 () { # ((b^c)&(c^d))^c + # on entry @T[0]=(b^c), (c^=d) + $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer + '&and (@T[0],$c) if ($j>=40)', # (b^c)&(c^d) + '&xor ($c,$d) if ($j>=40)', # restore $c + + '&$_ror ($b,7)', # $b>>>2 + '&mov (@T[1],$a)', # $b for next round + '&xor (@T[0],$c)', + + '&$_rol ($a,5)', + '&add ($e,@T[0])', + '&xor (@T[1],$c) if ($j==59);'. + '&xor (@T[1],$b) if ($j< 59)', # b^c for next round + + '&xor ($b,$c) if ($j< 59)', # c^d for next round + '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} +$code.=<<___; +.align 16 +.Loop_ssse3: +___ + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + +$code.=<<___; + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $C,@T[1] + mov $D,12($ctx) + xor $D,@T[1] + mov $E,16($ctx) + and @T[1],@T[0] + jmp .Loop_ssse3 + +.align 16 +.Ldone_ssse3: +___ + $j=$saved_j; @V=@saved_V; + + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + +$code.=<<___; + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -40-6*16(%r14),%xmm6 + movaps -40-5*16(%r14),%xmm7 + movaps -40-4*16(%r14),%xmm8 + movaps -40-3*16(%r14),%xmm9 + movaps -40-2*16(%r14),%xmm10 + movaps -40-1*16(%r14),%xmm11 +___ 
+$code.=<<___; + lea (%r14),%rsi + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lepilogue_ssse3: + ret +.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 +___ + +if ($avx) { +$Xi=4; # reset variables +@X=map("%xmm$_",(4..7,0..3)); +@Tx=map("%xmm$_",(8..10)); +$j=0; +$rx=0; + +my $done_avx_label=".Ldone_avx"; + +my $_rol=sub { &shld(@_[0],@_) }; +my $_ror=sub { &shrd(@_[0],@_) }; + +$code.=<<___; +.type sha1_block_data_order_avx,\@function,3 +.align 16 +sha1_block_data_order_avx: +_avx_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 # redundant, done to share Win64 SE handler + push %r14 + lea `-64-($win64?6*16:0)`(%rsp),%rsp + vzeroupper +___ +$code.=<<___ if ($win64); + vmovaps %xmm6,-40-6*16(%rax) + vmovaps %xmm7,-40-5*16(%rax) + vmovaps %xmm8,-40-4*16(%rax) + vmovaps %xmm9,-40-3*16(%rax) + vmovaps %xmm10,-40-2*16(%rax) + vmovaps %xmm11,-40-1*16(%rax) +.Lprologue_avx: +___ +$code.=<<___; + mov %rax,%r14 # original %rsp + and \$-64,%rsp + mov %rdi,$ctx # reassigned argument + mov %rsi,$inp # reassigned argument + mov %rdx,$num # reassigned argument + + shl \$6,$num + add $inp,$num + lea K_XX_XX+64(%rip),$K_XX_XX + + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + mov $C,@T[1] + xor $D,@T[1] + and @T[1],@T[0] + + vmovdqa 64($K_XX_XX),@X[2] # pbswap mask + vmovdqa -64($K_XX_XX),$Kx # K_00_19 + vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + vmovdqu 16($inp),@X[-3&7] + vmovdqu 32($inp),@X[-2&7] + vmovdqu 48($inp),@X[-1&7] + vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap + add \$64,$inp + vpshufb @X[2],@X[-3&7],@X[-3&7] + vpshufb @X[2],@X[-2&7],@X[-2&7] + vpshufb @X[2],@X[-1&7],@X[-1&7] + vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 + vpaddd $Kx,@X[-3&7],@X[1] + vpaddd $Kx,@X[-2&7],@X[2] + vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU + vmovdqa @X[1],16(%rsp) + vmovdqa @X[2],32(%rsp) + jmp .Loop_avx +___ + +sub Xupdate_avx_16_31() # recall that $Xi starts with 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpaddd (@Tx[1],$Kx,@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[0],@X[0],31); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword + &vpaddd (@X[0],@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[1],@Tx[2],30); + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslld (@Tx[2],@Tx[2],2); + &vpxor (@X[0],@X[0],@Tx[1]); 
+ eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xupdate_avx_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions + my ($a,$b,$c,$d,$e); + + &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + &vpaddd (@Tx[1],$Kx,@X[-1&7]); + &vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpsrld (@Tx[0],@X[0],30); + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpslld (@X[0],@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xuplast_avx_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &vpaddd (@Tx[1],$Kx,@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &cmp ($inp,$num); + &je ($done_avx_label); + + &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask + &vmovdqa($Kx,"-64($K_XX_XX)"); # K_00_19 + &vmovdqu(@X[-4&7],"0($inp)"); # load input + &vmovdqu(@X[-3&7],"16($inp)"); + &vmovdqu(@X[-2&7],"32($inp)"); + &vmovdqu(@X[-1&7],"48($inp)"); + &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap + &add ($inp,64); + + $Xi=0; +} + +sub Xloop_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my 
($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +$code.=<<___; +.align 16 +.Loop_avx: +___ + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_32_79(\&body_00_19); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_20_39); + &Xuplast_avx_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + +$code.=<<___; + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $C,@T[1] + mov $D,12($ctx) + xor $D,@T[1] + mov $E,16($ctx) + and @T[1],@T[0] + jmp .Loop_avx + +.align 16 +$done_avx_label: +___ + $j=$saved_j; @V=@saved_V; + + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + +$code.=<<___; + vzeroupper + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -40-6*16(%r14),%xmm6 + movaps -40-5*16(%r14),%xmm7 + movaps -40-4*16(%r14),%xmm8 + movaps -40-3*16(%r14),%xmm9 + movaps -40-2*16(%r14),%xmm10 + movaps -40-1*16(%r14),%xmm11 +___ +$code.=<<___; + lea (%r14),%rsi + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lepilogue_avx: + ret +.size sha1_block_data_order_avx,.-sha1_block_data_order_avx +___ + +if ($avx>1) { +use integer; +$Xi=4; # reset variables +@X=map("%ymm$_",(4..7,0..3)); +@Tx=map("%ymm$_",(8..10)); +$Kx="%ymm11"; +$j=0; + +my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi"); +my ($a5,$t0)=("%r12d","%edi"); + +my ($A,$F,$B,$C,$D,$E)=@ROTX; +my $rx=0; +my $frame="%r13"; + +$code.=<<___; +.type sha1_block_data_order_avx2,\@function,3 +.align 16 +sha1_block_data_order_avx2: +_avx2_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + vzeroupper +___ +$code.=<<___ if ($win64); + lea -6*16(%rsp),%rsp + vmovaps %xmm6,-40-6*16(%rax) + vmovaps %xmm7,-40-5*16(%rax) + vmovaps %xmm8,-40-4*16(%rax) + vmovaps %xmm9,-40-3*16(%rax) + vmovaps %xmm10,-40-2*16(%rax) + vmovaps %xmm11,-40-1*16(%rax) +.Lprologue_avx2: +___ +$code.=<<___; + mov %rax,%r14 # original %rsp + mov %rdi,$ctx # reassigned argument + mov %rsi,$inp # reassigned argument + mov %rdx,$num # reassigned argument + + lea -640(%rsp),%rsp + shl \$6,$num + lea 64($inp),$frame + and \$-128,%rsp + add $inp,$num + lea K_XX_XX+64(%rip),$K_XX_XX + + mov 0($ctx),$A # load context + cmp $num,$frame + cmovae $inp,$frame # next or same block + mov 4($ctx),$F + mov 8($ctx),$C + mov 12($ctx),$D + mov 16($ctx),$E + vmovdqu 64($K_XX_XX),@X[2] # pbswap mask + + vmovdqu ($inp),%xmm0 + vmovdqu 16($inp),%xmm1 + vmovdqu 32($inp),%xmm2 + vmovdqu 48($inp),%xmm3 + lea 64($inp),$inp + vinserti128 \$1,($frame),@X[-4&7],@X[-4&7] + vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7] + vpshufb @X[2],@X[-4&7],@X[-4&7] + vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7] + vpshufb 
@X[2],@X[-3&7],@X[-3&7] + vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7] + vpshufb @X[2],@X[-2&7],@X[-2&7] + vmovdqu -64($K_XX_XX),$Kx # K_00_19 + vpshufb @X[2],@X[-1&7],@X[-1&7] + + vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 + vpaddd $Kx,@X[-3&7],@X[1] + vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU + vpaddd $Kx,@X[-2&7],@X[2] + vmovdqu @X[1],32(%rsp) + vpaddd $Kx,@X[-1&7],@X[3] + vmovdqu @X[2],64(%rsp) + vmovdqu @X[3],96(%rsp) +___ +for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31 + use integer; + + &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" + &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + &vpsrld (@Tx[0],@X[0],31); + &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX + &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword + &vpaddd (@X[0],@X[0],@X[0]); + &vpsrld (@Tx[1],@Tx[2],30); + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 + &vpslld (@Tx[2],@Tx[2],2); + &vpxor (@X[0],@X[0],@Tx[1]); + &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 + &vpaddd (@Tx[1],@X[0],$Kx); + &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU + + push(@X,shift(@X)); # "rotate" X[] +} +$code.=<<___; + lea 128(%rsp),$frame + jmp .Loop_avx2 +.align 32 +.Loop_avx2: + rorx \$2,$F,$B + andn $D,$F,$t0 + and $C,$F + xor $t0,$F +___ +sub bodyx_00_19 () { # 8 instructions, 3 cycles critical path + # at start $f=(b&c)^(~b&d), $b>>>=2 + return &bodyx_20_39() if ($rx==19); $rx++; + ( + '($a,$f,$b,$c,$d,$e)=@ROTX;'. + + '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K + '&lea ($frame,"256($frame)") if ($j%32==31);', + '&andn ($t0,$a,$c)', # ~b&d for next round + + '&add ($e,$f)', # e+=(b&c)^(~b&d) + '&rorx ($a5,$a,27)', # a<<<5 + '&rorx ($f,$a,2)', # b>>>2 for next round + '&and ($a,$b)', # b&c for next round + + '&add ($e,$a5)', # e+=a<<<5 + '&xor ($a,$t0);'. # f=(b&c)^(~b&d) for next round + + 'unshift(@ROTX,pop(@ROTX)); $j++;' + ) +} + +sub bodyx_20_39 () { # 7 instructions, 2 cycles critical path + # on entry $f=b^c^d, $b>>>=2 + return &bodyx_40_59() if ($rx==39); $rx++; + ( + '($a,$f,$b,$c,$d,$e)=@ROTX;'. + + '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K + '&lea ($frame,"256($frame)") if ($j%32==31);', + + '&lea ($e,"($e,$f)")', # e+=b^c^d + '&rorx ($a5,$a,27)', # a<<<5 + '&rorx ($f,$a,2) if ($j<79)', # b>>>2 in next round + '&xor ($a,$b) if ($j<79)', # b^c for next round + + '&add ($e,$a5)', # e+=a<<<5 + '&xor ($a,$c) if ($j<79);'. # f=b^c^d for next round + + 'unshift(@ROTX,pop(@ROTX)); $j++;' + ) +} + +sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path + # on entry $f=((b^c)&(c^d)), $b>>>=2 + $rx++; + ( + '($a,$f,$b,$c,$d,$e)=@ROTX;'. + + '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K + '&lea ($frame,"256($frame)") if ($j%32==31);', + '&xor ($f,$c) if ($j>39)', # (b^c)&(c^d)^c + '&mov ($t0,$b) if ($j<59)', # count on zero latency + '&xor ($t0,$c) if ($j<59)', # c^d for next round + + '&lea ($e,"($e,$f)")', # e+=(b^c)&(c^d)^c + '&rorx ($a5,$a,27)', # a<<<5 + '&rorx ($f,$a,2)', # b>>>2 in next round + '&xor ($a,$b)', # b^c for next round + + '&add ($e,$a5)', # e+=a<<<5 + '&and ($a,$t0) if ($j< 59);'. # f=(b^c)&(c^d) for next round + '&xor ($a,$c) if ($j==59);'. 
# f=b^c^d for next round + + 'unshift(@ROTX,pop(@ROTX)); $j++;' + ) +} + +sub Xupdate_avx2_16_31() # recall that $Xi starts with 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions + my ($a,$b,$c,$d,$e); + + &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[0],@X[0],31); + &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword + &vpaddd (@X[0],@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[1],@Tx[2],30); + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslld (@Tx[2],@Tx[2],2); + &vpxor (@X[0],@X[0],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpaddd (@Tx[1],@X[0],$Kx); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xupdate_avx2_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 to 50 instructions + my ($a,$b,$c,$d,$e); + + &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + &vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[0],@X[0],30); + &vpslld (@X[0],@X[0],2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + #&vpslld (@X[0],@X[0],2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpaddd (@Tx[1],@X[0],$Kx); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xloop_avx2() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + + &align32(); + &Xupdate_avx2_32_79(\&bodyx_00_19); + &Xupdate_avx2_32_79(\&bodyx_00_19); + &Xupdate_avx2_32_79(\&bodyx_00_19); + &Xupdate_avx2_32_79(\&bodyx_00_19); + + &Xupdate_avx2_32_79(\&bodyx_20_39); + &Xupdate_avx2_32_79(\&bodyx_20_39); + 
&Xupdate_avx2_32_79(\&bodyx_20_39); + &Xupdate_avx2_32_79(\&bodyx_20_39); + + &align32(); + &Xupdate_avx2_32_79(\&bodyx_40_59); + &Xupdate_avx2_32_79(\&bodyx_40_59); + &Xupdate_avx2_32_79(\&bodyx_40_59); + &Xupdate_avx2_32_79(\&bodyx_40_59); + + &Xloop_avx2(\&bodyx_20_39); + &Xloop_avx2(\&bodyx_20_39); + &Xloop_avx2(\&bodyx_20_39); + &Xloop_avx2(\&bodyx_20_39); + +$code.=<<___; + lea 128($inp),$frame + lea 128($inp),%rdi # borrow $t0 + cmp $num,$frame + cmovae $inp,$frame # next or previous block + + # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c + add 0($ctx),@ROTX[0] # update context + add 4($ctx),@ROTX[1] + add 8($ctx),@ROTX[3] + mov @ROTX[0],0($ctx) + add 12($ctx),@ROTX[4] + mov @ROTX[1],4($ctx) + mov @ROTX[0],$A # A=d + add 16($ctx),@ROTX[5] + mov @ROTX[3],$a5 + mov @ROTX[3],8($ctx) + mov @ROTX[4],$D # D=b + #xchg @ROTX[5],$F # F=c, C=f + mov @ROTX[4],12($ctx) + mov @ROTX[1],$F # F=e + mov @ROTX[5],16($ctx) + #mov $F,16($ctx) + mov @ROTX[5],$E # E=c + mov $a5,$C # C=f + #xchg $F,$E # E=c, F=e + + cmp $num,$inp + je .Ldone_avx2 +___ + +$Xi=4; # reset variables +@X=map("%ymm$_",(4..7,0..3)); + +$code.=<<___; + vmovdqu 64($K_XX_XX),@X[2] # pbswap mask + cmp $num,%rdi # borrowed $t0 + ja .Last_avx2 + + vmovdqu -64(%rdi),%xmm0 # low part of @X[-4&7] + vmovdqu -48(%rdi),%xmm1 + vmovdqu -32(%rdi),%xmm2 + vmovdqu -16(%rdi),%xmm3 + vinserti128 \$1,0($frame),@X[-4&7],@X[-4&7] + vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7] + vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7] + vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7] + jmp .Last_avx2 + +.align 32 +.Last_avx2: + lea 128+16(%rsp),$frame + rorx \$2,$F,$B + andn $D,$F,$t0 + and $C,$F + xor $t0,$F + sub \$-128,$inp +___ + $rx=$j=0; @ROTX=($A,$F,$B,$C,$D,$E); + + &Xloop_avx2 (\&bodyx_00_19); + &Xloop_avx2 (\&bodyx_00_19); + &Xloop_avx2 (\&bodyx_00_19); + &Xloop_avx2 (\&bodyx_00_19); + + &Xloop_avx2 (\&bodyx_20_39); + &vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19 + &vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap + &Xloop_avx2 (\&bodyx_20_39); + &vpshufb (@X[-3&7],@X[-3&7],@X[2]); + &vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19 + &Xloop_avx2 (\&bodyx_20_39); + &vmovdqu ("0(%rsp)",@Tx[0]); + &vpshufb (@X[-2&7],@X[-2&7],@X[2]); + &vpaddd (@Tx[1],@X[-3&7],$Kx); + &Xloop_avx2 (\&bodyx_20_39); + &vmovdqu ("32(%rsp)",@Tx[1]); + &vpshufb (@X[-1&7],@X[-1&7],@X[2]); + &vpaddd (@X[2],@X[-2&7],$Kx); + + &Xloop_avx2 (\&bodyx_40_59); + &align32 (); + &vmovdqu ("64(%rsp)",@X[2]); + &vpaddd (@X[3],@X[-1&7],$Kx); + &Xloop_avx2 (\&bodyx_40_59); + &vmovdqu ("96(%rsp)",@X[3]); + &Xloop_avx2 (\&bodyx_40_59); + &Xupdate_avx2_16_31(\&bodyx_40_59); + + &Xupdate_avx2_16_31(\&bodyx_20_39); + &Xupdate_avx2_16_31(\&bodyx_20_39); + &Xupdate_avx2_16_31(\&bodyx_20_39); + &Xloop_avx2 (\&bodyx_20_39); + +$code.=<<___; + lea 128(%rsp),$frame + + # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c + add 0($ctx),@ROTX[0] # update context + add 4($ctx),@ROTX[1] + add 8($ctx),@ROTX[3] + mov @ROTX[0],0($ctx) + add 12($ctx),@ROTX[4] + mov @ROTX[1],4($ctx) + mov @ROTX[0],$A # A=d + add 16($ctx),@ROTX[5] + mov @ROTX[3],$a5 + mov @ROTX[3],8($ctx) + mov @ROTX[4],$D # D=b + #xchg @ROTX[5],$F # F=c, C=f + mov @ROTX[4],12($ctx) + mov @ROTX[1],$F # F=e + mov @ROTX[5],16($ctx) + #mov $F,16($ctx) + mov @ROTX[5],$E # E=c + mov $a5,$C # C=f + #xchg $F,$E # E=c, F=e + + cmp $num,$inp + jbe .Loop_avx2 + +.Ldone_avx2: + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -40-6*16(%r14),%xmm6 + movaps -40-5*16(%r14),%xmm7 + movaps -40-4*16(%r14),%xmm8 + movaps -40-3*16(%r14),%xmm9 + movaps -40-2*16(%r14),%xmm10 + 
movaps -40-1*16(%r14),%xmm11 +___ +$code.=<<___; + lea (%r14),%rsi + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lepilogue_avx2: + ret +.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 +___ +} +} +$code.=<<___; +.align 64 +K_XX_XX: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 +___ +}}} +$code.=<<___; +.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by " +.align 64 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lprologue(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + lea .Lepilogue(%rip),%r10 + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + mov `16*4`(%rax),%rax # pull saved stack pointer + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + + jmp .Lcommon_seh_tail +.size se_handler,.-se_handler +___ + +$code.=<<___ if ($shaext); +.type shaext_handler,\@abi-omnipotent +.align 16 +shaext_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lprologue_shaext(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + lea .Lepilogue_shaext(%rip),%r10 + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea -8-4*16(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$8,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size shaext_handler,.-shaext_handler +___ + +$code.=<<___; +.type ssse3_handler,\@abi-omnipotent +.align 16 +ssse3_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # 
HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 232($context),%rax # pull context->R14 + + lea -40-6*16(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$12,%ecx + .long 0xa548f3fc # cld; rep movsq + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore cotnext->R12 + mov %r13,224($context) # restore cotnext->R13 + mov %r14,232($context) # restore cotnext->R14 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size ssse3_handler,.-ssse3_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_sha1_block_data_order + .rva .LSEH_end_sha1_block_data_order + .rva .LSEH_info_sha1_block_data_order +___ +$code.=<<___ if ($shaext); + .rva .LSEH_begin_sha1_block_data_order_shaext + .rva .LSEH_end_sha1_block_data_order_shaext + .rva .LSEH_info_sha1_block_data_order_shaext +___ +$code.=<<___; + .rva .LSEH_begin_sha1_block_data_order_ssse3 + .rva .LSEH_end_sha1_block_data_order_ssse3 + .rva .LSEH_info_sha1_block_data_order_ssse3 +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_sha1_block_data_order_avx + .rva .LSEH_end_sha1_block_data_order_avx + .rva .LSEH_info_sha1_block_data_order_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_sha1_block_data_order_avx2 + .rva .LSEH_end_sha1_block_data_order_avx2 + .rva .LSEH_info_sha1_block_data_order_avx2 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_sha1_block_data_order: + .byte 9,0,0,0 + .rva se_handler +___ +$code.=<<___ if ($shaext); +.LSEH_info_sha1_block_data_order_shaext: + .byte 9,0,0,0 + .rva shaext_handler +___ +$code.=<<___; +.LSEH_info_sha1_block_data_order_ssse3: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_sha1_block_data_order_avx: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] +___ +$code.=<<___ if ($avx>1); +.LSEH_info_sha1_block_data_order_avx2: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] +___ +} + +#################################################################### + +sub sha1rnds4 { + if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) { + my @opcode=(0x0f,0x3a,0xcc); + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } else { + return "sha1rnds4\t".@_[0]; + 
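+	# sha1rnds4() and sha1op38() below hand-assemble the SHA extension
+	# instructions as raw .byte sequences (opcode 0f 3a cc, or 0f 38 xx,
+	# plus a register-form ModR/M byte and, for sha1rnds4, the
+	# immediate), presumably so that the module still builds with
+	# assemblers that do not recognize these mnemonics; the foreach loop
+	# at the bottom of the file routes every sha1* mnemonic in $code
+	# through these encoders.  For example "sha1rnds4 \$3,%xmm4,%xmm0"
+	# is replaced by a .byte directive encoding 0f 3a cc c4 03.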
} +} + +sub sha1op38 { + my $instr = shift; + my %opcodelet = ( + "sha1nexte" => 0xc8, + "sha1msg1" => 0xc9, + "sha1msg2" => 0xca ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x0f,0x38); + my $rex=0; + $rex|=0x04 if ($2>=8); + $rex|=0x01 if ($1>=8); + unshift @opcode,0x40|$rex if ($rex); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or + s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo; + + print $_,"\n"; +} +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha256-586.pl b/openssl-1.1.0h/crypto/sha/asm/sha256-586.pl new file mode 100644 index 0000000..6af1d84 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha256-586.pl @@ -0,0 +1,1293 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256 block transform for x86. September 2007. +# +# Performance improvement over compiler generated code varies from +# 10% to 40% [see below]. Not very impressive on some µ-archs, but +# it's 5 times smaller and optimizies amount of writes. +# +# May 2012. +# +# Optimization including two of Pavel Semjanov's ideas, alternative +# Maj and full unroll, resulted in ~20-25% improvement on most CPUs, +# ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost +# 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not +# on P4, where it kills performance, nor Sandy Bridge, where folded +# loop is approximately as fast... +# +# June 2012. +# +# Add AMD XOP-specific code path, >30% improvement on Bulldozer over +# May version, >60% over original. Add AVX+shrd code path, >25% +# improvement on Sandy Bridge over May version, 60% over original. +# +# May 2013. +# +# Replace AMD XOP code path with SSSE3 to cover more processors. +# (Biggest improvement coefficient is on upcoming Atom Silvermont, +# not shown.) Add AVX+BMI code path. +# +# March 2014. +# +# Add support for Intel SHA Extensions. 
+# +# Performance in clock cycles per processed byte (less is better): +# +# gcc icc x86 asm(*) SIMD x86_64 asm(**) +# Pentium 46 57 40/38 - - +# PIII 36 33 27/24 - - +# P4 41 38 28 - 17.3 +# AMD K8 27 25 19/15.5 - 14.9 +# Core2 26 23 18/15.6 14.3 13.8 +# Westmere 27 - 19/15.7 13.4 12.3 +# Sandy Bridge 25 - 15.9 12.4 11.6 +# Ivy Bridge 24 - 15.0 11.4 10.3 +# Haswell 22 - 13.9 9.46 7.80 +# Bulldozer 36 - 27/22 17.0 13.6 +# VIA Nano 36 - 25/22 16.8 16.5 +# Atom 50 - 30/25 21.9 18.9 +# Silvermont 40 - 34/31 22.9 20.6 +# +# (*) numbers after slash are for unrolled loop, where applicable; +# (**) x86_64 assembly performance is presented for reference +# purposes, results are best-available; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386"); + +$xmm=$avx=0; +for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } + +if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if ($xmm && !$avx && $ARGV[0] eq "win32n" && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.03) + ($1>=2.10); +} + +if ($xmm && !$avx && $ARGV[0] eq "win32" && + `ml 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +$shaext=$xmm; ### set to zero if compiling for 1.0.1 + +$unroll_after = 64*4; # If pre-evicted from L1P cache first spin of + # fully unrolled loop was measured to run about + # 3-4x slower. If slowdown coefficient is N and + # unrolled loop is m times faster, then you break + # even at (N-1)/(m-1) blocks. Then it needs to be + # adjusted for probability of code being evicted, + # code size/cache size=1/4. Typical m is 1.15... 
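+			# Worked through with the numbers above as a rough
+			# sanity check: N~3.5 and m~1.15 give
+			# (N-1)/(m-1) ~ 2.5/0.15 ~ 17 blocks; scaled by the
+			# ~1/4 eviction probability that is ~4 blocks, i.e.
+			# 4*64 input bytes, which is where the 64*4
+			# threshold above comes from.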
+ +$A="eax"; +$E="edx"; +$T="ebx"; +$Aoff=&DWP(4,"esp"); +$Boff=&DWP(8,"esp"); +$Coff=&DWP(12,"esp"); +$Doff=&DWP(16,"esp"); +$Eoff=&DWP(20,"esp"); +$Foff=&DWP(24,"esp"); +$Goff=&DWP(28,"esp"); +$Hoff=&DWP(32,"esp"); +$Xoff=&DWP(36,"esp"); +$K256="ebp"; + +sub BODY_16_63() { + &mov ($T,"ecx"); # "ecx" is preloaded + &mov ("esi",&DWP(4*(9+15+16-14),"esp")); + &ror ("ecx",18-7); + &mov ("edi","esi"); + &ror ("esi",19-17); + &xor ("ecx",$T); + &shr ($T,3); + &ror ("ecx",7); + &xor ("esi","edi"); + &xor ($T,"ecx"); # T = sigma0(X[-15]) + &ror ("esi",17); + &add ($T,&DWP(4*(9+15+16),"esp")); # T += X[-16] + &shr ("edi",10); + &add ($T,&DWP(4*(9+15+16-9),"esp")); # T += X[-7] + #&xor ("edi","esi") # sigma1(X[-2]) + # &add ($T,"edi"); # T += sigma1(X[-2]) + # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] + + &BODY_00_15(1); +} +sub BODY_00_15() { + my $in_16_63=shift; + + &mov ("ecx",$E); + &xor ("edi","esi") if ($in_16_63); # sigma1(X[-2]) + &mov ("esi",$Foff); + &ror ("ecx",25-11); + &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) + &mov ("edi",$Goff); + &xor ("ecx",$E); + &xor ("esi","edi"); + &mov ($T,&DWP(4*(9+15),"esp")) if (!$in_16_63); + &mov (&DWP(4*(9+15),"esp"),$T) if ($in_16_63); # save X[0] + &ror ("ecx",11-6); + &and ("esi",$E); + &mov ($Eoff,$E); # modulo-scheduled + &xor ($E,"ecx"); + &add ($T,$Hoff); # T += h + &xor ("esi","edi"); # Ch(e,f,g) + &ror ($E,6); # Sigma1(e) + &mov ("ecx",$A); + &add ($T,"esi"); # T += Ch(e,f,g) + + &ror ("ecx",22-13); + &add ($T,$E); # T += Sigma1(e) + &mov ("edi",$Boff); + &xor ("ecx",$A); + &mov ($Aoff,$A); # modulo-scheduled + &lea ("esp",&DWP(-4,"esp")); + &ror ("ecx",13-2); + &mov ("esi",&DWP(0,$K256)); + &xor ("ecx",$A); + &mov ($E,$Eoff); # e in next iteration, d in this one + &xor ($A,"edi"); # a ^= b + &ror ("ecx",2); # Sigma0(a) + + &add ($T,"esi"); # T+= K[i] + &mov (&DWP(0,"esp"),$A); # (b^c) in next round + &add ($E,$T); # d += T + &and ($A,&DWP(4,"esp")); # a &= (b^c) + &add ($T,"ecx"); # T += Sigma0(a) + &xor ($A,"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) + &mov ("ecx",&DWP(4*(9+15+16-1),"esp")) if ($in_16_63); # preload T + &add ($K256,4); + &add ($A,$T); # h += T +} + +&external_label("OPENSSL_ia32cap_P") if (!$i386); + +&function_begin("sha256_block_data_order"); + &mov ("esi",wparam(0)); # ctx + &mov ("edi",wparam(1)); # inp + &mov ("eax",wparam(2)); # num + &mov ("ebx","esp"); # saved sp + + &call (&label("pic_point")); # make it PIC! 
+&set_label("pic_point"); + &blindpop($K256); + &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256)); + + &sub ("esp",16); + &and ("esp",-64); + + &shl ("eax",6); + &add ("eax","edi"); + &mov (&DWP(0,"esp"),"esi"); # ctx + &mov (&DWP(4,"esp"),"edi"); # inp + &mov (&DWP(8,"esp"),"eax"); # inp+num*128 + &mov (&DWP(12,"esp"),"ebx"); # saved sp + if (!$i386 && $xmm) { + &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256")); + &mov ("ecx",&DWP(0,"edx")); + &mov ("ebx",&DWP(4,"edx")); + &test ("ecx",1<<20); # check for P4 + &jnz (&label("loop")); + &mov ("edx",&DWP(8,"edx")) if ($xmm); + &test ("ecx",1<<24); # check for FXSR + &jz ($unroll_after?&label("no_xmm"):&label("loop")); + &and ("ecx",1<<30); # mask "Intel CPU" bit + &and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits + &test ("edx",1<<29) if ($shaext); # check for SHA + &jnz (&label("shaext")) if ($shaext); + &or ("ecx","ebx"); + &and ("ecx",1<<28|1<<30); + &cmp ("ecx",1<<28|1<<30); + if ($xmm) { + &je (&label("AVX")) if ($avx); + &test ("ebx",1<<9); # check for SSSE3 + &jnz (&label("SSSE3")); + } else { + &je (&label("loop_shrd")); + } + if ($unroll_after) { +&set_label("no_xmm"); + &sub ("eax","edi"); + &cmp ("eax",$unroll_after); + &jae (&label("unrolled")); + } } + &jmp (&label("loop")); + +sub COMPACT_LOOP() { +my $suffix=shift; + +&set_label("loop$suffix",$suffix?32:16); + # copy input block to stack reversing byte and dword order + for($i=0;$i<4;$i++) { + &mov ("eax",&DWP($i*16+0,"edi")); + &mov ("ebx",&DWP($i*16+4,"edi")); + &mov ("ecx",&DWP($i*16+8,"edi")); + &bswap ("eax"); + &mov ("edx",&DWP($i*16+12,"edi")); + &bswap ("ebx"); + &push ("eax"); + &bswap ("ecx"); + &push ("ebx"); + &bswap ("edx"); + &push ("ecx"); + &push ("edx"); + } + &add ("edi",64); + &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H + &mov (&DWP(4*(9+16)+4,"esp"),"edi"); + + # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack + &mov ($A,&DWP(0,"esi")); + &mov ("ebx",&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &mov ("edi",&DWP(12,"esi")); + # &mov ($Aoff,$A); + &mov ($Boff,"ebx"); + &xor ("ebx","ecx"); + &mov ($Coff,"ecx"); + &mov ($Doff,"edi"); + &mov (&DWP(0,"esp"),"ebx"); # magic + &mov ($E,&DWP(16,"esi")); + &mov ("ebx",&DWP(20,"esi")); + &mov ("ecx",&DWP(24,"esi")); + &mov ("edi",&DWP(28,"esi")); + # &mov ($Eoff,$E); + &mov ($Foff,"ebx"); + &mov ($Goff,"ecx"); + &mov ($Hoff,"edi"); + +&set_label("00_15$suffix",16); + + &BODY_00_15(); + + &cmp ("esi",0xc19bf174); + &jne (&label("00_15$suffix")); + + &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1) + &jmp (&label("16_63$suffix")); + +&set_label("16_63$suffix",16); + + &BODY_16_63(); + + &cmp ("esi",0xc67178f2); + &jne (&label("16_63$suffix")); + + &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx + # &mov ($A,$Aoff); + &mov ("ebx",$Boff); + # &mov ("edi",$Coff); + &mov ("ecx",$Doff); + &add ($A,&DWP(0,"esi")); + &add ("ebx",&DWP(4,"esi")); + &add ("edi",&DWP(8,"esi")); + &add ("ecx",&DWP(12,"esi")); + &mov (&DWP(0,"esi"),$A); + &mov (&DWP(4,"esi"),"ebx"); + &mov (&DWP(8,"esi"),"edi"); + &mov (&DWP(12,"esi"),"ecx"); + # &mov ($E,$Eoff); + &mov ("eax",$Foff); + &mov ("ebx",$Goff); + &mov ("ecx",$Hoff); + &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp + &add ($E,&DWP(16,"esi")); + &add ("eax",&DWP(20,"esi")); + &add ("ebx",&DWP(24,"esi")); + &add ("ecx",&DWP(28,"esi")); + &mov (&DWP(16,"esi"),$E); + &mov (&DWP(20,"esi"),"eax"); + &mov (&DWP(24,"esi"),"ebx"); + &mov (&DWP(28,"esi"),"ecx"); + + &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame + &sub ($K256,4*64); # rewind K 
+ + &cmp ("edi",&DWP(8,"esp")); # are we done yet? + &jb (&label("loop$suffix")); +} + &COMPACT_LOOP(); + &mov ("esp",&DWP(12,"esp")); # restore sp +&function_end_A(); + if (!$i386 && !$xmm) { + # ~20% improvement on Sandy Bridge + local *ror = sub { &shrd(@_[0],@_) }; + &COMPACT_LOOP("_shrd"); + &mov ("esp",&DWP(12,"esp")); # restore sp +&function_end_A(); + } + +&set_label("K256",64); # Yes! I keep it in the code segment! +@K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, + 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, + 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, + 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, + 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, + 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, + 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, + 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, + 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, + 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, + 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, + 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, + 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, + 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, + 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, + 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); +&data_word(@K256); +&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask +&asciz("SHA256 block transform for x86, CRYPTOGAMS by "); + +($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets +sub off { &DWP(4*(((shift)-$i)&7),"esp"); } + +if (!$i386 && $unroll_after) { +my @AH=($A,$K256); + +&set_label("unrolled",16); + &lea ("esp",&DWP(-96,"esp")); + # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack + &mov ($AH[0],&DWP(0,"esi")); + &mov ($AH[1],&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &mov ("ebx",&DWP(12,"esi")); + #&mov (&DWP(0,"esp"),$AH[0]); + &mov (&DWP(4,"esp"),$AH[1]); + &xor ($AH[1],"ecx"); # magic + &mov (&DWP(8,"esp"),"ecx"); + &mov (&DWP(12,"esp"),"ebx"); + &mov ($E,&DWP(16,"esi")); + &mov ("ebx",&DWP(20,"esi")); + &mov ("ecx",&DWP(24,"esi")); + &mov ("esi",&DWP(28,"esi")); + #&mov (&DWP(16,"esp"),$E); + &mov (&DWP(20,"esp"),"ebx"); + &mov (&DWP(24,"esp"),"ecx"); + &mov (&DWP(28,"esp"),"esi"); + &jmp (&label("grand_loop")); + +&set_label("grand_loop",16); + # copy input block to stack reversing byte order + for($i=0;$i<5;$i++) { + &mov ("ebx",&DWP(12*$i+0,"edi")); + &mov ("ecx",&DWP(12*$i+4,"edi")); + &bswap ("ebx"); + &mov ("esi",&DWP(12*$i+8,"edi")); + &bswap ("ecx"); + &mov (&DWP(32+12*$i+0,"esp"),"ebx"); + &bswap ("esi"); + &mov (&DWP(32+12*$i+4,"esp"),"ecx"); + &mov (&DWP(32+12*$i+8,"esp"),"esi"); + } + &mov ("ebx",&DWP($i*12,"edi")); + &add ("edi",64); + &bswap ("ebx"); + &mov (&DWP(96+4,"esp"),"edi"); + &mov (&DWP(32+12*$i,"esp"),"ebx"); + + my ($t1,$t2) = ("ecx","esi"); + + for ($i=0;$i<64;$i++) { + + if ($i>=16) { + &mov ($T,$t1); # $t1 is preloaded + # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp")); + &ror ($t1,18-7); + &mov ("edi",$t2); + &ror ($t2,19-17); + &xor ($t1,$T); + &shr ($T,3); + &ror ($t1,7); + &xor ($t2,"edi"); + &xor ($T,$t1); # T = sigma0(X[-15]) + &ror ($t2,17); + &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16] + &shr ("edi",10); + &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7] + #&xor ("edi",$t2) # sigma1(X[-2]) + # &add ($T,"edi"); # T += sigma1(X[-2]) + # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] + } + &mov ($t1,$E); + &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2]) + &mov ($t2,&off($f)); + &ror ($E,25-11); + &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2]) + &mov ("edi",&off($g)); + &xor ($E,$t1); + &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i] + &mov 
(&DWP(32+4*($i&15),"esp"),$T) if ($i>=16 && $i<62); # save X[0] + &xor ($t2,"edi"); + &ror ($E,11-6); + &and ($t2,$t1); + &mov (&off($e),$t1); # save $E, modulo-scheduled + &xor ($E,$t1); + &add ($T,&off($h)); # T += h + &xor ("edi",$t2); # Ch(e,f,g) + &ror ($E,6); # Sigma1(e) + &mov ($t1,$AH[0]); + &add ($T,"edi"); # T += Ch(e,f,g) + + &ror ($t1,22-13); + &mov ($t2,$AH[0]); + &mov ("edi",&off($b)); + &xor ($t1,$AH[0]); + &mov (&off($a),$AH[0]); # save $A, modulo-scheduled + &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round + &ror ($t1,13-2); + &and ($AH[1],$AH[0]); # (b^c) &= (a^b) + &lea ($E,&DWP(@K256[$i],$T,$E)); # T += Sigma1(1)+K[i] + &xor ($t1,$t2); + &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) + &mov ($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63); + &ror ($t1,2); # Sigma0(a) + + &add ($AH[1],$E); # h += T + &add ($E,&off($d)); # d += T + &add ($AH[1],$t1); # h += Sigma0(a) + &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63); + + @AH = reverse(@AH); # rotate(a,h) + ($t1,$t2) = ($t2,$t1); # rotate(t1,t2) + } + &mov ("esi",&DWP(96,"esp")); #ctx + #&mov ($AH[0],&DWP(0,"esp")); + &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); + #&mov ("edi", &DWP(8,"esp")); + &mov ("ecx",&DWP(12,"esp")); + &add ($AH[0],&DWP(0,"esi")); + &add ($AH[1],&DWP(4,"esi")); + &add ("edi",&DWP(8,"esi")); + &add ("ecx",&DWP(12,"esi")); + &mov (&DWP(0,"esi"),$AH[0]); + &mov (&DWP(4,"esi"),$AH[1]); + &mov (&DWP(8,"esi"),"edi"); + &mov (&DWP(12,"esi"),"ecx"); + #&mov (&DWP(0,"esp"),$AH[0]); + &mov (&DWP(4,"esp"),$AH[1]); + &xor ($AH[1],"edi"); # magic + &mov (&DWP(8,"esp"),"edi"); + &mov (&DWP(12,"esp"),"ecx"); + #&mov ($E,&DWP(16,"esp")); + &mov ("edi",&DWP(20,"esp")); + &mov ("ebx",&DWP(24,"esp")); + &mov ("ecx",&DWP(28,"esp")); + &add ($E,&DWP(16,"esi")); + &add ("edi",&DWP(20,"esi")); + &add ("ebx",&DWP(24,"esi")); + &add ("ecx",&DWP(28,"esi")); + &mov (&DWP(16,"esi"),$E); + &mov (&DWP(20,"esi"),"edi"); + &mov (&DWP(24,"esi"),"ebx"); + &mov (&DWP(28,"esi"),"ecx"); + #&mov (&DWP(16,"esp"),$E); + &mov (&DWP(20,"esp"),"edi"); + &mov ("edi",&DWP(96+4,"esp")); # inp + &mov (&DWP(24,"esp"),"ebx"); + &mov (&DWP(28,"esp"),"ecx"); + + &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? + &jb (&label("grand_loop")); + + &mov ("esp",&DWP(96+12,"esp")); # restore sp +&function_end_A(); +} + if (!$i386 && $xmm) {{{ +if ($shaext) { +###################################################################### +# Intel SHA Extensions implementation of SHA256 update function. 
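+# A brief sketch of the data flow below: the eight state words are kept
+# as two halves, $ABEF and $CDGH, obtained from the little-endian
+# DCBA/HGFE layout of the context by the pshufd/palignr/punpcklqdq
+# shuffle.  Each sha256rnds2 performs two rounds, taking its two
+# message-plus-constant words from the low quadword of $Wi (xmm0, the
+# instruction's implicit operand), so every 16-byte K256 entry is
+# consumed by a pair of sha256rnds2 calls with a pshufd by 0x0e in
+# between to move the upper quadword down.  sha256msg1 and sha256msg2
+# carry out the sigma0 and sigma1 parts of the message-schedule update.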
+# +my ($ctx,$inp,$end)=("esi","edi","eax"); +my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7)); +my @MSG=map("xmm$_",(3..6)); + +sub sha256op38 { + my ($opcodelet,$dst,$src)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); } +} +sub sha256rnds2 { sha256op38(0xcb,@_); } +sub sha256msg1 { sha256op38(0xcc,@_); } +sub sha256msg2 { sha256op38(0xcd,@_); } + +&set_label("shaext",32); + &sub ("esp",32); + + &movdqu ($ABEF,&QWP(0,$ctx)); # DCBA + &lea ($K256,&DWP(0x80,$K256)); + &movdqu ($CDGH,&QWP(16,$ctx)); # HGFE + &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask + + &pshufd ($Wi,$ABEF,0x1b); # ABCD + &pshufd ($ABEF,$ABEF,0xb1); # CDAB + &pshufd ($CDGH,$CDGH,0x1b); # EFGH + &palignr ($ABEF,$CDGH,8); # ABEF + &punpcklqdq ($CDGH,$Wi); # CDGH + &jmp (&label("loop_shaext")); + +&set_label("loop_shaext",16); + &movdqu (@MSG[0],&QWP(0,$inp)); + &movdqu (@MSG[1],&QWP(0x10,$inp)); + &movdqu (@MSG[2],&QWP(0x20,$inp)); + &pshufb (@MSG[0],$TMP); + &movdqu (@MSG[3],&QWP(0x30,$inp)); + &movdqa (&QWP(16,"esp"),$CDGH); # offload + + &movdqa ($Wi,&QWP(0*16-0x80,$K256)); + &paddd ($Wi,@MSG[0]); + &pshufb (@MSG[1],$TMP); + &sha256rnds2 ($CDGH,$ABEF); # 0-3 + &pshufd ($Wi,$Wi,0x0e); + &nop (); + &movdqa (&QWP(0,"esp"),$ABEF); # offload + &sha256rnds2 ($ABEF,$CDGH); + + &movdqa ($Wi,&QWP(1*16-0x80,$K256)); + &paddd ($Wi,@MSG[1]); + &pshufb (@MSG[2],$TMP); + &sha256rnds2 ($CDGH,$ABEF); # 4-7 + &pshufd ($Wi,$Wi,0x0e); + &lea ($inp,&DWP(0x40,$inp)); + &sha256msg1 (@MSG[0],@MSG[1]); + &sha256rnds2 ($ABEF,$CDGH); + + &movdqa ($Wi,&QWP(2*16-0x80,$K256)); + &paddd ($Wi,@MSG[2]); + &pshufb (@MSG[3],$TMP); + &sha256rnds2 ($CDGH,$ABEF); # 8-11 + &pshufd ($Wi,$Wi,0x0e); + &movdqa ($TMP,@MSG[3]); + &palignr ($TMP,@MSG[2],4); + &nop (); + &paddd (@MSG[0],$TMP); + &sha256msg1 (@MSG[1],@MSG[2]); + &sha256rnds2 ($ABEF,$CDGH); + + &movdqa ($Wi,&QWP(3*16-0x80,$K256)); + &paddd ($Wi,@MSG[3]); + &sha256msg2 (@MSG[0],@MSG[3]); + &sha256rnds2 ($CDGH,$ABEF); # 12-15 + &pshufd ($Wi,$Wi,0x0e); + &movdqa ($TMP,@MSG[0]); + &palignr ($TMP,@MSG[3],4); + &nop (); + &paddd (@MSG[1],$TMP); + &sha256msg1 (@MSG[2],@MSG[3]); + &sha256rnds2 ($ABEF,$CDGH); + +for($i=4;$i<16-3;$i++) { + &movdqa ($Wi,&QWP($i*16-0x80,$K256)); + &paddd ($Wi,@MSG[0]); + &sha256msg2 (@MSG[1],@MSG[0]); + &sha256rnds2 ($CDGH,$ABEF); # 16-19... 
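+	# This loop body runs for $i=4..12, emitting four rounds per
+	# iteration (rounds 16..51); push(@MSG,shift(@MSG)) below renames
+	# the message quadwords.  Rounds 52..63 are peeled out after the
+	# loop because they need progressively fewer sha256msg1/msg2 steps.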
+ &pshufd ($Wi,$Wi,0x0e); + &movdqa ($TMP,@MSG[1]); + &palignr ($TMP,@MSG[0],4); + &nop (); + &paddd (@MSG[2],$TMP); + &sha256msg1 (@MSG[3],@MSG[0]); + &sha256rnds2 ($ABEF,$CDGH); + + push(@MSG,shift(@MSG)); +} + &movdqa ($Wi,&QWP(13*16-0x80,$K256)); + &paddd ($Wi,@MSG[0]); + &sha256msg2 (@MSG[1],@MSG[0]); + &sha256rnds2 ($CDGH,$ABEF); # 52-55 + &pshufd ($Wi,$Wi,0x0e); + &movdqa ($TMP,@MSG[1]) + &palignr ($TMP,@MSG[0],4); + &sha256rnds2 ($ABEF,$CDGH); + &paddd (@MSG[2],$TMP); + + &movdqa ($Wi,&QWP(14*16-0x80,$K256)); + &paddd ($Wi,@MSG[1]); + &sha256rnds2 ($CDGH,$ABEF); # 56-59 + &pshufd ($Wi,$Wi,0x0e); + &sha256msg2 (@MSG[2],@MSG[1]); + &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask + &sha256rnds2 ($ABEF,$CDGH); + + &movdqa ($Wi,&QWP(15*16-0x80,$K256)); + &paddd ($Wi,@MSG[2]); + &nop (); + &sha256rnds2 ($CDGH,$ABEF); # 60-63 + &pshufd ($Wi,$Wi,0x0e); + &cmp ($end,$inp); + &nop (); + &sha256rnds2 ($ABEF,$CDGH); + + &paddd ($CDGH,&QWP(16,"esp")); + &paddd ($ABEF,&QWP(0,"esp")); + &jnz (&label("loop_shaext")); + + &pshufd ($CDGH,$CDGH,0xb1); # DCHG + &pshufd ($TMP,$ABEF,0x1b); # FEBA + &pshufd ($ABEF,$ABEF,0xb1); # BAFE + &punpckhqdq ($ABEF,$CDGH); # DCBA + &palignr ($CDGH,$TMP,8); # HGFE + + &mov ("esp",&DWP(32+12,"esp")); + &movdqu (&QWP(0,$ctx),$ABEF); + &movdqu (&QWP(16,$ctx),$CDGH); +&function_end_A(); +} + +my @X = map("xmm$_",(0..3)); +my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7)); +my @AH = ($A,$T); + +&set_label("SSSE3",32); + &lea ("esp",&DWP(-96,"esp")); + # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack + &mov ($AH[0],&DWP(0,"esi")); + &mov ($AH[1],&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &mov ("edi",&DWP(12,"esi")); + #&mov (&DWP(0,"esp"),$AH[0]); + &mov (&DWP(4,"esp"),$AH[1]); + &xor ($AH[1],"ecx"); # magic + &mov (&DWP(8,"esp"),"ecx"); + &mov (&DWP(12,"esp"),"edi"); + &mov ($E,&DWP(16,"esi")); + &mov ("edi",&DWP(20,"esi")); + &mov ("ecx",&DWP(24,"esi")); + &mov ("esi",&DWP(28,"esi")); + #&mov (&DWP(16,"esp"),$E); + &mov (&DWP(20,"esp"),"edi"); + &mov ("edi",&DWP(96+4,"esp")); # inp + &mov (&DWP(24,"esp"),"ecx"); + &mov (&DWP(28,"esp"),"esi"); + &movdqa ($t3,&QWP(256,$K256)); + &jmp (&label("grand_ssse3")); + +&set_label("grand_ssse3",16); + # load input, reverse byte order, add K256[0..15], save to stack + &movdqu (@X[0],&QWP(0,"edi")); + &movdqu (@X[1],&QWP(16,"edi")); + &movdqu (@X[2],&QWP(32,"edi")); + &movdqu (@X[3],&QWP(48,"edi")); + &add ("edi",64); + &pshufb (@X[0],$t3); + &mov (&DWP(96+4,"esp"),"edi"); + &pshufb (@X[1],$t3); + &movdqa ($t0,&QWP(0,$K256)); + &pshufb (@X[2],$t3); + &movdqa ($t1,&QWP(16,$K256)); + &paddd ($t0,@X[0]); + &pshufb (@X[3],$t3); + &movdqa ($t2,&QWP(32,$K256)); + &paddd ($t1,@X[1]); + &movdqa ($t3,&QWP(48,$K256)); + &movdqa (&QWP(32+0,"esp"),$t0); + &paddd ($t2,@X[2]); + &movdqa (&QWP(32+16,"esp"),$t1); + &paddd ($t3,@X[3]); + &movdqa (&QWP(32+32,"esp"),$t2); + &movdqa (&QWP(32+48,"esp"),$t3); + &jmp (&label("ssse3_00_47")); + +&set_label("ssse3_00_47",16); + &add ($K256,64); + +sub SSSE3_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 120 instructions + + eval(shift(@insns)); + &movdqa ($t0,@X[1]); + eval(shift(@insns)); # @ + eval(shift(@insns)); + &movdqa ($t3,@X[3]); + eval(shift(@insns)); + eval(shift(@insns)); + &palignr ($t0,@X[0],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); # @ + eval(shift(@insns)); + &palignr ($t3,@X[2],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t1,$t0); + eval(shift(@insns)); # @ + 
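+	# The shift/xor sequence interleaved here evaluates the SHA-256
+	# small sigmas without a packed dword rotate:
+	#   sigma0(x) = (x>>>7) ^ (x>>>18) ^ (x>>3)
+	#   sigma1(x) = (x>>>17) ^ (x>>>19) ^ (x>>10)
+	# each rotate being composed from a psrld/pslld pair and pxor.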
eval(shift(@insns)); + &movdqa ($t2,$t0); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t0,3); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &paddd (@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t2,7); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + eval(shift(@insns)); + &pshufd ($t3,@X[3],0b11111010); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &pslld ($t1,32-18); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &pxor ($t0,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t2,18-7); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &pxor ($t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + &pslld ($t1,18-7); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &pxor ($t0,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,10); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &psrlq ($t2,17); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + &psrlq ($t2,19-17); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,$t3,0b10000000); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + eval(shift(@insns)); + &psrldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); # @ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,$t3); + eval(shift(@insns)); # @ + &psrld ($t3,10); + eval(shift(@insns)); + &psrlq ($t2,17); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + &psrlq ($t2,19-17); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,$t3,0b00001000); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &movdqa ($t2,&QWP(16*$j,$K256)); + eval(shift(@insns)); + eval(shift(@insns)); + &pslldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # @ + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd ($t2,@X[0]); + eval(shift(@insns)); # @ + + foreach (@insns) { eval; } # remaining instructions + + &movdqa (&QWP(32+16*$j,"esp"),$t2); +} + +sub body_00_15 () { + ( + '&mov ("ecx",$E);', + '&ror ($E,25-11);', + '&mov ("esi",&off($f));', + '&xor ($E,"ecx");', + '&mov ("edi",&off($g));', + '&xor 
("esi","edi");', + '&ror ($E,11-6);', + '&and ("esi","ecx");', + '&mov (&off($e),"ecx");', # save $E, modulo-scheduled + '&xor ($E,"ecx");', + '&xor ("edi","esi");', # Ch(e,f,g) + '&ror ($E,6);', # T = Sigma1(e) + '&mov ("ecx",$AH[0]);', + '&add ($E,"edi");', # T += Ch(e,f,g) + '&mov ("edi",&off($b));', + '&mov ("esi",$AH[0]);', + + '&ror ("ecx",22-13);', + '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled + '&xor ("ecx",$AH[0]);', + '&xor ($AH[0],"edi");', # a ^= b, (b^c) in next round + '&add ($E,&off($h));', # T += h + '&ror ("ecx",13-2);', + '&and ($AH[1],$AH[0]);', # (b^c) &= (a^b) + '&xor ("ecx","esi");', + '&add ($E,&DWP(32+4*($i&15),"esp"));', # T += K[i]+X[i] + '&xor ($AH[1],"edi");', # h = Maj(a,b,c) = Ch(a^b,c,b) + '&ror ("ecx",2);', # Sigma0(a) + + '&add ($AH[1],$E);', # h += T + '&add ($E,&off($d));', # d += T + '&add ($AH[1],"ecx");'. # h += Sigma0(a) + + '@AH = reverse(@AH); $i++;' # rotate(a,h) + ); +} + + for ($i=0,$j=0; $j<4; $j++) { + &SSSE3_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmp (&DWP(16*$j,$K256),0x00010203); + &jne (&label("ssse3_00_47")); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } + + &mov ("esi",&DWP(96,"esp")); #ctx + #&mov ($AH[0],&DWP(0,"esp")); + &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); + #&mov ("edi", &DWP(8,"esp")); + &mov ("ecx",&DWP(12,"esp")); + &add ($AH[0],&DWP(0,"esi")); + &add ($AH[1],&DWP(4,"esi")); + &add ("edi",&DWP(8,"esi")); + &add ("ecx",&DWP(12,"esi")); + &mov (&DWP(0,"esi"),$AH[0]); + &mov (&DWP(4,"esi"),$AH[1]); + &mov (&DWP(8,"esi"),"edi"); + &mov (&DWP(12,"esi"),"ecx"); + #&mov (&DWP(0,"esp"),$AH[0]); + &mov (&DWP(4,"esp"),$AH[1]); + &xor ($AH[1],"edi"); # magic + &mov (&DWP(8,"esp"),"edi"); + &mov (&DWP(12,"esp"),"ecx"); + #&mov ($E,&DWP(16,"esp")); + &mov ("edi",&DWP(20,"esp")); + &mov ("ecx",&DWP(24,"esp")); + &add ($E,&DWP(16,"esi")); + &add ("edi",&DWP(20,"esi")); + &add ("ecx",&DWP(24,"esi")); + &mov (&DWP(16,"esi"),$E); + &mov (&DWP(20,"esi"),"edi"); + &mov (&DWP(20,"esp"),"edi"); + &mov ("edi",&DWP(28,"esp")); + &mov (&DWP(24,"esi"),"ecx"); + #&mov (&DWP(16,"esp"),$E); + &add ("edi",&DWP(28,"esi")); + &mov (&DWP(24,"esp"),"ecx"); + &mov (&DWP(28,"esi"),"edi"); + &mov (&DWP(28,"esp"),"edi"); + &mov ("edi",&DWP(96+4,"esp")); # inp + + &movdqa ($t3,&QWP(64,$K256)); + &sub ($K256,3*64); # rewind K + &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 
+ &jb (&label("grand_ssse3")); + + &mov ("esp",&DWP(96+12,"esp")); # restore sp +&function_end_A(); + if ($avx) { +&set_label("AVX",32); + if ($avx>1) { + &and ("edx",1<<8|1<<3); # check for BMI2+BMI1 + &cmp ("edx",1<<8|1<<3); + &je (&label("AVX_BMI")); + } + &lea ("esp",&DWP(-96,"esp")); + &vzeroall (); + # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack + &mov ($AH[0],&DWP(0,"esi")); + &mov ($AH[1],&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &mov ("edi",&DWP(12,"esi")); + #&mov (&DWP(0,"esp"),$AH[0]); + &mov (&DWP(4,"esp"),$AH[1]); + &xor ($AH[1],"ecx"); # magic + &mov (&DWP(8,"esp"),"ecx"); + &mov (&DWP(12,"esp"),"edi"); + &mov ($E,&DWP(16,"esi")); + &mov ("edi",&DWP(20,"esi")); + &mov ("ecx",&DWP(24,"esi")); + &mov ("esi",&DWP(28,"esi")); + #&mov (&DWP(16,"esp"),$E); + &mov (&DWP(20,"esp"),"edi"); + &mov ("edi",&DWP(96+4,"esp")); # inp + &mov (&DWP(24,"esp"),"ecx"); + &mov (&DWP(28,"esp"),"esi"); + &vmovdqa ($t3,&QWP(256,$K256)); + &jmp (&label("grand_avx")); + +&set_label("grand_avx",32); + # load input, reverse byte order, add K256[0..15], save to stack + &vmovdqu (@X[0],&QWP(0,"edi")); + &vmovdqu (@X[1],&QWP(16,"edi")); + &vmovdqu (@X[2],&QWP(32,"edi")); + &vmovdqu (@X[3],&QWP(48,"edi")); + &add ("edi",64); + &vpshufb (@X[0],@X[0],$t3); + &mov (&DWP(96+4,"esp"),"edi"); + &vpshufb (@X[1],@X[1],$t3); + &vpshufb (@X[2],@X[2],$t3); + &vpaddd ($t0,@X[0],&QWP(0,$K256)); + &vpshufb (@X[3],@X[3],$t3); + &vpaddd ($t1,@X[1],&QWP(16,$K256)); + &vpaddd ($t2,@X[2],&QWP(32,$K256)); + &vpaddd ($t3,@X[3],&QWP(48,$K256)); + &vmovdqa (&QWP(32+0,"esp"),$t0); + &vmovdqa (&QWP(32+16,"esp"),$t1); + &vmovdqa (&QWP(32+32,"esp"),$t2); + &vmovdqa (&QWP(32+48,"esp"),$t3); + &jmp (&label("avx_00_47")); + +&set_label("avx_00_47",16); + &add ($K256,64); + +sub Xupdate_AVX () { + ( + '&vpalignr ($t0,@X[1],@X[0],4);', # X[1..4] + '&vpalignr ($t3,@X[3],@X[2],4);', # X[9..12] + '&vpsrld ($t2,$t0,7);', + '&vpaddd (@X[0],@X[0],$t3);', # X[0..3] += X[9..16] + '&vpsrld ($t3,$t0,3);', + '&vpslld ($t1,$t0,14);', + '&vpxor ($t0,$t3,$t2);', + '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&vpsrld ($t2,$t2,18-7);', + '&vpxor ($t0,$t0,$t1);', + '&vpslld ($t1,$t1,25-14);', + '&vpxor ($t0,$t0,$t2);', + '&vpsrld ($t2,$t3,10);', + '&vpxor ($t0,$t0,$t1);', # sigma0(X[1..4]) + '&vpsrlq ($t1,$t3,17);', + '&vpaddd (@X[0],@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) + '&vpxor ($t2,$t2,$t1);', + '&vpsrlq ($t3,$t3,19);', + '&vpxor ($t2,$t2,$t3);', # sigma1(X[14..15] + '&vpshufd ($t3,$t2,0b10000100);', + '&vpsrldq ($t3,$t3,8);', + '&vpaddd (@X[0],@X[0],$t3);', # X[0..1] += sigma1(X[14..15]) + '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&vpsrld ($t2,$t3,10);', + '&vpsrlq ($t1,$t3,17);', + '&vpxor ($t2,$t2,$t1);', + '&vpsrlq ($t3,$t3,19);', + '&vpxor ($t2,$t2,$t3);', # sigma1(X[16..17] + '&vpshufd ($t3,$t2,0b11101000);', + '&vpslldq ($t3,$t3,8);', + '&vpaddd (@X[0],@X[0],$t3);' # X[2..3] += sigma1(X[16..17]) + ); +} + +local *ror = sub { &shrd(@_[0],@_) }; +sub AVX_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 120 instructions +my $insn; + + foreach (Xupdate_AVX()) { # 31 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval($insn = shift(@insns)); + eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/); + } + &vpaddd ($t2,@X[0],&QWP(16*$j,$K256)); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (&QWP(32+16*$j,"esp"),$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &AVX_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + 
} + &cmp (&DWP(16*$j,$K256),0x00010203); + &jne (&label("avx_00_47")); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } + + &mov ("esi",&DWP(96,"esp")); #ctx + #&mov ($AH[0],&DWP(0,"esp")); + &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); + #&mov ("edi", &DWP(8,"esp")); + &mov ("ecx",&DWP(12,"esp")); + &add ($AH[0],&DWP(0,"esi")); + &add ($AH[1],&DWP(4,"esi")); + &add ("edi",&DWP(8,"esi")); + &add ("ecx",&DWP(12,"esi")); + &mov (&DWP(0,"esi"),$AH[0]); + &mov (&DWP(4,"esi"),$AH[1]); + &mov (&DWP(8,"esi"),"edi"); + &mov (&DWP(12,"esi"),"ecx"); + #&mov (&DWP(0,"esp"),$AH[0]); + &mov (&DWP(4,"esp"),$AH[1]); + &xor ($AH[1],"edi"); # magic + &mov (&DWP(8,"esp"),"edi"); + &mov (&DWP(12,"esp"),"ecx"); + #&mov ($E,&DWP(16,"esp")); + &mov ("edi",&DWP(20,"esp")); + &mov ("ecx",&DWP(24,"esp")); + &add ($E,&DWP(16,"esi")); + &add ("edi",&DWP(20,"esi")); + &add ("ecx",&DWP(24,"esi")); + &mov (&DWP(16,"esi"),$E); + &mov (&DWP(20,"esi"),"edi"); + &mov (&DWP(20,"esp"),"edi"); + &mov ("edi",&DWP(28,"esp")); + &mov (&DWP(24,"esi"),"ecx"); + #&mov (&DWP(16,"esp"),$E); + &add ("edi",&DWP(28,"esi")); + &mov (&DWP(24,"esp"),"ecx"); + &mov (&DWP(28,"esi"),"edi"); + &mov (&DWP(28,"esp"),"edi"); + &mov ("edi",&DWP(96+4,"esp")); # inp + + &vmovdqa ($t3,&QWP(64,$K256)); + &sub ($K256,3*64); # rewind K + &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? + &jb (&label("grand_avx")); + + &mov ("esp",&DWP(96+12,"esp")); # restore sp + &vzeroall (); +&function_end_A(); + if ($avx>1) { +sub bodyx_00_15 () { # +10% + ( + '&rorx ("ecx",$E,6)', + '&rorx ("esi",$E,11)', + '&mov (&off($e),$E)', # save $E, modulo-scheduled + '&rorx ("edi",$E,25)', + '&xor ("ecx","esi")', + '&andn ("esi",$E,&off($g))', + '&xor ("ecx","edi")', # Sigma1(e) + '&and ($E,&off($f))', + '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled + '&or ($E,"esi")', # T = Ch(e,f,g) + + '&rorx ("edi",$AH[0],2)', + '&rorx ("esi",$AH[0],13)', + '&lea ($E,&DWP(0,$E,"ecx"))', # T += Sigma1(e) + '&rorx ("ecx",$AH[0],22)', + '&xor ("esi","edi")', + '&mov ("edi",&off($b))', + '&xor ("ecx","esi")', # Sigma0(a) + + '&xor ($AH[0],"edi")', # a ^= b, (b^c) in next round + '&add ($E,&off($h))', # T += h + '&and ($AH[1],$AH[0])', # (b^c) &= (a^b) + '&add ($E,&DWP(32+4*($i&15),"esp"))', # T += K[i]+X[i] + '&xor ($AH[1],"edi")', # h = Maj(a,b,c) = Ch(a^b,c,b) + + '&add ("ecx",$E)', # h += T + '&add ($E,&off($d))', # d += T + '&lea ($AH[1],&DWP(0,$AH[1],"ecx"));'. 
# h += Sigma0(a) + + '@AH = reverse(@AH); $i++;' # rotate(a,h) + ); +} + +&set_label("AVX_BMI",32); + &lea ("esp",&DWP(-96,"esp")); + &vzeroall (); + # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack + &mov ($AH[0],&DWP(0,"esi")); + &mov ($AH[1],&DWP(4,"esi")); + &mov ("ecx",&DWP(8,"esi")); + &mov ("edi",&DWP(12,"esi")); + #&mov (&DWP(0,"esp"),$AH[0]); + &mov (&DWP(4,"esp"),$AH[1]); + &xor ($AH[1],"ecx"); # magic + &mov (&DWP(8,"esp"),"ecx"); + &mov (&DWP(12,"esp"),"edi"); + &mov ($E,&DWP(16,"esi")); + &mov ("edi",&DWP(20,"esi")); + &mov ("ecx",&DWP(24,"esi")); + &mov ("esi",&DWP(28,"esi")); + #&mov (&DWP(16,"esp"),$E); + &mov (&DWP(20,"esp"),"edi"); + &mov ("edi",&DWP(96+4,"esp")); # inp + &mov (&DWP(24,"esp"),"ecx"); + &mov (&DWP(28,"esp"),"esi"); + &vmovdqa ($t3,&QWP(256,$K256)); + &jmp (&label("grand_avx_bmi")); + +&set_label("grand_avx_bmi",32); + # load input, reverse byte order, add K256[0..15], save to stack + &vmovdqu (@X[0],&QWP(0,"edi")); + &vmovdqu (@X[1],&QWP(16,"edi")); + &vmovdqu (@X[2],&QWP(32,"edi")); + &vmovdqu (@X[3],&QWP(48,"edi")); + &add ("edi",64); + &vpshufb (@X[0],@X[0],$t3); + &mov (&DWP(96+4,"esp"),"edi"); + &vpshufb (@X[1],@X[1],$t3); + &vpshufb (@X[2],@X[2],$t3); + &vpaddd ($t0,@X[0],&QWP(0,$K256)); + &vpshufb (@X[3],@X[3],$t3); + &vpaddd ($t1,@X[1],&QWP(16,$K256)); + &vpaddd ($t2,@X[2],&QWP(32,$K256)); + &vpaddd ($t3,@X[3],&QWP(48,$K256)); + &vmovdqa (&QWP(32+0,"esp"),$t0); + &vmovdqa (&QWP(32+16,"esp"),$t1); + &vmovdqa (&QWP(32+32,"esp"),$t2); + &vmovdqa (&QWP(32+48,"esp"),$t3); + &jmp (&label("avx_bmi_00_47")); + +&set_label("avx_bmi_00_47",16); + &add ($K256,64); + + for ($i=0,$j=0; $j<4; $j++) { + &AVX_00_47($j,\&bodyx_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmp (&DWP(16*$j,$K256),0x00010203); + &jne (&label("avx_bmi_00_47")); + + for ($i=0; $i<16; ) { + foreach(bodyx_00_15()) { eval; } + } + + &mov ("esi",&DWP(96,"esp")); #ctx + #&mov ($AH[0],&DWP(0,"esp")); + &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); + #&mov ("edi", &DWP(8,"esp")); + &mov ("ecx",&DWP(12,"esp")); + &add ($AH[0],&DWP(0,"esi")); + &add ($AH[1],&DWP(4,"esi")); + &add ("edi",&DWP(8,"esi")); + &add ("ecx",&DWP(12,"esi")); + &mov (&DWP(0,"esi"),$AH[0]); + &mov (&DWP(4,"esi"),$AH[1]); + &mov (&DWP(8,"esi"),"edi"); + &mov (&DWP(12,"esi"),"ecx"); + #&mov (&DWP(0,"esp"),$AH[0]); + &mov (&DWP(4,"esp"),$AH[1]); + &xor ($AH[1],"edi"); # magic + &mov (&DWP(8,"esp"),"edi"); + &mov (&DWP(12,"esp"),"ecx"); + #&mov ($E,&DWP(16,"esp")); + &mov ("edi",&DWP(20,"esp")); + &mov ("ecx",&DWP(24,"esp")); + &add ($E,&DWP(16,"esi")); + &add ("edi",&DWP(20,"esi")); + &add ("ecx",&DWP(24,"esi")); + &mov (&DWP(16,"esi"),$E); + &mov (&DWP(20,"esi"),"edi"); + &mov (&DWP(20,"esp"),"edi"); + &mov ("edi",&DWP(28,"esp")); + &mov (&DWP(24,"esi"),"ecx"); + #&mov (&DWP(16,"esp"),$E); + &add ("edi",&DWP(28,"esi")); + &mov (&DWP(24,"esp"),"ecx"); + &mov (&DWP(28,"esi"),"edi"); + &mov (&DWP(28,"esp"),"edi"); + &mov ("edi",&DWP(96+4,"esp")); # inp + + &vmovdqa ($t3,&QWP(64,$K256)); + &sub ($K256,3*64); # rewind K + &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? + &jb (&label("grand_avx_bmi")); + + &mov ("esp",&DWP(96+12,"esp")); # restore sp + &vzeroall (); +&function_end_A(); + } + } + }}} +&function_end_B("sha256_block_data_order"); + +&asm_finish(); + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha256-armv4.pl b/openssl-1.1.0h/crypto/sha/asm/sha256-armv4.pl new file mode 100644 index 0000000..55d30cb --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha256-armv4.pl @@ -0,0 +1,732 @@ +#! 
/usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# +# Permission to use under GPL terms is granted. +# ==================================================================== + +# SHA256 block procedure for ARMv4. May 2007. + +# Performance is ~2x better than gcc 3.4 generated code and in "abso- +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +# byte [on single-issue Xscale PXA250 core]. + +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 22% improvement on +# Cortex A8 core and ~20 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +# September 2013. +# +# Add NEON implementation. On Cortex A8 it was measured to process one +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +# code (meaning that latter performs sub-optimally, nothing was done +# about it). + +# May 2014. +# +# Add ARMv8 code path performing at 2.0 cpb on Apple A7. + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$ctx="r0"; $t0="r0"; +$inp="r1"; $t4="r1"; +$len="r2"; $t1="r2"; +$T1="r3"; $t3="r3"; +$A="r4"; +$B="r5"; +$C="r6"; +$D="r7"; +$E="r8"; +$F="r9"; +$G="r10"; +$H="r11"; +@V=($A,$B,$C,$D,$E,$F,$G,$H); +$t2="r12"; +$Ktbl="r14"; + +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___ if ($i<16); +#if __ARM_ARCH__>=7 + @ ldr $t1,[$inp],#4 @ $i +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +# ifndef __ARMEB__ + rev $t1,$t1 +# endif +#else + @ ldrb $t1,[$inp,#3] @ $i + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + ldrb $t2,[$inp,#2] + ldrb $t0,[$inp,#1] + orr $t1,$t1,$t2,lsl#8 + ldrb $t2,[$inp],#4 + orr $t1,$t1,$t0,lsl#16 +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + orr $t1,$t1,$t2,lsl#24 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +#endif +___ +$code.=<<___; + ldr $t2,[$Ktbl],#4 @ *K256++ + add $h,$h,$t1 @ h+=X[i] + str $t1,[sp,#`$i%16`*4] + eor $t1,$f,$g + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) + and $t1,$t1,$e + add $h,$h,$t2 @ h+=K256[i] + eor $t1,$t1,$g @ Ch(e,f,g) + eor 
$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` + add $h,$h,$t1 @ h+=Ch(e,f,g) +#if $i==31 + and $t2,$t2,#0xff + cmp $t2,#0xf2 @ done? +#endif +#if $i<15 +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 @ prefetch +# else + ldrb $t1,[$inp,#3] +# endif + eor $t2,$a,$b @ a^b, b^c in next round +#else + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx + eor $t2,$a,$b @ a^b, b^c in next round + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx +#endif + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) + and $t3,$t3,$t2 @ (b^c)&=(a^b) + add $d,$d,$h @ d+=h + eor $t3,$t3,$b @ Maj(a,b,c) + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) + @ add $h,$h,$t3 @ h+=Maj(a,b,c) +___ + ($t2,$t3)=($t3,$t2); +} + +sub BODY_16_XX { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i + @ ldr $t4,[sp,#`($i+14)%16`*4] + mov $t0,$t1,ror#$sigma0[0] + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + mov $t2,$t4,ror#$sigma1[0] + eor $t0,$t0,$t1,ror#$sigma0[1] + eor $t2,$t2,$t4,ror#$sigma1[1] + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) + ldr $t1,[sp,#`($i+0)%16`*4] + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) + ldr $t4,[sp,#`($i+9)%16`*4] + + add $t2,$t2,$t0 + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 + add $t1,$t1,$t2 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) + add $t1,$t1,$t4 @ X[i] +___ + &BODY_00_15(@_); +} + +$code=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lsha256_block_data_order +#endif +.align 5 + +.global sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: +.Lsha256_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r3,pc,#8 @ sha256_block_data_order +#else + adr r3,.Lsha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} + ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} + sub $Ktbl,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 +# else + ldrb $t1,[$inp,#3] +# endif + eor $t3,$B,$C @ magic + eor $t2,$t2,$t2 +___ +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } 
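The interleaved loads and rotates in BODY_00_15/BODY_16_XX above obscure the arithmetic somewhat; what each iteration computes is the plain FIPS 180-4 SHA-256 round with Sigma0 rotates (2,13,22) and Sigma1 rotates (6,11,25), and with Maj(a,b,c) folded into ((b^c)&(a^b))^b so that this round's a^b can be carried in one register as next round's b^c, exactly as the "@ magic" comments indicate. A minimal scalar reference sketch in Perl (the sub names sha256_round and ror32 are illustrative, not part of the patch):

sub ror32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff; }

sub sha256_round {                # one round on plain 32-bit scalars
    my ($a,$b,$c,$d,$e,$f,$g,$h,$k,$w) = @_;     # $k = K256[i], $w = X[i]
    my $Sigma1 = ror32($e,6) ^ ror32($e,11) ^ ror32($e,25);
    my $Ch     = ($e & $f) ^ (~$e & $g);                  # Ch(e,f,g)
    my $Sigma0 = ror32($a,2) ^ ror32($a,13) ^ ror32($a,22);
    my $Maj    = ($a & $b) ^ ($a & $c) ^ ($b & $c);       # == ((b^c)&(a^b))^b
    my $T1 = ($h + $Sigma1 + $Ch + $k + $w) & 0xffffffff;
    my $T2 = ($Sigma0 + $Maj) & 0xffffffff;
    # rotate the state: (a..h) <- (T1+T2, a, b, c, d+T1, e, f, g)
    return (($T1 + $T2) & 0xffffffff, $a, $b, $c,
            ($d + $T1) & 0xffffffff, $e, $f, $g);
}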
+$code.=".Lrounds_16_xx:\n"; +for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t0,[$t3,#0] + ldr $t1,[$t3,#4] + ldr $t2,[$t3,#8] + add $A,$A,$t0 + ldr $t0,[$t3,#12] + add $B,$B,$t1 + ldr $t1,[$t3,#16] + add $C,$C,$t2 + ldr $t2,[$t3,#20] + add $D,$D,$t0 + ldr $t0,[$t3,#24] + add $E,$E,$t1 + ldr $t1,[$t3,#28] + add $F,$F,$t2 + ldr $inp,[sp,#17*4] @ pull inp + ldr $t2,[sp,#18*4] @ pull inp+len + add $G,$G,$t0 + add $H,$H,$t1 + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} + cmp $inp,$t2 + sub $Ktbl,$Ktbl,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#`16+3`*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order,.-sha256_block_data_order +___ +###################################################################### +# NEON stuff +# +{{{ +my @X=map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); +my $Xfer=$t4; +my $j=0; + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 
($T4,&Dlo(@X[0]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + while($#insns>=2) { eval(shift(@insns)); } + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vrev32_8 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&eor ($t1,$f,$g)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&and ($t1,$t1,$e)', + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&eor ($t1,$t1,$g)', # Ch(e,f,g) + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&ldr ($t1,"[sp,#64]") if ($j==31)', + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&add ($d,$d,$h)', # d+=h + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 5 +.skip 16 +sha256_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + + sub $H,sp,#16*4+16 + adr $Ktbl,K256 + bic $H,$H,#15 @ align for 128-bit stores + mov $t2,sp + mov sp,$H @ alloca + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + + vld1.8 {@X[0]},[$inp]! + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + vld1.32 {$T0},[$Ktbl,:128]! + vld1.32 {$T1},[$Ktbl,:128]! + vld1.32 {$T2},[$Ktbl,:128]! + vld1.32 {$T3},[$Ktbl,:128]! + vrev32.8 @X[0],@X[0] @ yes, even on + str $ctx,[sp,#64] + vrev32.8 @X[1],@X[1] @ big-endian + str $inp,[sp,#68] + mov $Xfer,sp + vrev32.8 @X[2],@X[2] + str $len,[sp,#72] + vrev32.8 @X[3],@X[3] + str $t2,[sp,#76] @ save original sp + vadd.i32 $T0,$T0,@X[0] + vadd.i32 $T1,$T1,@X[1] + vst1.32 {$T0},[$Xfer,:128]! + vadd.i32 $T2,$T2,@X[2] + vst1.32 {$T1},[$Xfer,:128]! 
+ vadd.i32 $T3,$T3,@X[3] + vst1.32 {$T2},[$Xfer,:128]! + vst1.32 {$T3},[$Xfer,:128]! + + ldmia $ctx,{$A-$H} + sub $Xfer,$Xfer,#64 + ldr $t1,[sp,#0] + eor $t2,$t2,$t2 + eor $t3,$B,$C + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + teq $t1,#0 @ check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + ldr $inp,[sp,#68] + ldr $t0,[sp,#72] + sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl + teq $inp,$t0 + it eq + subeq $inp,$inp,#64 @ avoid SEGV + vld1.8 {@X[0]},[$inp]! @ load next input block + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + it ne + strne $inp,[sp,#68] + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + ldr $t0,[$t1,#0] + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t2,[$t1,#4] + ldr $t3,[$t1,#8] + ldr $t4,[$t1,#12] + add $A,$A,$t0 @ accumulate + ldr $t0,[$t1,#16] + add $B,$B,$t2 + ldr $t2,[$t1,#20] + add $C,$C,$t3 + ldr $t3,[$t1,#24] + add $D,$D,$t4 + ldr $t4,[$t1,#28] + add $E,$E,$t0 + str $A,[$t1],#4 + add $F,$F,$t2 + str $B,[$t1],#4 + add $G,$G,$t3 + str $C,[$t1],#4 + add $H,$H,$t4 + str $D,[$t1],#4 + stmia $t1,{$E-$H} + + ittte ne + movne $Xfer,sp + ldrne $t1,[sp,#0] + eorne $t2,$t2,$t2 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne $t3,$B,$C + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +___ +}}} +###################################################################### +# ARMv8 stuff +# +{{{ +my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); +my @MSG=map("q$_",(8..11)); +my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); +my $Ktbl="r3"; + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +.type sha256_block_data_order_armv8,%function +.align 5 +sha256_block_data_order_armv8: +.LARMv8: + vld1.32 {$ABCD,$EFGH},[$ctx] + sub $Ktbl,$Ktbl,#256+32 + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + b .Loop_v8 + +.align 4 +.Loop_v8: + vld1.8 {@MSG[0]-@MSG[1]},[$inp]! + vld1.8 {@MSG[2]-@MSG[3]},[$inp]! + vld1.32 {$W0},[$Ktbl]! + vrev32.8 @MSG[0],@MSG[0] + vrev32.8 @MSG[1],@MSG[1] + vrev32.8 @MSG[2],@MSG[2] + vrev32.8 @MSG[3],@MSG[3] + vmov $ABCD_SAVE,$ABCD @ offload + vmov $EFGH_SAVE,$EFGH + teq $inp,$len +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vld1.32 {$W0},[$Ktbl]! 
+ vadd.i32 $W1,$W1,@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vld1.32 {$W1},[$Ktbl] + vadd.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#256-16 @ rewind + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vadd.i32 $W1,$W1,@MSG[3] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vadd.i32 $ABCD,$ABCD,$ABCD_SAVE + vadd.i32 $EFGH,$EFGH,$EFGH_SAVE + it ne + bne .Loop_v8 + + vst1.32 {$ABCD,$EFGH},[$ctx] + + ret @ bx lr +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +___ +}}} +$code.=<<___; +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +{ my %opcode = ( + "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, + "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } +} + +foreach (split($/,$code)) { + + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; + + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/openssl-1.1.0h/crypto/sha/asm/sha256-c64xplus.pl b/openssl-1.1.0h/crypto/sha/asm/sha256-c64xplus.pl new file mode 100644 index 0000000..3ab7d9b --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha256-c64xplus.pl @@ -0,0 +1,320 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256 for C64x+. +# +# January 2012 +# +# Performance is just below 10 cycles per processed byte, which is +# almost 40% faster than compiler-generated code. Unroll is unlikely +# to give more than ~8% improvement... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. 
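The AMR note above refers to the circular addressing this module sets up for its 16-word X[] scratch ($Xia/$Xib below, commented as the circular/ring buffer), which computes the SHA-256 message schedule in place. A minimal scalar sketch in Perl of that recurrence, using the standard sigma0/sigma1 rotate and shift amounts (the C64x+ code expresses the rotates as left rotates, e.g. ROTL 25 == ROTR 7); the sub names expand_in_place, ror32, sigma0 and sigma1 are illustrative, not part of the patch:

sub ror32  { my ($x,$n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff; }
sub sigma0 { my $x = shift; ror32($x,7)  ^ ror32($x,18) ^ ($x >> 3);  }
sub sigma1 { my $x = shift; ror32($x,17) ^ ror32($x,19) ^ ($x >> 10); }

sub expand_in_place {             # @_ = X[0..15], the 16 words of one 64-byte block
    my @X = @_;
    for (my $i = 16; $i < 64; $i++) {
        # X[i%16] += X[(i+9)%16] + sigma0(X[(i+1)%16]) + sigma1(X[(i+14)%16]),
        # i.e. W[i] = W[i-16] + W[i-7] + sigma0(W[i-15]) + sigma1(W[i-2]);
        # round i consumes X[i%16] as soon as it is produced
        $X[$i & 15] = ($X[$i & 15] + $X[($i + 9) & 15]
                       + sigma0($X[($i + 1) & 15])
                       + sigma1($X[($i + 14) & 15])) & 0xffffffff;
    }
    return @X;
}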
+ +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K256="A3"; + +($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14) + =map("A$_",(16..31)); +($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15) + =map("B$_",(16..31)); + +($Xia,$Xib)=("A5","B5"); # circular/ring buffer + $CTXB=$t2e; + +($Xn,$X0,$K)=("B7","B8","B9"); +($Maj,$Ch)=($T2,"B6"); + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .nocmp + .asg sha256_block_data_order,_sha256_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg SWAP2,MV + .asg SWAP4,MV + .endif + + .global _sha256_block_data_order +_sha256_block_data_order: +__sha256_block: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] ADDKPC __sha256_block,B2 +|| [A0] AND B0,SP,SP ; align stack at 64 bytes + .if __TI_EABI__ + [A0] MVK 0x00404,B1 +|| [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256 + [A0] MVKH 0x50000,B1 +|| [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256 + .else + [A0] MVK 0x00404,B1 +|| [A0] MVKL (K256-__sha256_block),$K256 + [A0] MVKH 0x50000,B1 +|| [A0] MVKH (K256-__sha256_block),$K256 + .endif + [A0] MVC B1,AMR ; setup circular addressing +|| [A0] MV SP,$Xia + [A0] MV SP,$Xib +|| [A0] ADD B2,$K256,$K256 +|| [A0] MV $CTXA,$CTXB +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + LDW *${CTXA}[0],$A ; load ctx +|| LDW *${CTXB}[4],$E + LDW *${CTXA}[1],$B +|| LDW *${CTXB}[5],$F + LDW *${CTXA}[2],$C +|| LDW *${CTXB}[6],$G + LDW *${CTXA}[3],$D +|| LDW *${CTXB}[7],$H + + LDNW *$INP++,$Xn ; pre-fetch input + LDW *$K256++,$K ; pre-fetch K256[0] + MVK 14,B0 ; loop counters + MVK 47,B1 +|| ADDAW $Xia,9,$Xia +outerloop?: + SUB A0,1,A0 +|| MV $A,$Actx +|| MV $E,$Ectx +|| MVD $B,$Bctx +|| MVD $F,$Fctx + MV $C,$Cctx +|| MV $G,$Gctx +|| MVD $D,$Dctx +|| MVD $H,$Hctx +|| SWAP4 $Xn,$X0 + + SPLOOPD 8 ; BODY_00_14 +|| MVC B0,ILC +|| SWAP2 $X0,$X0 + + LDNW *$INP++,$Xn +|| ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $K,$H,$T1 ; T1 = h + K256[i] + ADD $X0,$T1,$T1 ; T1 += X[i]; +|| STW $X0,*$Xib++ +|| XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| LDW *$K256++,$K ; pre-fetch K256[i+1] +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| ROTL $G,0,$H ; h = g +|| MV $F,$G ; g = f +|| MV $X0,$X14 +|| SWAP4 $Xn,$X0 + SWAP2 $X0,$X0 +|| MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c + MV $B,$C ; c = b +|| MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 + SPKERNEL + + ROTL $A,30,$S0 ; BODY_15 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e +|| LDW *${Xib}[1],$Xn ; modulo-scheduled + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) +|| LDW *${Xib}[2],$X1 ; modulo-scheduled + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $K,$H,$T1 ; 
T1 = h + K256[i] + ADD $X0,$T1,$T1 ; T1 += X[i]; +|| STW $X0,*$Xib++ +|| XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| LDW *$K256++,$K ; pre-fetch K256[i+1] +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| ROTL $G,0,$H ; h = g +|| MV $F,$G ; g = f +|| MV $X0,$X15 + MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW *$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; modulo-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled +|| ROTL $B,0,$C ; c = b +|| MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 + + SPLOOPD 10 ; BODY_16_63 +|| MVC B1,ILC +|| ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled + + XOR $t0e,$s0,$s0 +|| XOR $t0a,$s1,$s1 +|| MV $X15,$X14 +|| MV $X1,$Xn + XOR $t1e,$s0,$s0 ; sigma0(X[i+1]) +|| XOR $t1a,$s1,$s1 ; sigma1(X[i+14]) +|| LDW *${Xib}[2],$X1 ; module-scheduled + ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e +|| ADD $X9,$X0,$X0 ; X[i] += X[i+9] + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) +|| ADD $s0,$X0,$X0 ; X[i] += sigma1(X[i+1]) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $H,$K,$T1 ; T1 = h + K256[i] +|| ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14]) + XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 +|| ADD $X0,$T1,$T1 ; T1 += X[i] +|| STW $X0,*$Xib++ + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) +|| MV $X0,$X15 +|| ROTL $G,0,$H ; h = g +|| LDW *$K256++,$K ; pre-fetch K256[i+1] + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| MV $F,$G ; g = f +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW *++$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; module-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled +|| MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $B,$C ; c = b + MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 +|| SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled + SPKERNEL + + [A0] B outerloop? 
+|| [A0] LDNW *$INP++,$Xn ; pre-fetch input +|| [A0] ADDK -260,$K256 ; rewind K256 +|| ADD $Actx,$A,$A ; accumulate ctx +|| ADD $Ectx,$E,$E +|| ADD $Bctx,$B,$B + ADD $Fctx,$F,$F +|| ADD $Cctx,$C,$C +|| ADD $Gctx,$G,$G +|| ADD $Dctx,$D,$D +|| ADD $Hctx,$H,$H +|| [A0] LDW *$K256++,$K ; pre-fetch K256[0] + + [!A0] BNOP RA +||[!A0] MV $CTXA,$CTXB + [!A0] MV FP,SP ; restore stack pointer +||[!A0] LDW *FP[0],FP ; restore frame pointer + [!A0] STW $A,*${CTXA}[0] ; save ctx +||[!A0] STW $E,*${CTXB}[4] +||[!A0] MVK 0,B0 + [!A0] STW $B,*${CTXA}[1] +||[!A0] STW $F,*${CTXB}[5] +||[!A0] MVC B0,AMR ; clear AMR + STW $C,*${CTXA}[2] +|| STW $G,*${CTXB}[6] + STW $D,*${CTXA}[3] +|| STW $H,*${CTXB}[7] + .endasmfunc + + .if __TI_EABI__ + .sect ".text:sha_asm.const" + .else + .sect ".const:sha_asm" + .endif + .align 128 +K256: + .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by " + .align 4 + +___ + +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha256-mb-x86_64.pl b/openssl-1.1.0h/crypto/sha/asm/sha256-mb-x86_64.pl new file mode 100644 index 0000000..fbcd29f --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha256-mb-x86_64.pl @@ -0,0 +1,1568 @@ +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# Multi-buffer SHA256 procedure processes n buffers in parallel by +# placing buffer data to designated lane of SIMD register. n is +# naturally limited to 4 on pre-AVX2 processors and to 8 on +# AVX2-capable processors such as Haswell. 
+# +# this +aesni(i) sha256 aesni-sha256 gain(iv) +# ------------------------------------------------------------------- +# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126% +# Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95% +# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103% +# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82% +# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170% +# Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170% +# Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100% +# +# (i) multi-block CBC encrypt with 128-bit key; +# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, +# because of lower AES-NI instruction throughput, nor is there +# AES-NI-SHA256 stitch for these processors; +# (iii) "this" is for n=8, when we gather twice as much data, result +# for n=4 is 20.3+4.44=24.7; +# (iv) presented improvement coefficients are asymptotic limits and +# in real-life application are somewhat lower, e.g. for 2KB +# fragments they range from 75% to 130% (on Haswell); + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx=0; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +# void sha256_multi_block ( +# struct { unsigned int A[8]; +# unsigned int B[8]; +# unsigned int C[8]; +# unsigned int D[8]; +# unsigned int E[8]; +# unsigned int F[8]; +# unsigned int G[8]; +# unsigned int H[8]; } *ctx, +# struct { void *ptr; int blocks; } inp[8], +# int num); /* 1 or 2 */ +# +$ctx="%rdi"; # 1st arg +$inp="%rsi"; # 2nd arg +$num="%edx"; # 3rd arg +@ptr=map("%r$_",(8..11)); +$Tbl="%rbp"; + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15)); +($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7)); + +$REG_SZ=16; + +sub Xi_off { +my $off = shift; + + $off %= 16; $off *= $REG_SZ; + $off<256 ? 
"$off-128(%rax)" : "$off-256-128(%rbx)"; +} + +sub ROUND_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; + +$code.=<<___ if ($i<15); + movd `4*$i`(@ptr[0]),$Xi + movd `4*$i`(@ptr[1]),$t1 + movd `4*$i`(@ptr[2]),$t2 + movd `4*$i`(@ptr[3]),$t3 + punpckldq $t2,$Xi + punpckldq $t3,$t1 + punpckldq $t1,$Xi +___ +$code.=<<___ if ($i==15); + movd `4*$i`(@ptr[0]),$Xi + lea `16*4`(@ptr[0]),@ptr[0] + movd `4*$i`(@ptr[1]),$t1 + lea `16*4`(@ptr[1]),@ptr[1] + movd `4*$i`(@ptr[2]),$t2 + lea `16*4`(@ptr[2]),@ptr[2] + movd `4*$i`(@ptr[3]),$t3 + lea `16*4`(@ptr[3]),@ptr[3] + punpckldq $t2,$Xi + punpckldq $t3,$t1 + punpckldq $t1,$Xi +___ +$code.=<<___; + movdqa $e,$sigma + `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)` + movdqa $e,$t3 + `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)` + psrld \$6,$sigma + movdqa $e,$t2 + pslld \$7,$t3 + movdqa $Xi,`&Xi_off($i)` + paddd $h,$Xi # Xi+=h + + psrld \$11,$t2 + pxor $t3,$sigma + pslld \$21-7,$t3 + paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round] + pxor $t2,$sigma + + psrld \$25-11,$t2 + movdqa $e,$t1 + `"prefetcht0 63(@ptr[0])" if ($i==15)` + pxor $t3,$sigma + movdqa $e,$axb # borrow $axb + pslld \$26-21,$t3 + pandn $g,$t1 + pand $f,$axb + pxor $t2,$sigma + + `"prefetcht0 63(@ptr[1])" if ($i==15)` + movdqa $a,$t2 + pxor $t3,$sigma # Sigma1(e) + movdqa $a,$t3 + psrld \$2,$t2 + paddd $sigma,$Xi # Xi+=Sigma1(e) + pxor $axb,$t1 # Ch(e,f,g) + movdqa $b,$axb + movdqa $a,$sigma + pslld \$10,$t3 + pxor $a,$axb # a^b, b^c in next round + + `"prefetcht0 63(@ptr[2])" if ($i==15)` + psrld \$13,$sigma + pxor $t3,$t2 + paddd $t1,$Xi # Xi+=Ch(e,f,g) + pslld \$19-10,$t3 + pand $axb,$bxc + pxor $sigma,$t2 + + `"prefetcht0 63(@ptr[3])" if ($i==15)` + psrld \$22-13,$sigma + pxor $t3,$t2 + movdqa $b,$h + pslld \$30-19,$t3 + pxor $t2,$sigma + pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + paddd $Xi,$d # d+=Xi + pxor $t3,$sigma # Sigma0(a) + + paddd $Xi,$h # h+=Xi + paddd $sigma,$h # h+=Sigma0(a) +___ +$code.=<<___ if (($i%8)==7); + lea `32*8`($Tbl),$Tbl +___ + ($axb,$bxc)=($bxc,$axb); +} + +sub ROUND_16_XX { +my $i=shift; + +$code.=<<___; + movdqa `&Xi_off($i+1)`,$Xn + paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9] + + movdqa $Xn,$sigma + movdqa $Xn,$t2 + psrld \$3,$sigma + movdqa $Xn,$t3 + + psrld \$7,$t2 + movdqa `&Xi_off($i+14)`,$t1 + pslld \$14,$t3 + pxor $t2,$sigma + psrld \$18-7,$t2 + movdqa $t1,$axb # borrow $axb + pxor $t3,$sigma + pslld \$25-14,$t3 + pxor $t2,$sigma + psrld \$10,$t1 + movdqa $axb,$t2 + + psrld \$17,$axb + pxor $t3,$sigma # sigma0(X[i+1]) + pslld \$13,$t2 + paddd $sigma,$Xi # Xi+=sigma0(e) + pxor $axb,$t1 + psrld \$19-17,$axb + pxor $t2,$t1 + pslld \$15-13,$t2 + pxor $axb,$t1 + pxor $t2,$t1 # sigma0(X[i+14]) + paddd $t1,$Xi # Xi+=sigma1(X[i+14]) +___ + &ROUND_00_15($i,@_); + ($Xi,$Xn)=($Xn,$Xi); +} + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl sha256_multi_block +.type sha256_multi_block,\@function,3 +.align 32 +sha256_multi_block: + mov OPENSSL_ia32cap_P+4(%rip),%rcx + bt \$61,%rcx # check SHA bit + jc _shaext_shortcut +___ +$code.=<<___ if ($avx); + test \$`1<<28`,%ecx + jnz _avx_shortcut +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,-0x78(%rax) + movaps %xmm11,-0x68(%rax) + movaps %xmm12,-0x58(%rax) + movaps %xmm13,-0x48(%rax) + movaps %xmm14,-0x38(%rax) + movaps %xmm15,-0x28(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`, %rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original 
%rsp +.Lbody: + lea K256+128(%rip),$Tbl + lea `$REG_SZ*16`(%rsp),%rbx + lea 0x80($ctx),$ctx # size optimization + +.Loop_grande: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldone + + movdqu 0x00-0x80($ctx),$A # load context + lea 128(%rsp),%rax + movdqu 0x20-0x80($ctx),$B + movdqu 0x40-0x80($ctx),$C + movdqu 0x60-0x80($ctx),$D + movdqu 0x80-0x80($ctx),$E + movdqu 0xa0-0x80($ctx),$F + movdqu 0xc0-0x80($ctx),$G + movdqu 0xe0-0x80($ctx),$H + movdqu .Lpbswap(%rip),$Xn + jmp .Loop + +.align 32 +.Loop: + movdqa $C,$bxc + pxor $B,$bxc # magic seed +___ +for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + movdqu `&Xi_off($i)`,$Xi + mov \$3,%ecx + jmp .Loop_16_xx +.align 32 +.Loop_16_xx: +___ +for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + dec %ecx + jnz .Loop_16_xx + + mov \$1,%ecx + lea K256+128(%rip),$Tbl + + movdqa (%rbx),$sigma # pull counters + cmp 4*0(%rbx),%ecx # examine counters + pxor $t1,$t1 + cmovge $Tbl,@ptr[0] # cancel input + cmp 4*1(%rbx),%ecx + movdqa $sigma,$Xn + cmovge $Tbl,@ptr[1] + cmp 4*2(%rbx),%ecx + pcmpgtd $t1,$Xn # mask value + cmovge $Tbl,@ptr[2] + cmp 4*3(%rbx),%ecx + paddd $Xn,$sigma # counters-- + cmovge $Tbl,@ptr[3] + + movdqu 0x00-0x80($ctx),$t1 + pand $Xn,$A + movdqu 0x20-0x80($ctx),$t2 + pand $Xn,$B + movdqu 0x40-0x80($ctx),$t3 + pand $Xn,$C + movdqu 0x60-0x80($ctx),$Xi + pand $Xn,$D + paddd $t1,$A + movdqu 0x80-0x80($ctx),$t1 + pand $Xn,$E + paddd $t2,$B + movdqu 0xa0-0x80($ctx),$t2 + pand $Xn,$F + paddd $t3,$C + movdqu 0xc0-0x80($ctx),$t3 + pand $Xn,$G + paddd $Xi,$D + movdqu 0xe0-0x80($ctx),$Xi + pand $Xn,$H + paddd $t1,$E + paddd $t2,$F + movdqu $A,0x00-0x80($ctx) + paddd $t3,$G + movdqu $B,0x20-0x80($ctx) + paddd $Xi,$H + movdqu $C,0x40-0x80($ctx) + movdqu $D,0x60-0x80($ctx) + movdqu $E,0x80-0x80($ctx) + movdqu $F,0xa0-0x80($ctx) + movdqu $G,0xc0-0x80($ctx) + movdqu $H,0xe0-0x80($ctx) + + movdqa $sigma,(%rbx) # save counters + movdqa .Lpbswap(%rip),$Xn + dec $num + jnz .Loop + + mov `$REG_SZ*17+8`(%rsp),$num + lea $REG_SZ($ctx),$ctx + lea `16*$REG_SZ/4`($inp),$inp + dec $num + jnz .Loop_grande + +.Ldone: + mov `$REG_SZ*17`(%rsp),%rax # original %rsp +___ +$code.=<<___ if ($win64); + movaps -0xb8(%rax),%xmm6 + movaps -0xa8(%rax),%xmm7 + movaps -0x98(%rax),%xmm8 + movaps -0x88(%rax),%xmm9 + movaps -0x78(%rax),%xmm10 + movaps -0x68(%rax),%xmm11 + movaps -0x58(%rax),%xmm12 + movaps -0x48(%rax),%xmm13 + movaps -0x38(%rax),%xmm14 + movaps -0x28(%rax),%xmm15 +___ +$code.=<<___; + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lepilogue: + ret +.size sha256_multi_block,.-sha256_multi_block +___ + {{{ +my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15)); +my @MSG0=map("%xmm$_",(4..7)); +my @MSG1=map("%xmm$_",(8..11)); + +$code.=<<___; +.type sha256_multi_block_shaext,\@function,3 +.align 32 +sha256_multi_block_shaext: +_shaext_shortcut: + mov %rsp,%rax + push %rbx + push %rbp +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,-0x78(%rax) + movaps %xmm11,-0x68(%rax) + movaps %xmm12,-0x58(%rax) + 
movaps %xmm13,-0x48(%rax) + movaps %xmm14,-0x38(%rax) + movaps %xmm15,-0x28(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`,%rsp + shl \$1,$num # we process pair at a time + and \$-256,%rsp + lea 0x80($ctx),$ctx # size optimization + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.Lbody_shaext: + lea `$REG_SZ*16`(%rsp),%rbx + lea K256_shaext+0x80(%rip),$Tbl + +.Loop_grande_shaext: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<2;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle %rsp,@ptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldone_shaext + + movq 0x00-0x80($ctx),$ABEF0 # A1.A0 + movq 0x20-0x80($ctx),@MSG0[0] # B1.B0 + movq 0x40-0x80($ctx),$CDGH0 # C1.C0 + movq 0x60-0x80($ctx),@MSG0[1] # D1.D0 + movq 0x80-0x80($ctx),@MSG1[0] # E1.E0 + movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0 + movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0 + movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0 + + punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0 + punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0 + punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0 + punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0 + movdqa K256_shaext-0x10(%rip),$TMPx # byte swap + + movdqa $ABEF0,$ABEF1 + movdqa $CDGH0,$CDGH1 + punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0 + punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0 + punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1 + punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1 + + pshufd \$0b00011011,$ABEF0,$ABEF0 + pshufd \$0b00011011,$CDGH0,$CDGH0 + pshufd \$0b00011011,$ABEF1,$ABEF1 + pshufd \$0b00011011,$CDGH1,$CDGH1 + jmp .Loop_shaext + +.align 32 +.Loop_shaext: + movdqu 0x00(@ptr[0]),@MSG0[0] + movdqu 0x00(@ptr[1]),@MSG1[0] + movdqu 0x10(@ptr[0]),@MSG0[1] + movdqu 0x10(@ptr[1]),@MSG1[1] + movdqu 0x20(@ptr[0]),@MSG0[2] + pshufb $TMPx,@MSG0[0] + movdqu 0x20(@ptr[1]),@MSG1[2] + pshufb $TMPx,@MSG1[0] + movdqu 0x30(@ptr[0]),@MSG0[3] + lea 0x40(@ptr[0]),@ptr[0] + movdqu 0x30(@ptr[1]),@MSG1[3] + lea 0x40(@ptr[1]),@ptr[1] + + movdqa 0*16-0x80($Tbl),$Wi + pshufb $TMPx,@MSG0[1] + paddd @MSG0[0],$Wi + pxor $ABEF0,@MSG0[0] # black magic + movdqa $Wi,$TMP0 + movdqa 0*16-0x80($Tbl),$TMP1 + pshufb $TMPx,@MSG1[1] + paddd @MSG1[0],$TMP1 + movdqa $CDGH0,0x50(%rsp) # offload + sha256rnds2 $ABEF0,$CDGH0 # 0-3 + pxor $ABEF1,@MSG1[0] # black magic + movdqa $TMP1,$Wi + movdqa $CDGH1,0x70(%rsp) + sha256rnds2 $ABEF1,$CDGH1 # 0-3 + pshufd \$0x0e,$TMP0,$Wi + pxor $ABEF0,@MSG0[0] # black magic + movdqa $ABEF0,0x40(%rsp) # offload + sha256rnds2 $CDGH0,$ABEF0 + pshufd \$0x0e,$TMP1,$Wi + pxor $ABEF1,@MSG1[0] # black magic + movdqa $ABEF1,0x60(%rsp) + movdqa 1*16-0x80($Tbl),$TMP0 + paddd @MSG0[1],$TMP0 + pshufb $TMPx,@MSG0[2] + sha256rnds2 $CDGH1,$ABEF1 + + movdqa $TMP0,$Wi + movdqa 1*16-0x80($Tbl),$TMP1 + paddd @MSG1[1],$TMP1 + sha256rnds2 $ABEF0,$CDGH0 # 4-7 + movdqa $TMP1,$Wi + prefetcht0 127(@ptr[0]) + pshufb $TMPx,@MSG0[3] + pshufb $TMPx,@MSG1[2] + prefetcht0 127(@ptr[1]) + sha256rnds2 $ABEF1,$CDGH1 # 4-7 + pshufd \$0x0e,$TMP0,$Wi + pshufb $TMPx,@MSG1[3] + sha256msg1 @MSG0[1],@MSG0[0] + sha256rnds2 $CDGH0,$ABEF0 + pshufd \$0x0e,$TMP1,$Wi + movdqa 2*16-0x80($Tbl),$TMP0 + paddd @MSG0[2],$TMP0 + sha256rnds2 $CDGH1,$ABEF1 + + movdqa $TMP0,$Wi + movdqa 2*16-0x80($Tbl),$TMP1 + paddd @MSG1[2],$TMP1 + sha256rnds2 $ABEF0,$CDGH0 # 8-11 + sha256msg1 @MSG1[1],@MSG1[0] + movdqa $TMP1,$Wi + movdqa @MSG0[3],$TMPx + sha256rnds2 $ABEF1,$CDGH1 # 8-11 + pshufd \$0x0e,$TMP0,$Wi + 
palignr \$4,@MSG0[2],$TMPx + paddd $TMPx,@MSG0[0] + movdqa @MSG1[3],$TMPx + palignr \$4,@MSG1[2],$TMPx + sha256msg1 @MSG0[2],@MSG0[1] + sha256rnds2 $CDGH0,$ABEF0 + pshufd \$0x0e,$TMP1,$Wi + movdqa 3*16-0x80($Tbl),$TMP0 + paddd @MSG0[3],$TMP0 + sha256rnds2 $CDGH1,$ABEF1 + sha256msg1 @MSG1[2],@MSG1[1] + + movdqa $TMP0,$Wi + movdqa 3*16-0x80($Tbl),$TMP1 + paddd $TMPx,@MSG1[0] + paddd @MSG1[3],$TMP1 + sha256msg2 @MSG0[3],@MSG0[0] + sha256rnds2 $ABEF0,$CDGH0 # 12-15 + movdqa $TMP1,$Wi + movdqa @MSG0[0],$TMPx + palignr \$4,@MSG0[3],$TMPx + sha256rnds2 $ABEF1,$CDGH1 # 12-15 + sha256msg2 @MSG1[3],@MSG1[0] + pshufd \$0x0e,$TMP0,$Wi + paddd $TMPx,@MSG0[1] + movdqa @MSG1[0],$TMPx + palignr \$4,@MSG1[3],$TMPx + sha256msg1 @MSG0[3],@MSG0[2] + sha256rnds2 $CDGH0,$ABEF0 + pshufd \$0x0e,$TMP1,$Wi + movdqa 4*16-0x80($Tbl),$TMP0 + paddd @MSG0[0],$TMP0 + sha256rnds2 $CDGH1,$ABEF1 + sha256msg1 @MSG1[3],@MSG1[2] +___ +for($i=4;$i<16-3;$i++) { +$code.=<<___; + movdqa $TMP0,$Wi + movdqa $i*16-0x80($Tbl),$TMP1 + paddd $TMPx,@MSG1[1] + paddd @MSG1[0],$TMP1 + sha256msg2 @MSG0[0],@MSG0[1] + sha256rnds2 $ABEF0,$CDGH0 # 16-19... + movdqa $TMP1,$Wi + movdqa @MSG0[1],$TMPx + palignr \$4,@MSG0[0],$TMPx + sha256rnds2 $ABEF1,$CDGH1 # 16-19... + sha256msg2 @MSG1[0],@MSG1[1] + pshufd \$0x0e,$TMP0,$Wi + paddd $TMPx,@MSG0[2] + movdqa @MSG1[1],$TMPx + palignr \$4,@MSG1[0],$TMPx + sha256msg1 @MSG0[0],@MSG0[3] + sha256rnds2 $CDGH0,$ABEF0 + pshufd \$0x0e,$TMP1,$Wi + movdqa `($i+1)*16`-0x80($Tbl),$TMP0 + paddd @MSG0[1],$TMP0 + sha256rnds2 $CDGH1,$ABEF1 + sha256msg1 @MSG1[0],@MSG1[3] +___ + push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); +} +$code.=<<___; + movdqa $TMP0,$Wi + movdqa 13*16-0x80($Tbl),$TMP1 + paddd $TMPx,@MSG1[1] + paddd @MSG1[0],$TMP1 + sha256msg2 @MSG0[0],@MSG0[1] + sha256rnds2 $ABEF0,$CDGH0 # 52-55 + movdqa $TMP1,$Wi + movdqa @MSG0[1],$TMPx + palignr \$4,@MSG0[0],$TMPx + sha256rnds2 $ABEF1,$CDGH1 # 52-55 + sha256msg2 @MSG1[0],@MSG1[1] + pshufd \$0x0e,$TMP0,$Wi + paddd $TMPx,@MSG0[2] + movdqa @MSG1[1],$TMPx + palignr \$4,@MSG1[0],$TMPx + nop + sha256rnds2 $CDGH0,$ABEF0 + pshufd \$0x0e,$TMP1,$Wi + movdqa 14*16-0x80($Tbl),$TMP0 + paddd @MSG0[1],$TMP0 + sha256rnds2 $CDGH1,$ABEF1 + + movdqa $TMP0,$Wi + movdqa 14*16-0x80($Tbl),$TMP1 + paddd $TMPx,@MSG1[2] + paddd @MSG1[1],$TMP1 + sha256msg2 @MSG0[1],@MSG0[2] + nop + sha256rnds2 $ABEF0,$CDGH0 # 56-59 + movdqa $TMP1,$Wi + mov \$1,%ecx + pxor @MSG0[1],@MSG0[1] # zero + sha256rnds2 $ABEF1,$CDGH1 # 56-59 + sha256msg2 @MSG1[1],@MSG1[2] + pshufd \$0x0e,$TMP0,$Wi + movdqa 15*16-0x80($Tbl),$TMP0 + paddd @MSG0[2],$TMP0 + movq (%rbx),@MSG0[2] # pull counters + nop + sha256rnds2 $CDGH0,$ABEF0 + pshufd \$0x0e,$TMP1,$Wi + movdqa 15*16-0x80($Tbl),$TMP1 + paddd @MSG1[2],$TMP1 + sha256rnds2 $CDGH1,$ABEF1 + + movdqa $TMP0,$Wi + cmp 4*0(%rbx),%ecx # examine counters + cmovge %rsp,@ptr[0] # cancel input + cmp 4*1(%rbx),%ecx + cmovge %rsp,@ptr[1] + pshufd \$0x00,@MSG0[2],@MSG1[0] + sha256rnds2 $ABEF0,$CDGH0 # 60-63 + movdqa $TMP1,$Wi + pshufd \$0x55,@MSG0[2],@MSG1[1] + movdqa @MSG0[2],@MSG1[2] + sha256rnds2 $ABEF1,$CDGH1 # 60-63 + pshufd \$0x0e,$TMP0,$Wi + pcmpgtd @MSG0[1],@MSG1[0] + pcmpgtd @MSG0[1],@MSG1[1] + sha256rnds2 $CDGH0,$ABEF0 + pshufd \$0x0e,$TMP1,$Wi + pcmpgtd @MSG0[1],@MSG1[2] # counter mask + movdqa K256_shaext-0x10(%rip),$TMPx + sha256rnds2 $CDGH1,$ABEF1 + + pand @MSG1[0],$CDGH0 + pand @MSG1[1],$CDGH1 + pand @MSG1[0],$ABEF0 + pand @MSG1[1],$ABEF1 + paddd @MSG0[2],@MSG1[2] # counters-- + + paddd 0x50(%rsp),$CDGH0 + paddd 0x70(%rsp),$CDGH1 + paddd 0x40(%rsp),$ABEF0 + paddd 
0x60(%rsp),$ABEF1 + + movq @MSG1[2],(%rbx) # save counters + dec $num + jnz .Loop_shaext + + mov `$REG_SZ*17+8`(%rsp),$num + + pshufd \$0b00011011,$ABEF0,$ABEF0 + pshufd \$0b00011011,$CDGH0,$CDGH0 + pshufd \$0b00011011,$ABEF1,$ABEF1 + pshufd \$0b00011011,$CDGH1,$CDGH1 + + movdqa $ABEF0,@MSG0[0] + movdqa $CDGH0,@MSG0[1] + punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0 + punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0 + punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0 + punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0 + + movq $ABEF0,0x00-0x80($ctx) # A1.A0 + psrldq \$8,$ABEF0 + movq @MSG0[0],0x80-0x80($ctx) # E1.E0 + psrldq \$8,@MSG0[0] + movq $ABEF0,0x20-0x80($ctx) # B1.B0 + movq @MSG0[0],0xa0-0x80($ctx) # F1.F0 + + movq $CDGH0,0x40-0x80($ctx) # C1.C0 + psrldq \$8,$CDGH0 + movq @MSG0[1],0xc0-0x80($ctx) # G1.G0 + psrldq \$8,@MSG0[1] + movq $CDGH0,0x60-0x80($ctx) # D1.D0 + movq @MSG0[1],0xe0-0x80($ctx) # H1.H0 + + lea `$REG_SZ/2`($ctx),$ctx + lea `16*2`($inp),$inp + dec $num + jnz .Loop_grande_shaext + +.Ldone_shaext: + #mov `$REG_SZ*17`(%rsp),%rax # original %rsp +___ +$code.=<<___ if ($win64); + movaps -0xb8(%rax),%xmm6 + movaps -0xa8(%rax),%xmm7 + movaps -0x98(%rax),%xmm8 + movaps -0x88(%rax),%xmm9 + movaps -0x78(%rax),%xmm10 + movaps -0x68(%rax),%xmm11 + movaps -0x58(%rax),%xmm12 + movaps -0x48(%rax),%xmm13 + movaps -0x38(%rax),%xmm14 + movaps -0x28(%rax),%xmm15 +___ +$code.=<<___; + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lepilogue_shaext: + ret +.size sha256_multi_block_shaext,.-sha256_multi_block_shaext +___ + }}} + if ($avx) {{{ +sub ROUND_00_15_avx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; + +$code.=<<___ if ($i<15 && $REG_SZ==16); + vmovd `4*$i`(@ptr[0]),$Xi + vmovd `4*$i`(@ptr[1]),$t1 + vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi + vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 + vpunpckldq $t1,$Xi,$Xi + vpshufb $Xn,$Xi,$Xi +___ +$code.=<<___ if ($i==15 && $REG_SZ==16); + vmovd `4*$i`(@ptr[0]),$Xi + lea `16*4`(@ptr[0]),@ptr[0] + vmovd `4*$i`(@ptr[1]),$t1 + lea `16*4`(@ptr[1]),@ptr[1] + vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi + lea `16*4`(@ptr[2]),@ptr[2] + vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 + lea `16*4`(@ptr[3]),@ptr[3] + vpunpckldq $t1,$Xi,$Xi + vpshufb $Xn,$Xi,$Xi +___ +$code.=<<___ if ($i<15 && $REG_SZ==32); + vmovd `4*$i`(@ptr[0]),$Xi + vmovd `4*$i`(@ptr[4]),$t1 + vmovd `4*$i`(@ptr[1]),$t2 + vmovd `4*$i`(@ptr[5]),$t3 + vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi + vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 + vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 + vpunpckldq $t2,$Xi,$Xi + vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 + vpunpckldq $t3,$t1,$t1 + vinserti128 $t1,$Xi,$Xi + vpshufb $Xn,$Xi,$Xi +___ +$code.=<<___ if ($i==15 && $REG_SZ==32); + vmovd `4*$i`(@ptr[0]),$Xi + lea `16*4`(@ptr[0]),@ptr[0] + vmovd `4*$i`(@ptr[4]),$t1 + lea `16*4`(@ptr[4]),@ptr[4] + vmovd `4*$i`(@ptr[1]),$t2 + lea `16*4`(@ptr[1]),@ptr[1] + vmovd `4*$i`(@ptr[5]),$t3 + lea `16*4`(@ptr[5]),@ptr[5] + vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi + lea `16*4`(@ptr[2]),@ptr[2] + vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 + lea `16*4`(@ptr[6]),@ptr[6] + vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 + lea `16*4`(@ptr[3]),@ptr[3] + vpunpckldq $t2,$Xi,$Xi + vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 + lea `16*4`(@ptr[7]),@ptr[7] + vpunpckldq $t3,$t1,$t1 + vinserti128 $t1,$Xi,$Xi + vpshufb $Xn,$Xi,$Xi +___ +$code.=<<___; + vpsrld \$6,$e,$sigma + vpslld \$26,$e,$t3 + vmovdqu $Xi,`&Xi_off($i)` + vpaddd $h,$Xi,$Xi # Xi+=h + + vpsrld \$11,$e,$t2 + vpxor $t3,$sigma,$sigma + vpslld \$21,$e,$t3 + vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round] + vpxor $t2,$sigma,$sigma + + vpsrld \$25,$e,$t2 + vpxor $t3,$sigma,$sigma + 
`"prefetcht0 63(@ptr[0])" if ($i==15)` + vpslld \$7,$e,$t3 + vpandn $g,$e,$t1 + vpand $f,$e,$axb # borrow $axb + `"prefetcht0 63(@ptr[1])" if ($i==15)` + vpxor $t2,$sigma,$sigma + + vpsrld \$2,$a,$h # borrow $h + vpxor $t3,$sigma,$sigma # Sigma1(e) + `"prefetcht0 63(@ptr[2])" if ($i==15)` + vpslld \$30,$a,$t2 + vpxor $axb,$t1,$t1 # Ch(e,f,g) + vpxor $a,$b,$axb # a^b, b^c in next round + `"prefetcht0 63(@ptr[3])" if ($i==15)` + vpxor $t2,$h,$h + vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e) + + vpsrld \$13,$a,$t2 + `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` + vpslld \$19,$a,$t3 + vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g) + vpand $axb,$bxc,$bxc + `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` + vpxor $t2,$h,$sigma + + vpsrld \$22,$a,$t2 + vpxor $t3,$sigma,$sigma + `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` + vpslld \$10,$a,$t3 + vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + vpaddd $Xi,$d,$d # d+=Xi + `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` + vpxor $t2,$sigma,$sigma + vpxor $t3,$sigma,$sigma # Sigma0(a) + + vpaddd $Xi,$h,$h # h+=Xi + vpaddd $sigma,$h,$h # h+=Sigma0(a) +___ +$code.=<<___ if (($i%8)==7); + add \$`32*8`,$Tbl +___ + ($axb,$bxc)=($bxc,$axb); +} + +sub ROUND_16_XX_avx { +my $i=shift; + +$code.=<<___; + vmovdqu `&Xi_off($i+1)`,$Xn + vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9] + + vpsrld \$3,$Xn,$sigma + vpsrld \$7,$Xn,$t2 + vpslld \$25,$Xn,$t3 + vpxor $t2,$sigma,$sigma + vpsrld \$18,$Xn,$t2 + vpxor $t3,$sigma,$sigma + vpslld \$14,$Xn,$t3 + vmovdqu `&Xi_off($i+14)`,$t1 + vpsrld \$10,$t1,$axb # borrow $axb + + vpxor $t2,$sigma,$sigma + vpsrld \$17,$t1,$t2 + vpxor $t3,$sigma,$sigma # sigma0(X[i+1]) + vpslld \$15,$t1,$t3 + vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e) + vpxor $t2,$axb,$sigma + vpsrld \$19,$t1,$t2 + vpxor $t3,$sigma,$sigma + vpslld \$13,$t1,$t3 + vpxor $t2,$sigma,$sigma + vpxor $t3,$sigma,$sigma # sigma0(X[i+14]) + vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14]) +___ + &ROUND_00_15_avx($i,@_); + ($Xi,$Xn)=($Xn,$Xi); +} + +$code.=<<___; +.type sha256_multi_block_avx,\@function,3 +.align 32 +sha256_multi_block_avx: +_avx_shortcut: +___ +$code.=<<___ if ($avx>1); + shr \$32,%rcx + cmp \$2,$num + jb .Lavx + test \$`1<<5`,%ecx + jnz _avx2_shortcut + jmp .Lavx +.align 32 +.Lavx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,-0x78(%rax) + movaps %xmm11,-0x68(%rax) + movaps %xmm12,-0x58(%rax) + movaps %xmm13,-0x48(%rax) + movaps %xmm14,-0x38(%rax) + movaps %xmm15,-0x28(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`, %rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.Lbody_avx: + lea K256+128(%rip),$Tbl + lea `$REG_SZ*16`(%rsp),%rbx + lea 0x80($ctx),$ctx # size optimization + +.Loop_grande_avx: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldone_avx + + vmovdqu 0x00-0x80($ctx),$A # load context + lea 128(%rsp),%rax + vmovdqu 0x20-0x80($ctx),$B + vmovdqu 0x40-0x80($ctx),$C + vmovdqu 0x60-0x80($ctx),$D + vmovdqu 0x80-0x80($ctx),$E + vmovdqu 0xa0-0x80($ctx),$F + vmovdqu 0xc0-0x80($ctx),$G + vmovdqu 0xe0-0x80($ctx),$H + 
vmovdqu .Lpbswap(%rip),$Xn + jmp .Loop_avx + +.align 32 +.Loop_avx: + vpxor $B,$C,$bxc # magic seed +___ +for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + vmovdqu `&Xi_off($i)`,$Xi + mov \$3,%ecx + jmp .Loop_16_xx_avx +.align 32 +.Loop_16_xx_avx: +___ +for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + dec %ecx + jnz .Loop_16_xx_avx + + mov \$1,%ecx + lea K256+128(%rip),$Tbl +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + cmp `4*$i`(%rbx),%ecx # examine counters + cmovge $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqa (%rbx),$sigma # pull counters + vpxor $t1,$t1,$t1 + vmovdqa $sigma,$Xn + vpcmpgtd $t1,$Xn,$Xn # mask value + vpaddd $Xn,$sigma,$sigma # counters-- + + vmovdqu 0x00-0x80($ctx),$t1 + vpand $Xn,$A,$A + vmovdqu 0x20-0x80($ctx),$t2 + vpand $Xn,$B,$B + vmovdqu 0x40-0x80($ctx),$t3 + vpand $Xn,$C,$C + vmovdqu 0x60-0x80($ctx),$Xi + vpand $Xn,$D,$D + vpaddd $t1,$A,$A + vmovdqu 0x80-0x80($ctx),$t1 + vpand $Xn,$E,$E + vpaddd $t2,$B,$B + vmovdqu 0xa0-0x80($ctx),$t2 + vpand $Xn,$F,$F + vpaddd $t3,$C,$C + vmovdqu 0xc0-0x80($ctx),$t3 + vpand $Xn,$G,$G + vpaddd $Xi,$D,$D + vmovdqu 0xe0-0x80($ctx),$Xi + vpand $Xn,$H,$H + vpaddd $t1,$E,$E + vpaddd $t2,$F,$F + vmovdqu $A,0x00-0x80($ctx) + vpaddd $t3,$G,$G + vmovdqu $B,0x20-0x80($ctx) + vpaddd $Xi,$H,$H + vmovdqu $C,0x40-0x80($ctx) + vmovdqu $D,0x60-0x80($ctx) + vmovdqu $E,0x80-0x80($ctx) + vmovdqu $F,0xa0-0x80($ctx) + vmovdqu $G,0xc0-0x80($ctx) + vmovdqu $H,0xe0-0x80($ctx) + + vmovdqu $sigma,(%rbx) # save counters + vmovdqu .Lpbswap(%rip),$Xn + dec $num + jnz .Loop_avx + + mov `$REG_SZ*17+8`(%rsp),$num + lea $REG_SZ($ctx),$ctx + lea `16*$REG_SZ/4`($inp),$inp + dec $num + jnz .Loop_grande_avx + +.Ldone_avx: + mov `$REG_SZ*17`(%rsp),%rax # original %rsp + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xb8(%rax),%xmm6 + movaps -0xa8(%rax),%xmm7 + movaps -0x98(%rax),%xmm8 + movaps -0x88(%rax),%xmm9 + movaps -0x78(%rax),%xmm10 + movaps -0x68(%rax),%xmm11 + movaps -0x58(%rax),%xmm12 + movaps -0x48(%rax),%xmm13 + movaps -0x38(%rax),%xmm14 + movaps -0x28(%rax),%xmm15 +___ +$code.=<<___; + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lepilogue_avx: + ret +.size sha256_multi_block_avx,.-sha256_multi_block_avx +___ + if ($avx>1) { +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +$REG_SZ=32; +@ptr=map("%r$_",(12..15,8..11)); + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15)); +($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7)); + +$code.=<<___; +.type sha256_multi_block_avx2,\@function,3 +.align 32 +sha256_multi_block_avx2: +_avx2_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`, %rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.Lbody_avx2: + lea K256+128(%rip),$Tbl + lea 0x80($ctx),$ctx # size optimization + +.Loop_grande_avx2: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num + lea `$REG_SZ*16`(%rsp),%rbx +___ +for($i=0;$i<8;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov 
%ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqu 0x00-0x80($ctx),$A # load context + lea 128(%rsp),%rax + vmovdqu 0x20-0x80($ctx),$B + lea 256+128(%rsp),%rbx + vmovdqu 0x40-0x80($ctx),$C + vmovdqu 0x60-0x80($ctx),$D + vmovdqu 0x80-0x80($ctx),$E + vmovdqu 0xa0-0x80($ctx),$F + vmovdqu 0xc0-0x80($ctx),$G + vmovdqu 0xe0-0x80($ctx),$H + vmovdqu .Lpbswap(%rip),$Xn + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + vpxor $B,$C,$bxc # magic seed +___ +for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + vmovdqu `&Xi_off($i)`,$Xi + mov \$3,%ecx + jmp .Loop_16_xx_avx2 +.align 32 +.Loop_16_xx_avx2: +___ +for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + dec %ecx + jnz .Loop_16_xx_avx2 + + mov \$1,%ecx + lea `$REG_SZ*16`(%rsp),%rbx + lea K256+128(%rip),$Tbl +___ +for($i=0;$i<8;$i++) { + $code.=<<___; + cmp `4*$i`(%rbx),%ecx # examine counters + cmovge $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqa (%rbx),$sigma # pull counters + vpxor $t1,$t1,$t1 + vmovdqa $sigma,$Xn + vpcmpgtd $t1,$Xn,$Xn # mask value + vpaddd $Xn,$sigma,$sigma # counters-- + + vmovdqu 0x00-0x80($ctx),$t1 + vpand $Xn,$A,$A + vmovdqu 0x20-0x80($ctx),$t2 + vpand $Xn,$B,$B + vmovdqu 0x40-0x80($ctx),$t3 + vpand $Xn,$C,$C + vmovdqu 0x60-0x80($ctx),$Xi + vpand $Xn,$D,$D + vpaddd $t1,$A,$A + vmovdqu 0x80-0x80($ctx),$t1 + vpand $Xn,$E,$E + vpaddd $t2,$B,$B + vmovdqu 0xa0-0x80($ctx),$t2 + vpand $Xn,$F,$F + vpaddd $t3,$C,$C + vmovdqu 0xc0-0x80($ctx),$t3 + vpand $Xn,$G,$G + vpaddd $Xi,$D,$D + vmovdqu 0xe0-0x80($ctx),$Xi + vpand $Xn,$H,$H + vpaddd $t1,$E,$E + vpaddd $t2,$F,$F + vmovdqu $A,0x00-0x80($ctx) + vpaddd $t3,$G,$G + vmovdqu $B,0x20-0x80($ctx) + vpaddd $Xi,$H,$H + vmovdqu $C,0x40-0x80($ctx) + vmovdqu $D,0x60-0x80($ctx) + vmovdqu $E,0x80-0x80($ctx) + vmovdqu $F,0xa0-0x80($ctx) + vmovdqu $G,0xc0-0x80($ctx) + vmovdqu $H,0xe0-0x80($ctx) + + vmovdqu $sigma,(%rbx) # save counters + lea 256+128(%rsp),%rbx + vmovdqu .Lpbswap(%rip),$Xn + dec $num + jnz .Loop_avx2 + + #mov `$REG_SZ*17+8`(%rsp),$num + #lea $REG_SZ($ctx),$ctx + #lea `16*$REG_SZ/4`($inp),$inp + #dec $num + #jnz .Loop_grande_avx2 + +.Ldone_avx2: + mov `$REG_SZ*17`(%rsp),%rax # original %rsp + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Lepilogue_avx2: + ret +.size sha256_multi_block_avx2,.-sha256_multi_block_avx2 +___ + } }}} +$code.=<<___; +.align 256 +K256: +___ +sub TABLE { + foreach (@_) { + $code.=<<___; + .long $_,$_,$_,$_ + .long $_,$_,$_,$_ +___ + } +} +&TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, + 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, + 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, + 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, + 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, + 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, + 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, + 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, + 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, + 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, + 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, + 
0xd192e819,0xd6990624,0xf40e3585,0x106aa070, + 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, + 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, + 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, + 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); +$code.=<<___; +.Lpbswap: + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap +K256_shaext: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by " +___ + +if ($win64) { +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<.Lbody + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lin_prologue + + mov `16*17`(%rax),%rax # pull saved stack pointer + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + + lea -24-10*16(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size 
se_handler,.-se_handler +___ +$code.=<<___ if ($avx>1); +.type avx2_handler,\@abi-omnipotent +.align 16 +avx2_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->Rip<body label + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + mov `32*17`($context),%rax # pull saved stack pointer + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + + lea -56-10*16(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lin_prologue +.size avx2_handler,.-avx2_handler +___ +$code.=<<___; +.section .pdata +.align 4 + .rva .LSEH_begin_sha256_multi_block + .rva .LSEH_end_sha256_multi_block + .rva .LSEH_info_sha256_multi_block + .rva .LSEH_begin_sha256_multi_block_shaext + .rva .LSEH_end_sha256_multi_block_shaext + .rva .LSEH_info_sha256_multi_block_shaext +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_sha256_multi_block_avx + .rva .LSEH_end_sha256_multi_block_avx + .rva .LSEH_info_sha256_multi_block_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_sha256_multi_block_avx2 + .rva .LSEH_end_sha256_multi_block_avx2 + .rva .LSEH_info_sha256_multi_block_avx2 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_sha256_multi_block: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbody,.Lepilogue # HandlerData[] +.LSEH_info_sha256_multi_block_shaext: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_sha256_multi_block_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] +___ +$code.=<<___ if ($avx>1); +.LSEH_info_sha256_multi_block_avx2: + .byte 9,0,0,0 + .rva avx2_handler + .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] +___ +} +#################################################################### + +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if ($dst>=8); + $rex|=0x01 if ($src>=8); + unshift @opcode,$rex|0x40 if ($rex); +} + +sub sha256op38 { + my $instr = shift; + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x0f,0x38); + rex(\@opcode,$2,$1); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or + + s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or + s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or +
s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or + s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; + + print $_,"\n"; +} + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512-586.pl b/openssl-1.1.0h/crypto/sha/asm/sha512-586.pl new file mode 100644 index 0000000..3873934 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha512-586.pl @@ -0,0 +1,924 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA512 block transform for x86. September 2007. +# +# May 2013. +# +# Add SSSE3 code path, 20-25% improvement [over original SSE2 code]. +# +# Performance in clock cycles per processed byte (less is better): +# +# gcc icc x86 asm SIMD(*) x86_64(**) +# Pentium 100 97 61 - - +# PIII 75 77 56 - - +# P4 116 95 82 34.6 30.8 +# AMD K8 54 55 36 20.7 9.57 +# Core2 66 57 40 15.9 9.97 +# Westmere 70 - 38 12.2 9.58 +# Sandy Bridge 58 - 35 11.9 11.2 +# Ivy Bridge 50 - 33 11.5 8.17 +# Haswell 46 - 29 11.3 7.66 +# Bulldozer 121 - 50 14.0 13.5 +# VIA Nano 91 - 52 33 14.7 +# Atom 126 - 68 48(***) 14.7 +# Silvermont 97 - 58 42(***) 17.5 +# Goldmont 80 - 48 19.5 12.0 +# +# (*) whichever best applicable. +# (**) x86_64 assembler performance is presented for reference +# purposes, the results are for integer-only code. +# (***) paddq is incredibly slow on Atom. +# +# IALU code-path is optimized for elder Pentiums. On vanilla Pentium +# performance improvement over compiler generated code reaches ~60%, +# while on PIII - ~35%. On newer µ-archs improvement varies from 15% +# to 50%, but it's less important as they are expected to execute SSE2 +# code-path, which is commonly ~2-3x faster [than compiler generated +# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even +# though it does not use 128-bit operations. The latter means that +# SSE2-aware kernel is no longer required to execute the code. Another +# difference is that new code optimizes amount of writes, but at the +# cost of increased data cache "footprint" by 1/2KB.
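As a reading aid for the 32-bit code paths that follow (an editorial sketch, not part of the patch), the plain Perl below shows how a 64-bit right-rotate is split into the hi/lo 32-bit halves that the "LO ... / HI ..." comments in BODY_00_15_x86 describe. The helper names rotr64_split and Sigma1_split are illustrative only, and the self-check assumes a perl built with 64-bit integers.

use strict;
use warnings;

# ROTR by n (0 < n < 32) on a value kept as two 32-bit halves:
#   lo' = lo>>n ^ hi<<(32-n),  hi' = hi>>n ^ lo<<(32-n)
sub rotr64_split {
    my ($hi, $lo, $n) = @_;
    my $m = 0xffffffff;
    return ((($hi >> $n) | ($lo << (32 - $n))) & $m,
            (($lo >> $n) | ($hi << (32 - $n))) & $m);
}

# Sigma1(e) = ROTR(e,14) ^ ROTR(e,18) ^ ROTR(e,41); a rotate by 41 = 32+9
# is a rotate by 9 with the two halves swapped, hence the swapped arguments.
sub Sigma1_split {
    my ($hi, $lo) = @_;
    my ($h14, $l14) = rotr64_split($hi, $lo, 14);
    my ($h18, $l18) = rotr64_split($hi, $lo, 18);
    my ($h41, $l41) = rotr64_split($lo, $hi, 9);
    return ($h14 ^ $h18 ^ $h41, $l14 ^ $l18 ^ $l41);
}

# self-check against a native 64-bit rotate (needs a 64-bit perl)
my ($hi, $lo) = (0x12345678, 0x9abcdef0);
my $e   = ($hi << 32) | $lo;
my $ref = 0;
$ref ^= (($e >> $_) | ($e << (64 - $_))) & 0xffffffffffffffff for (14, 18, 41);
my ($shi, $slo) = Sigma1_split($hi, $lo);
print((($shi << 32) | $slo) == $ref ? "ok\n" : "mismatch\n");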
+ +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386"); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +&external_label("OPENSSL_ia32cap_P") if ($sse2); + +$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp"); +$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp"); +$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp"); +$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp"); +$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp"); +$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp"); +$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp"); +$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp"); +$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp"); +$K512="ebp"; + +$Asse2=&QWP(0,"esp"); +$Bsse2=&QWP(8,"esp"); +$Csse2=&QWP(16,"esp"); +$Dsse2=&QWP(24,"esp"); +$Esse2=&QWP(32,"esp"); +$Fsse2=&QWP(40,"esp"); +$Gsse2=&QWP(48,"esp"); +$Hsse2=&QWP(56,"esp"); + +$A="mm0"; # B-D and +$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and + # mm5-mm7, but it's done on on-demand basis... +$BxC="mm2"; # ... except for B^C + +sub BODY_00_15_sse2 { + my $phase=shift; + + #&movq ("mm5",$Fsse2); # load f + #&movq ("mm6",$Gsse2); # load g + + &movq ("mm1",$E); # %mm1 is sliding right + &pxor ("mm5","mm6"); # f^=g + &psrlq ("mm1",14); + &movq ($Esse2,$E); # modulo-scheduled save e + &pand ("mm5",$E); # f&=e + &psllq ($E,23); # $E is sliding left + &movq ($A,"mm3") if ($phase<2); + &movq (&QWP(8*9,"esp"),"mm7") # save X[i] + &movq ("mm3","mm1"); # %mm3 is T1 + &psrlq ("mm1",4); + &pxor ("mm5","mm6"); # Ch(e,f,g) + &pxor ("mm3",$E); + &psllq ($E,23); + &pxor ("mm3","mm1"); + &movq ($Asse2,$A); # modulo-scheduled save a + &paddq ("mm7","mm5"); # X[i]+=Ch(e,f,g) + &pxor ("mm3",$E); + &psrlq ("mm1",23); + &paddq ("mm7",$Hsse2); # X[i]+=h + &pxor ("mm3","mm1"); + &psllq ($E,4); + &paddq ("mm7",QWP(0,$K512)); # X[i]+=K512[i] + &pxor ("mm3",$E); # T1=Sigma1_512(e) + + &movq ($E,$Dsse2); # e = load d, e in next round + &paddq ("mm3","mm7"); # T1+=X[i] + &movq ("mm5",$A); # %mm5 is sliding right + &psrlq ("mm5",28); + &paddq ($E,"mm3"); # d += T1 + &movq ("mm6",$A); # %mm6 is sliding left + &movq ("mm7","mm5"); + &psllq ("mm6",25); + &movq ("mm1",$Bsse2); # load b + &psrlq ("mm5",6); + &pxor ("mm7","mm6"); + &sub ("esp",8); + &psllq ("mm6",5); + &pxor ("mm7","mm5"); + &pxor ($A,"mm1"); # a^b, b^c in next round + &psrlq ("mm5",5); + &pxor ("mm7","mm6"); + &pand ($BxC,$A); # (b^c)&(a^b) + &psllq ("mm6",6); + &pxor ("mm7","mm5"); + &pxor ($BxC,"mm1"); # [h=]Maj(a,b,c) + &pxor ("mm6","mm7"); # Sigma0_512(a) + &movq ("mm7",&QWP(8*(9+16-1),"esp")) if ($phase!=0); # pre-fetch + &movq ("mm5",$Fsse2) if ($phase==0); # load f + + if ($phase>1) { + &paddq ($BxC,"mm6"); # h+=Sigma0(a) + &add ($K512,8); + #&paddq ($BxC,"mm3"); # h+=T1 + + ($A,$BxC) = ($BxC,$A); # rotate registers + } else { + &paddq ("mm3",$BxC); # T1+=Maj(a,b,c) + &movq ($BxC,$A); + &add ($K512,8); + &paddq ("mm3","mm6"); # T1+=Sigma0(a) + &movq ("mm6",$Gsse2) if ($phase==0); # load g + #&movq ($A,"mm3"); # h=T1 + } +} + +sub BODY_00_15_x86 { + #define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + # LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + # HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + &mov ("ecx",$Elo); + &mov ("edx",$Ehi); + &mov ("esi","ecx"); + + &shr ("ecx",9); # lo>>9 + &mov ("edi","edx"); + &shr ("edx",9); # hi>>9 + &mov ("ebx","ecx"); + &shl ("esi",14); # lo<<14 + &mov ("eax","edx"); + &shl ("edi",14); # hi<<14 + &xor ("ebx","esi"); + + &shr 
("ecx",14-9); # lo>>14 + &xor ("eax","edi"); + &shr ("edx",14-9); # hi>>14 + &xor ("eax","ecx"); + &shl ("esi",18-14); # lo<<18 + &xor ("ebx","edx"); + &shl ("edi",18-14); # hi<<18 + &xor ("ebx","esi"); + + &shr ("ecx",18-14); # lo>>18 + &xor ("eax","edi"); + &shr ("edx",18-14); # hi>>18 + &xor ("eax","ecx"); + &shl ("esi",23-18); # lo<<23 + &xor ("ebx","edx"); + &shl ("edi",23-18); # hi<<23 + &xor ("eax","esi"); + &xor ("ebx","edi"); # T1 = Sigma1(e) + + &mov ("ecx",$Flo); + &mov ("edx",$Fhi); + &mov ("esi",$Glo); + &mov ("edi",$Ghi); + &add ("eax",$Hlo); + &adc ("ebx",$Hhi); # T1 += h + &xor ("ecx","esi"); + &xor ("edx","edi"); + &and ("ecx",$Elo); + &and ("edx",$Ehi); + &add ("eax",&DWP(8*(9+15)+0,"esp")); + &adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0] + &xor ("ecx","esi"); + &xor ("edx","edi"); # Ch(e,f,g) = (f^g)&e)^g + + &mov ("esi",&DWP(0,$K512)); + &mov ("edi",&DWP(4,$K512)); # K[i] + &add ("eax","ecx"); + &adc ("ebx","edx"); # T1 += Ch(e,f,g) + &mov ("ecx",$Dlo); + &mov ("edx",$Dhi); + &add ("eax","esi"); + &adc ("ebx","edi"); # T1 += K[i] + &mov ($Tlo,"eax"); + &mov ($Thi,"ebx"); # put T1 away + &add ("eax","ecx"); + &adc ("ebx","edx"); # d += T1 + + #define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + # LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + # HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + &mov ("ecx",$Alo); + &mov ("edx",$Ahi); + &mov ($Dlo,"eax"); + &mov ($Dhi,"ebx"); + &mov ("esi","ecx"); + + &shr ("ecx",2); # lo>>2 + &mov ("edi","edx"); + &shr ("edx",2); # hi>>2 + &mov ("ebx","ecx"); + &shl ("esi",4); # lo<<4 + &mov ("eax","edx"); + &shl ("edi",4); # hi<<4 + &xor ("ebx","esi"); + + &shr ("ecx",7-2); # lo>>7 + &xor ("eax","edi"); + &shr ("edx",7-2); # hi>>7 + &xor ("ebx","ecx"); + &shl ("esi",25-4); # lo<<25 + &xor ("eax","edx"); + &shl ("edi",25-4); # hi<<25 + &xor ("eax","esi"); + + &shr ("ecx",28-7); # lo>>28 + &xor ("ebx","edi"); + &shr ("edx",28-7); # hi>>28 + &xor ("eax","ecx"); + &shl ("esi",30-25); # lo<<30 + &xor ("ebx","edx"); + &shl ("edi",30-25); # hi<<30 + &xor ("eax","esi"); + &xor ("ebx","edi"); # Sigma0(a) + + &mov ("ecx",$Alo); + &mov ("edx",$Ahi); + &mov ("esi",$Blo); + &mov ("edi",$Bhi); + &add ("eax",$Tlo); + &adc ("ebx",$Thi); # T1 = Sigma0(a)+T1 + &or ("ecx","esi"); + &or ("edx","edi"); + &and ("ecx",$Clo); + &and ("edx",$Chi); + &and ("esi",$Alo); + &and ("edi",$Ahi); + &or ("ecx","esi"); + &or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b) + + &add ("eax","ecx"); + &adc ("ebx","edx"); # T1 += Maj(a,b,c) + &mov ($Tlo,"eax"); + &mov ($Thi,"ebx"); + + &mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K + &sub ("esp",8); + &lea ($K512,&DWP(8,$K512)); # K++ +} + + +&function_begin("sha512_block_data_order"); + &mov ("esi",wparam(0)); # ctx + &mov ("edi",wparam(1)); # inp + &mov ("eax",wparam(2)); # num + &mov ("ebx","esp"); # saved sp + + &call (&label("pic_point")); # make it PIC! 
+&set_label("pic_point"); + &blindpop($K512); + &lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512)); + + &sub ("esp",16); + &and ("esp",-64); + + &shl ("eax",7); + &add ("eax","edi"); + &mov (&DWP(0,"esp"),"esi"); # ctx + &mov (&DWP(4,"esp"),"edi"); # inp + &mov (&DWP(8,"esp"),"eax"); # inp+num*128 + &mov (&DWP(12,"esp"),"ebx"); # saved sp + +if ($sse2) { + &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512")); + &mov ("ecx",&DWP(0,"edx")); + &test ("ecx",1<<26); + &jz (&label("loop_x86")); + + &mov ("edx",&DWP(4,"edx")); + + # load ctx->h[0-7] + &movq ($A,&QWP(0,"esi")); + &and ("ecx",1<<24); # XMM registers availability + &movq ("mm1",&QWP(8,"esi")); + &and ("edx",1<<9); # SSSE3 bit + &movq ($BxC,&QWP(16,"esi")); + &or ("ecx","edx"); + &movq ("mm3",&QWP(24,"esi")); + &movq ($E,&QWP(32,"esi")); + &movq ("mm5",&QWP(40,"esi")); + &movq ("mm6",&QWP(48,"esi")); + &movq ("mm7",&QWP(56,"esi")); + &cmp ("ecx",1<<24|1<<9); + &je (&label("SSSE3")); + &sub ("esp",8*10); + &jmp (&label("loop_sse2")); + +&set_label("loop_sse2",16); + #&movq ($Asse2,$A); + &movq ($Bsse2,"mm1"); + &movq ($Csse2,$BxC); + &movq ($Dsse2,"mm3"); + #&movq ($Esse2,$E); + &movq ($Fsse2,"mm5"); + &movq ($Gsse2,"mm6"); + &pxor ($BxC,"mm1"); # magic + &movq ($Hsse2,"mm7"); + &movq ("mm3",$A); # magic + + &mov ("eax",&DWP(0,"edi")); + &mov ("ebx",&DWP(4,"edi")); + &add ("edi",8); + &mov ("edx",15); # counter + &bswap ("eax"); + &bswap ("ebx"); + &jmp (&label("00_14_sse2")); + +&set_label("00_14_sse2",16); + &movd ("mm1","eax"); + &mov ("eax",&DWP(0,"edi")); + &movd ("mm7","ebx"); + &mov ("ebx",&DWP(4,"edi")); + &add ("edi",8); + &bswap ("eax"); + &bswap ("ebx"); + &punpckldq("mm7","mm1"); + + &BODY_00_15_sse2(); + + &dec ("edx"); + &jnz (&label("00_14_sse2")); + + &movd ("mm1","eax"); + &movd ("mm7","ebx"); + &punpckldq("mm7","mm1"); + + &BODY_00_15_sse2(1); + + &pxor ($A,$A); # A is in %mm3 + &mov ("edx",32); # counter + &jmp (&label("16_79_sse2")); + +&set_label("16_79_sse2",16); + for ($j=0;$j<2;$j++) { # 2x unroll + #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15 + &movq ("mm5",&QWP(8*(9+16-14),"esp")); + &movq ("mm1","mm7"); + &psrlq ("mm7",1); + &movq ("mm6","mm5"); + &psrlq ("mm5",6); + &psllq ("mm1",56); + &paddq ($A,"mm3"); # from BODY_00_15 + &movq ("mm3","mm7"); + &psrlq ("mm7",7-1); + &pxor ("mm3","mm1"); + &psllq ("mm1",63-56); + &pxor ("mm3","mm7"); + &psrlq ("mm7",8-7); + &pxor ("mm3","mm1"); + &movq ("mm1","mm5"); + &psrlq ("mm5",19-6); + &pxor ("mm7","mm3"); # sigma0 + + &psllq ("mm6",3); + &pxor ("mm1","mm5"); + &paddq ("mm7",&QWP(8*(9+16),"esp")); + &pxor ("mm1","mm6"); + &psrlq ("mm5",61-19); + &paddq ("mm7",&QWP(8*(9+16-9),"esp")); + &pxor ("mm1","mm5"); + &psllq ("mm6",45-3); + &movq ("mm5",$Fsse2); # load f + &pxor ("mm1","mm6"); # sigma1 + &movq ("mm6",$Gsse2); # load g + + &paddq ("mm7","mm1"); # X[i] + #&movq (&QWP(8*9,"esp"),"mm7"); # moved to BODY_00_15 + + &BODY_00_15_sse2(2); + } + &dec ("edx"); + &jnz (&label("16_79_sse2")); + + #&movq ($A,$Asse2); + &paddq ($A,"mm3"); # from BODY_00_15 + &movq ("mm1",$Bsse2); + #&movq ($BxC,$Csse2); + &movq ("mm3",$Dsse2); + #&movq ($E,$Esse2); + &movq ("mm5",$Fsse2); + &movq ("mm6",$Gsse2); + &movq ("mm7",$Hsse2); + + &pxor ($BxC,"mm1"); # de-magic + &paddq ($A,&QWP(0,"esi")); + &paddq ("mm1",&QWP(8,"esi")); + &paddq ($BxC,&QWP(16,"esi")); + &paddq ("mm3",&QWP(24,"esi")); + &paddq ($E,&QWP(32,"esi")); + &paddq ("mm5",&QWP(40,"esi")); + &paddq ("mm6",&QWP(48,"esi")); + &paddq ("mm7",&QWP(56,"esi")); + + &mov ("eax",8*80); + &movq 
(&QWP(0,"esi"),$A); + &movq (&QWP(8,"esi"),"mm1"); + &movq (&QWP(16,"esi"),$BxC); + &movq (&QWP(24,"esi"),"mm3"); + &movq (&QWP(32,"esi"),$E); + &movq (&QWP(40,"esi"),"mm5"); + &movq (&QWP(48,"esi"),"mm6"); + &movq (&QWP(56,"esi"),"mm7"); + + &lea ("esp",&DWP(0,"esp","eax")); # destroy frame + &sub ($K512,"eax"); # rewind K + + &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet? + &jb (&label("loop_sse2")); + + &mov ("esp",&DWP(8*10+12,"esp")); # restore sp + &emms (); +&function_end_A(); + +&set_label("SSSE3",32); +{ my ($cnt,$frame)=("ecx","edx"); + my @X=map("xmm$_",(0..7)); + my $j; + my $i=0; + + &lea ($frame,&DWP(-64,"esp")); + &sub ("esp",256); + + # fixed stack frame layout + # + # +0 A B C D E F G H # backing store + # +64 X[0]+K[i] .. X[15]+K[i] # XMM->MM xfer area + # +192 # XMM off-load ring buffer + # +256 # saved parameters + + &movdqa (@X[1],&QWP(80*8,$K512)); # byte swap mask + &movdqu (@X[0],&QWP(0,"edi")); + &pshufb (@X[0],@X[1]); + for ($j=0;$j<8;$j++) { + &movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load + &movdqa (@X[3],&QWP(16*($j%8),$K512)); + &movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask + &movdqu (@X[1],&QWP(16*($j+1),"edi")) if ($j<7); # next input + &movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0] + &paddq (@X[3],@X[0]); + &pshufb (@X[1],@X[2]) if ($j<7); + &movdqa (&QWP(16*($j%8)-128,$frame),@X[3]); # xfer X[i]+K[i] + + push(@X,shift(@X)); # rotate(@X) + } + #&jmp (&label("loop_ssse3")); + &nop (); + +&set_label("loop_ssse3",32); + &movdqa (@X[2],&QWP(16*(($j+1)%4),$frame)); # pre-restore @X[1] + &movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]); # off-load @X[3] + &lea ($K512,&DWP(16*8,$K512)); + + #&movq ($Asse2,$A); # off-load A-H + &movq ($Bsse2,"mm1"); + &mov ("ebx","edi"); + &movq ($Csse2,$BxC); + &lea ("edi",&DWP(128,"edi")); # advance input + &movq ($Dsse2,"mm3"); + &cmp ("edi","eax"); + #&movq ($Esse2,$E); + &movq ($Fsse2,"mm5"); + &cmovb ("ebx","edi"); + &movq ($Gsse2,"mm6"); + &mov ("ecx",4); # loop counter + &pxor ($BxC,"mm1"); # magic + &movq ($Hsse2,"mm7"); + &pxor ("mm3","mm3"); # magic + + &jmp (&label("00_47_ssse3")); + +sub BODY_00_15_ssse3 { # "phase-less" copy of BODY_00_15_sse2 + ( + '&movq ("mm1",$E)', # %mm1 is sliding right + '&movq ("mm7",&QWP(((-8*$i)%128)-128,$frame))',# X[i]+K[i] + '&pxor ("mm5","mm6")', # f^=g + '&psrlq ("mm1",14)', + '&movq (&QWP(8*($i+4)%64,"esp"),$E)', # modulo-scheduled save e + '&pand ("mm5",$E)', # f&=e + '&psllq ($E,23)', # $E is sliding left + '&paddq ($A,"mm3")', # [h+=Maj(a,b,c)] + '&movq ("mm3","mm1")', # %mm3 is T1 + '&psrlq("mm1",4)', + '&pxor ("mm5","mm6")', # Ch(e,f,g) + '&pxor ("mm3",$E)', + '&psllq($E,23)', + '&pxor ("mm3","mm1")', + '&movq (&QWP(8*$i%64,"esp"),$A)', # modulo-scheduled save a + '&paddq("mm7","mm5")', # X[i]+=Ch(e,f,g) + '&pxor ("mm3",$E)', + '&psrlq("mm1",23)', + '&paddq("mm7",&QWP(8*($i+7)%64,"esp"))', # X[i]+=h + '&pxor ("mm3","mm1")', + '&psllq($E,4)', + '&pxor ("mm3",$E)', # T1=Sigma1_512(e) + + '&movq ($E,&QWP(8*($i+3)%64,"esp"))', # e = load d, e in next round + '&paddq ("mm3","mm7")', # T1+=X[i] + '&movq ("mm5",$A)', # %mm5 is sliding right + '&psrlq("mm5",28)', + '&paddq ($E,"mm3")', # d += T1 + '&movq ("mm6",$A)', # %mm6 is sliding left + '&movq ("mm7","mm5")', + '&psllq("mm6",25)', + '&movq ("mm1",&QWP(8*($i+1)%64,"esp"))', # load b + '&psrlq("mm5",6)', + '&pxor ("mm7","mm6")', + '&psllq("mm6",5)', + '&pxor ("mm7","mm5")', + '&pxor ($A,"mm1")', # a^b, b^c in next round + '&psrlq("mm5",5)', + '&pxor ("mm7","mm6")', + '&pand 
($BxC,$A)', # (b^c)&(a^b) + '&psllq("mm6",6)', + '&pxor ("mm7","mm5")', + '&pxor ($BxC,"mm1")', # [h=]Maj(a,b,c) + '&pxor ("mm6","mm7")', # Sigma0_512(a) + '&movq ("mm5",&QWP(8*($i+5-1)%64,"esp"))', # pre-load f + '&paddq ($BxC,"mm6")', # h+=Sigma0(a) + '&movq ("mm6",&QWP(8*($i+6-1)%64,"esp"))', # pre-load g + + '($A,$BxC) = ($BxC,$A); $i--;' + ); +} + +&set_label("00_47_ssse3",32); + + for(;$j<16;$j++) { + my ($t0,$t2,$t1)=@X[2..4]; + my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3()); + + &movdqa ($t2,@X[5]); + &movdqa (@X[1],$t0); # restore @X[1] + &palignr ($t0,@X[0],8); # X[1..2] + &movdqa (&QWP(16*($j%4),$frame),@X[4]); # off-load @X[4] + &palignr ($t2,@X[4],8); # X[9..10] + + &movdqa ($t1,$t0); + &psrlq ($t0,7); + &paddq (@X[0],$t2); # X[0..1] += X[9..10] + &movdqa ($t2,$t1); + &psrlq ($t1,1); + &psllq ($t2,64-8); + &pxor ($t0,$t1); + &psrlq ($t1,8-1); + &pxor ($t0,$t2); + &psllq ($t2,8-1); + &pxor ($t0,$t1); + &movdqa ($t1,@X[7]); + &pxor ($t0,$t2); # sigma0(X[1..2]) + &movdqa ($t2,@X[7]); + &psrlq ($t1,6); + &paddq (@X[0],$t0); # X[0..1] += sigma0(X[1..2]) + + &movdqa ($t0,@X[7]); + &psrlq ($t2,19); + &psllq ($t0,64-61); + &pxor ($t1,$t2); + &psrlq ($t2,61-19); + &pxor ($t1,$t0); + &psllq ($t0,61-19); + &pxor ($t1,$t2); + &movdqa ($t2,&QWP(16*(($j+2)%4),$frame));# pre-restore @X[1] + &pxor ($t1,$t0); # sigma0(X[1..2]) + &movdqa ($t0,&QWP(16*($j%8),$K512)); + eval(shift(@insns)); + &paddq (@X[0],$t1); # X[0..1] += sigma0(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddq ($t0,@X[0]); + foreach(@insns) { eval; } + &movdqa (&QWP(16*($j%8)-128,$frame),$t0);# xfer X[i]+K[i] + + push(@X,shift(@X)); # rotate(@X) + } + &lea ($K512,&DWP(16*8,$K512)); + &dec ("ecx"); + &jnz (&label("00_47_ssse3")); + + &movdqa (@X[1],&QWP(0,$K512)); # byte swap mask + &lea ($K512,&DWP(-80*8,$K512)); # rewind + &movdqu (@X[0],&QWP(0,"ebx")); + &pshufb (@X[0],@X[1]); + + for ($j=0;$j<8;$j++) { # load next or same block + my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3()); + + &movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load + &movdqa (@X[3],&QWP(16*($j%8),$K512)); + &movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask + &movdqu (@X[1],&QWP(16*($j+1),"ebx")) if ($j<7); # next input + &movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0] + &paddq (@X[3],@X[0]); + &pshufb (@X[1],@X[2]) if ($j<7); + foreach(@insns) { eval; } + &movdqa (&QWP(16*($j%8)-128,$frame),@X[3]);# xfer X[i]+K[i] + + push(@X,shift(@X)); # rotate(@X) + } + + #&movq ($A,$Asse2); # load A-H + &movq ("mm1",$Bsse2); + &paddq ($A,"mm3"); # from BODY_00_15 + #&movq ($BxC,$Csse2); + &movq ("mm3",$Dsse2); + #&movq ($E,$Esse2); + #&movq ("mm5",$Fsse2); + #&movq ("mm6",$Gsse2); + &movq ("mm7",$Hsse2); + + &pxor ($BxC,"mm1"); # de-magic + &paddq ($A,&QWP(0,"esi")); + &paddq ("mm1",&QWP(8,"esi")); + &paddq ($BxC,&QWP(16,"esi")); + &paddq ("mm3",&QWP(24,"esi")); + &paddq ($E,&QWP(32,"esi")); + &paddq ("mm5",&QWP(40,"esi")); + &paddq ("mm6",&QWP(48,"esi")); + &paddq ("mm7",&QWP(56,"esi")); + + &movq (&QWP(0,"esi"),$A); + &movq (&QWP(8,"esi"),"mm1"); + &movq (&QWP(16,"esi"),$BxC); + &movq (&QWP(24,"esi"),"mm3"); + &movq (&QWP(32,"esi"),$E); + &movq (&QWP(40,"esi"),"mm5"); + &movq (&QWP(48,"esi"),"mm6"); + &movq (&QWP(56,"esi"),"mm7"); + + &cmp ("edi","eax") # are we done yet? 
+ &jb (&label("loop_ssse3")); + + &mov ("esp",&DWP(64+12,$frame)); # restore sp + &emms (); +} +&function_end_A(); +} +&set_label("loop_x86",16); + # copy input block to stack reversing byte and qword order + for ($i=0;$i<8;$i++) { + &mov ("eax",&DWP($i*16+0,"edi")); + &mov ("ebx",&DWP($i*16+4,"edi")); + &mov ("ecx",&DWP($i*16+8,"edi")); + &mov ("edx",&DWP($i*16+12,"edi")); + &bswap ("eax"); + &bswap ("ebx"); + &bswap ("ecx"); + &bswap ("edx"); + &push ("eax"); + &push ("ebx"); + &push ("ecx"); + &push ("edx"); + } + &add ("edi",128); + &sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H + &mov (&DWP(8*(9+16)+4,"esp"),"edi"); + + # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack + &lea ("edi",&DWP(8,"esp")); + &mov ("ecx",16); + &data_word(0xA5F3F689); # rep movsd + +&set_label("00_15_x86",16); + &BODY_00_15_x86(); + + &cmp (&LB("edx"),0x94); + &jne (&label("00_15_x86")); + +&set_label("16_79_x86",16); + #define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + # LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + # HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + &mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp")); + &mov ("edx",&DWP(8*(9+15+16-1)+4,"esp")); + &mov ("esi","ecx"); + + &shr ("ecx",1); # lo>>1 + &mov ("edi","edx"); + &shr ("edx",1); # hi>>1 + &mov ("eax","ecx"); + &shl ("esi",24); # lo<<24 + &mov ("ebx","edx"); + &shl ("edi",24); # hi<<24 + &xor ("ebx","esi"); + + &shr ("ecx",7-1); # lo>>7 + &xor ("eax","edi"); + &shr ("edx",7-1); # hi>>7 + &xor ("eax","ecx"); + &shl ("esi",31-24); # lo<<31 + &xor ("ebx","edx"); + &shl ("edi",25-24); # hi<<25 + &xor ("ebx","esi"); + + &shr ("ecx",8-7); # lo>>8 + &xor ("eax","edi"); + &shr ("edx",8-7); # hi>>8 + &xor ("eax","ecx"); + &shl ("edi",31-25); # hi<<31 + &xor ("ebx","edx"); + &xor ("eax","edi"); # T1 = sigma0(X[-15]) + + &mov (&DWP(0,"esp"),"eax"); + &mov (&DWP(4,"esp"),"ebx"); # put T1 away + + #define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + # LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + # HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + &mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp")); + &mov ("edx",&DWP(8*(9+15+16-14)+4,"esp")); + &mov ("esi","ecx"); + + &shr ("ecx",6); # lo>>6 + &mov ("edi","edx"); + &shr ("edx",6); # hi>>6 + &mov ("eax","ecx"); + &shl ("esi",3); # lo<<3 + &mov ("ebx","edx"); + &shl ("edi",3); # hi<<3 + &xor ("eax","esi"); + + &shr ("ecx",19-6); # lo>>19 + &xor ("ebx","edi"); + &shr ("edx",19-6); # hi>>19 + &xor ("eax","ecx"); + &shl ("esi",13-3); # lo<<13 + &xor ("ebx","edx"); + &shl ("edi",13-3); # hi<<13 + &xor ("ebx","esi"); + + &shr ("ecx",29-19); # lo>>29 + &xor ("eax","edi"); + &shr ("edx",29-19); # hi>>29 + &xor ("ebx","ecx"); + &shl ("edi",26-13); # hi<<26 + &xor ("eax","edx"); + &xor ("eax","edi"); # sigma1(X[-2]) + + &mov ("ecx",&DWP(8*(9+15+16)+0,"esp")); + &mov ("edx",&DWP(8*(9+15+16)+4,"esp")); + &add ("eax",&DWP(0,"esp")); + &adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1 + &mov ("esi",&DWP(8*(9+15+16-9)+0,"esp")); + &mov ("edi",&DWP(8*(9+15+16-9)+4,"esp")); + &add ("eax","ecx"); + &adc ("ebx","edx"); # T1 += X[-16] + &add ("eax","esi"); + &adc ("ebx","edi"); # T1 += X[-7] + &mov (&DWP(8*(9+15)+0,"esp"),"eax"); + &mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0] + + &BODY_00_15_x86(); + + &cmp (&LB("edx"),0x17); + &jne (&label("16_79_x86")); + + &mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx + &mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp + for($i=0;$i<4;$i++) { + &mov ("eax",&DWP($i*16+0,"esi")); + &mov ("ebx",&DWP($i*16+4,"esi")); + &mov ("ecx",&DWP($i*16+8,"esi")); + &mov ("edx",&DWP($i*16+12,"esi")); + &add 
("eax",&DWP(8+($i*16)+0,"esp")); + &adc ("ebx",&DWP(8+($i*16)+4,"esp")); + &mov (&DWP($i*16+0,"esi"),"eax"); + &mov (&DWP($i*16+4,"esi"),"ebx"); + &add ("ecx",&DWP(8+($i*16)+8,"esp")); + &adc ("edx",&DWP(8+($i*16)+12,"esp")); + &mov (&DWP($i*16+8,"esi"),"ecx"); + &mov (&DWP($i*16+12,"esi"),"edx"); + } + &add ("esp",8*(9+16+80)); # destroy frame + &sub ($K512,8*80); # rewind K + + &cmp ("edi",&DWP(8,"esp")); # are we done yet? + &jb (&label("loop_x86")); + + &mov ("esp",&DWP(12,"esp")); # restore sp +&function_end_A(); + +&set_label("K512",64); # Yes! I keep it in the code segment! + &data_word(0xd728ae22,0x428a2f98); # u64 + &data_word(0x23ef65cd,0x71374491); # u64 + &data_word(0xec4d3b2f,0xb5c0fbcf); # u64 + &data_word(0x8189dbbc,0xe9b5dba5); # u64 + &data_word(0xf348b538,0x3956c25b); # u64 + &data_word(0xb605d019,0x59f111f1); # u64 + &data_word(0xaf194f9b,0x923f82a4); # u64 + &data_word(0xda6d8118,0xab1c5ed5); # u64 + &data_word(0xa3030242,0xd807aa98); # u64 + &data_word(0x45706fbe,0x12835b01); # u64 + &data_word(0x4ee4b28c,0x243185be); # u64 + &data_word(0xd5ffb4e2,0x550c7dc3); # u64 + &data_word(0xf27b896f,0x72be5d74); # u64 + &data_word(0x3b1696b1,0x80deb1fe); # u64 + &data_word(0x25c71235,0x9bdc06a7); # u64 + &data_word(0xcf692694,0xc19bf174); # u64 + &data_word(0x9ef14ad2,0xe49b69c1); # u64 + &data_word(0x384f25e3,0xefbe4786); # u64 + &data_word(0x8b8cd5b5,0x0fc19dc6); # u64 + &data_word(0x77ac9c65,0x240ca1cc); # u64 + &data_word(0x592b0275,0x2de92c6f); # u64 + &data_word(0x6ea6e483,0x4a7484aa); # u64 + &data_word(0xbd41fbd4,0x5cb0a9dc); # u64 + &data_word(0x831153b5,0x76f988da); # u64 + &data_word(0xee66dfab,0x983e5152); # u64 + &data_word(0x2db43210,0xa831c66d); # u64 + &data_word(0x98fb213f,0xb00327c8); # u64 + &data_word(0xbeef0ee4,0xbf597fc7); # u64 + &data_word(0x3da88fc2,0xc6e00bf3); # u64 + &data_word(0x930aa725,0xd5a79147); # u64 + &data_word(0xe003826f,0x06ca6351); # u64 + &data_word(0x0a0e6e70,0x14292967); # u64 + &data_word(0x46d22ffc,0x27b70a85); # u64 + &data_word(0x5c26c926,0x2e1b2138); # u64 + &data_word(0x5ac42aed,0x4d2c6dfc); # u64 + &data_word(0x9d95b3df,0x53380d13); # u64 + &data_word(0x8baf63de,0x650a7354); # u64 + &data_word(0x3c77b2a8,0x766a0abb); # u64 + &data_word(0x47edaee6,0x81c2c92e); # u64 + &data_word(0x1482353b,0x92722c85); # u64 + &data_word(0x4cf10364,0xa2bfe8a1); # u64 + &data_word(0xbc423001,0xa81a664b); # u64 + &data_word(0xd0f89791,0xc24b8b70); # u64 + &data_word(0x0654be30,0xc76c51a3); # u64 + &data_word(0xd6ef5218,0xd192e819); # u64 + &data_word(0x5565a910,0xd6990624); # u64 + &data_word(0x5771202a,0xf40e3585); # u64 + &data_word(0x32bbd1b8,0x106aa070); # u64 + &data_word(0xb8d2d0c8,0x19a4c116); # u64 + &data_word(0x5141ab53,0x1e376c08); # u64 + &data_word(0xdf8eeb99,0x2748774c); # u64 + &data_word(0xe19b48a8,0x34b0bcb5); # u64 + &data_word(0xc5c95a63,0x391c0cb3); # u64 + &data_word(0xe3418acb,0x4ed8aa4a); # u64 + &data_word(0x7763e373,0x5b9cca4f); # u64 + &data_word(0xd6b2b8a3,0x682e6ff3); # u64 + &data_word(0x5defb2fc,0x748f82ee); # u64 + &data_word(0x43172f60,0x78a5636f); # u64 + &data_word(0xa1f0ab72,0x84c87814); # u64 + &data_word(0x1a6439ec,0x8cc70208); # u64 + &data_word(0x23631e28,0x90befffa); # u64 + &data_word(0xde82bde9,0xa4506ceb); # u64 + &data_word(0xb2c67915,0xbef9a3f7); # u64 + &data_word(0xe372532b,0xc67178f2); # u64 + &data_word(0xea26619c,0xca273ece); # u64 + &data_word(0x21c0c207,0xd186b8c7); # u64 + &data_word(0xcde0eb1e,0xeada7dd6); # u64 + &data_word(0xee6ed178,0xf57d4f7f); # u64 + &data_word(0x72176fba,0x06f067aa); # u64 
+ &data_word(0xa2c898a6,0x0a637dc5); # u64 + &data_word(0xbef90dae,0x113f9804); # u64 + &data_word(0x131c471b,0x1b710b35); # u64 + &data_word(0x23047d84,0x28db77f5); # u64 + &data_word(0x40c72493,0x32caab7b); # u64 + &data_word(0x15c9bebc,0x3c9ebe0a); # u64 + &data_word(0x9c100d4c,0x431d67c4); # u64 + &data_word(0xcb3e42b6,0x4cc5d4be); # u64 + &data_word(0xfc657e2a,0x597f299c); # u64 + &data_word(0x3ad6faec,0x5fcb6fab); # u64 + &data_word(0x4a475817,0x6c44198c); # u64 + + &data_word(0x04050607,0x00010203); # byte swap + &data_word(0x0c0d0e0f,0x08090a0b); # mask +&function_end_B("sha512_block_data_order"); +&asciz("SHA512 block transform for x86, CRYPTOGAMS by "); + +&asm_finish(); + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512-armv4.pl b/openssl-1.1.0h/crypto/sha/asm/sha512-armv4.pl new file mode 100644 index 0000000..22b5a9d --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha512-armv4.pl @@ -0,0 +1,668 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# +# Permission to use under GPL terms is granted. +# ==================================================================== + +# SHA512 block procedure for ARMv4. September 2007. + +# This code is ~4.5 (four and a half) times faster than code generated +# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +# Xscale PXA250 core]. +# +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 6% improvement on +# Cortex A8 core and ~40 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Coxtex A8 core and ~38 cycles per byte. + +# March 2011. +# +# Add NEON implementation. On Cortex A8 it was measured to process +# one byte in 23.3 cycles or ~60% faster than integer-only code. + +# August 2012. +# +# Improve NEON performance by 12% on Snapdragon S4. In absolute +# terms it's 22.6 cycles per byte, which is disappointing result. +# Technical writers asserted that 3-way S4 pipeline can sustain +# multiple NEON instructions per cycle, but dual NEON issue could +# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +# for further details. On side note Cortex-A15 processes one byte in +# 16 cycles. + +# Byte order [in]dependence. ========================================= +# +# Originally caller was expected to maintain specific *dword* order in +# h[0-7], namely with most significant dword at *lower* address, which +# was reflected in below two parameters as 0 and 4. Now caller is +# expected to maintain native byte order for whole 64-bit values. 
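To make the HI/LO bookkeeping below concrete (an editorial sketch, not part of the patch), the few lines of Perl that follow show why keeping each h[i] in native 64-bit byte order puts its low 32-bit word at offset 0 on a little-endian build and at offset 4 on a big-endian one, which is the choice the __ARMEL__ conditional further down encodes. It assumes a perl with 64-bit integer and "Q" pack support; the variable names are illustrative.

use strict;
use warnings;

my $h      = 0x0123456789abcdef;            # one 64-bit state word h[i]
my $mem    = pack("Q", $h);                  # stored in native byte order, as the caller now maintains
my $little = pack("L", 1) eq pack("V", 1);   # true on a little-endian build

my ($LO, $HI) = $little ? (0, 4) : (4, 0);   # same offsets the LO/HI macros resolve to
my $lo = unpack("L", substr($mem, $LO, 4));  # low 32 bits of h[i]
my $hi = unpack("L", substr($mem, $HI, 4));  # high 32 bits of h[i]
printf "h = %08x:%08x (hi:lo), LO=%d, HI=%d\n", $hi, $lo, $LO, $HI;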
+$hi="HI"; +$lo="LO"; +# ==================================================================== + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$ctx="r0"; # parameter block +$inp="r1"; +$len="r2"; + +$Tlo="r3"; +$Thi="r4"; +$Alo="r5"; +$Ahi="r6"; +$Elo="r7"; +$Ehi="r8"; +$t0="r9"; +$t1="r10"; +$t2="r11"; +$t3="r12"; +############ r13 is stack pointer +$Ktbl="r14"; +############ r15 is program counter + +$Aoff=8*0; +$Boff=8*1; +$Coff=8*2; +$Doff=8*3; +$Eoff=8*4; +$Foff=8*5; +$Goff=8*6; +$Hoff=8*7; +$Xoff=8*8; + +sub BODY_00_15() { +my $magic = shift; +$code.=<<___; + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov $t0,$Elo,lsr#14 + str $Tlo,[sp,#$Xoff+0] + mov $t1,$Ehi,lsr#14 + str $Thi,[sp,#$Xoff+4] + eor $t0,$t0,$Ehi,lsl#18 + ldr $t2,[sp,#$Hoff+0] @ h.lo + eor $t1,$t1,$Elo,lsl#18 + ldr $t3,[sp,#$Hoff+4] @ h.hi + eor $t0,$t0,$Elo,lsr#18 + eor $t1,$t1,$Ehi,lsr#18 + eor $t0,$t0,$Ehi,lsl#14 + eor $t1,$t1,$Elo,lsl#14 + eor $t0,$t0,$Ehi,lsr#9 + eor $t1,$t1,$Elo,lsr#9 + eor $t0,$t0,$Elo,lsl#23 + eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#$Foff+0] @ f.lo + adc $Thi,$Thi,$t1 @ T += Sigma1(e) + ldr $t1,[sp,#$Foff+4] @ f.hi + adds $Tlo,$Tlo,$t2 + ldr $t2,[sp,#$Goff+0] @ g.lo + adc $Thi,$Thi,$t3 @ T += h + ldr $t3,[sp,#$Goff+4] @ g.hi + + eor $t0,$t0,$t2 + str $Elo,[sp,#$Eoff+0] + eor $t1,$t1,$t3 + str $Ehi,[sp,#$Eoff+4] + and $t0,$t0,$Elo + str $Alo,[sp,#$Aoff+0] + and $t1,$t1,$Ehi + str $Ahi,[sp,#$Aoff+4] + eor $t0,$t0,$t2 + ldr $t2,[$Ktbl,#$lo] @ K[i].lo + eor $t1,$t1,$t3 @ Ch(e,f,g) + ldr $t3,[$Ktbl,#$hi] @ K[i].hi + + adds $Tlo,$Tlo,$t0 + ldr $Elo,[sp,#$Doff+0] @ d.lo + adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) + ldr $Ehi,[sp,#$Doff+4] @ d.hi + adds $Tlo,$Tlo,$t2 + and $t0,$t2,#0xff + adc $Thi,$Thi,$t3 @ T += K[i] + adds $Elo,$Elo,$Tlo + ldr $t2,[sp,#$Boff+0] @ b.lo + adc $Ehi,$Ehi,$Thi @ d += T + teq $t0,#$magic + + ldr $t3,[sp,#$Coff+0] @ c.lo +#if __ARM_ARCH__>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq $Ktbl,$Ktbl,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov $t0,$Alo,lsr#28 + mov $t1,$Ahi,lsr#28 + eor $t0,$t0,$Ahi,lsl#4 + eor $t1,$t1,$Alo,lsl#4 + eor $t0,$t0,$Ahi,lsr#2 + eor $t1,$t1,$Alo,lsr#2 + eor $t0,$t0,$Alo,lsl#30 + eor $t1,$t1,$Ahi,lsl#30 + eor $t0,$t0,$Ahi,lsr#7 + eor $t1,$t1,$Alo,lsr#7 + eor $t0,$t0,$Alo,lsl#25 + eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) + adds $Tlo,$Tlo,$t0 + and $t0,$Alo,$t2 + adc $Thi,$Thi,$t1 @ T += Sigma0(a) + + ldr $t1,[sp,#$Boff+4] @ b.hi + orr $Alo,$Alo,$t2 + ldr $t2,[sp,#$Coff+4] @ c.hi + and $Alo,$Alo,$t3 + and $t3,$Ahi,$t1 + orr $Ahi,$Ahi,$t1 + orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo + and $Ahi,$Ahi,$t2 + adds $Alo,$Alo,$Tlo + orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc $Ahi,$Ahi,$Thi @ h += T + tst $Ktbl,#1 + add $Ktbl,$Ktbl,#8 +___ +} +$code=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define 
VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else +.code 32 +#endif + +.type K512,%object +.align 5 +K512: +WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) +WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) +WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) +WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) +WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) +WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) +WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) +WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) +WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) +WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) +WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) +WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) +WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) +WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) +WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) +WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) +WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) +WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) +WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) +WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) +WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) +WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) +WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) +WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) +WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) +WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) +WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) +WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) +WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) +WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) +WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) +WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) +WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) +WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) +WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) +WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) +WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) +WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) +WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) +WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lsha512_block_data_order +.skip 32-4 +#else +.skip 32 +#endif + +.global sha512_block_data_order +.type sha512_block_data_order,%function +sha512_block_data_order: +.Lsha512_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r3,pc,#8 @ sha512_block_data_order +#else + adr r3,.Lsha512_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV7_NEON + bne .LNEON +#endif + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4-r12,lr} + sub $Ktbl,r3,#672 @ K512 + sub sp,sp,#9*8 + + ldr $Elo,[$ctx,#$Eoff+$lo] + ldr $Ehi,[$ctx,#$Eoff+$hi] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, 
[$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] +.Loop: + str $t0, [sp,#$Goff+0] + str $t1, [sp,#$Goff+4] + str $t2, [sp,#$Hoff+0] + str $t3, [sp,#$Hoff+4] + ldr $Alo,[$ctx,#$Aoff+$lo] + ldr $Ahi,[$ctx,#$Aoff+$hi] + ldr $Tlo,[$ctx,#$Boff+$lo] + ldr $Thi,[$ctx,#$Boff+$hi] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + str $Tlo,[sp,#$Boff+0] + str $Thi,[sp,#$Boff+4] + str $t0, [sp,#$Coff+0] + str $t1, [sp,#$Coff+4] + str $t2, [sp,#$Doff+0] + str $t3, [sp,#$Doff+4] + ldr $Tlo,[$ctx,#$Foff+$lo] + ldr $Thi,[$ctx,#$Foff+$hi] + str $Tlo,[sp,#$Foff+0] + str $Thi,[sp,#$Foff+4] + +.L00_15: +#if __ARM_ARCH__<7 + ldrb $Tlo,[$inp,#7] + ldrb $t0, [$inp,#6] + ldrb $t1, [$inp,#5] + ldrb $t2, [$inp,#4] + ldrb $Thi,[$inp,#3] + ldrb $t3, [$inp,#2] + orr $Tlo,$Tlo,$t0,lsl#8 + ldrb $t0, [$inp,#1] + orr $Tlo,$Tlo,$t1,lsl#16 + ldrb $t1, [$inp],#8 + orr $Tlo,$Tlo,$t2,lsl#24 + orr $Thi,$Thi,$t3,lsl#8 + orr $Thi,$Thi,$t0,lsl#16 + orr $Thi,$Thi,$t1,lsl#24 +#else + ldr $Tlo,[$inp,#4] + ldr $Thi,[$inp],#8 +#ifdef __ARMEL__ + rev $Tlo,$Tlo + rev $Thi,$Thi +#endif +#endif +___ + &BODY_00_15(0x94); +$code.=<<___; + tst $Ktbl,#1 + beq .L00_15 + ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] + bic $Ktbl,$Ktbl,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov $Tlo,$t0,lsr#1 + ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] + mov $Thi,$t1,lsr#1 + ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] + eor $Tlo,$Tlo,$t1,lsl#31 + eor $Thi,$Thi,$t0,lsl#31 + eor $Tlo,$Tlo,$t0,lsr#8 + eor $Thi,$Thi,$t1,lsr#8 + eor $Tlo,$Tlo,$t1,lsl#24 + eor $Thi,$Thi,$t0,lsl#24 + eor $Tlo,$Tlo,$t0,lsr#7 + eor $Thi,$Thi,$t1,lsr#7 + eor $Tlo,$Tlo,$t1,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov $t0,$t2,lsr#19 + mov $t1,$t3,lsr#19 + eor $t0,$t0,$t3,lsl#13 + eor $t1,$t1,$t2,lsl#13 + eor $t0,$t0,$t3,lsr#29 + eor $t1,$t1,$t2,lsr#29 + eor $t0,$t0,$t2,lsl#3 + eor $t1,$t1,$t3,lsl#3 + eor $t0,$t0,$t2,lsr#6 + eor $t1,$t1,$t3,lsr#6 + ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] + eor $t0,$t0,$t3,lsl#26 + + ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#`$Xoff+8*16`+0] + adc $Thi,$Thi,$t1 + + ldr $t1,[sp,#`$Xoff+8*16`+4] + adds $Tlo,$Tlo,$t2 + adc $Thi,$Thi,$t3 + adds $Tlo,$Tlo,$t0 + adc $Thi,$Thi,$t1 +___ + &BODY_00_15(0x17); +$code.=<<___; +#if __ARM_ARCH__>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] + beq .L16_79 + bic $Ktbl,$Ktbl,#1 + + ldr $Tlo,[sp,#$Boff+0] + ldr $Thi,[sp,#$Boff+4] + ldr $t0, [$ctx,#$Aoff+$lo] + ldr $t1, [$ctx,#$Aoff+$hi] + ldr $t2, [$ctx,#$Boff+$lo] + ldr $t3, [$ctx,#$Boff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Aoff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Aoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Boff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Boff+$hi] + + ldr $Alo,[sp,#$Coff+0] + ldr $Ahi,[sp,#$Coff+4] + ldr $Tlo,[sp,#$Doff+0] + ldr $Thi,[sp,#$Doff+4] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Coff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Coff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Doff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Doff+$hi] + + ldr $Tlo,[sp,#$Foff+0] + ldr $Thi,[sp,#$Foff+4] + 
ldr $t0, [$ctx,#$Eoff+$lo] + ldr $t1, [$ctx,#$Eoff+$hi] + ldr $t2, [$ctx,#$Foff+$lo] + ldr $t3, [$ctx,#$Foff+$hi] + adds $Elo,$Elo,$t0 + str $Elo,[$ctx,#$Eoff+$lo] + adc $Ehi,$Ehi,$t1 + str $Ehi,[$ctx,#$Eoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Foff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Foff+$hi] + + ldr $Alo,[sp,#$Goff+0] + ldr $Ahi,[sp,#$Goff+4] + ldr $Tlo,[sp,#$Hoff+0] + ldr $Thi,[sp,#$Hoff+4] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, [$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Goff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Goff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Hoff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Hoff+$hi] + + add sp,sp,#640 + sub $Ktbl,$Ktbl,#640 + + teq $inp,$len + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha512_block_data_order,.-sha512_block_data_order +___ + +{ +my @Sigma0=(28,34,39); +my @Sigma1=(14,18,41); +my @sigma0=(1, 8, 7); +my @sigma1=(19,61,6); + +my $Ktbl="r3"; +my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch + +my @X=map("d$_",(0..15)); +my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); + +sub NEON_00_15() { +my $i=shift; +my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps + +$code.=<<___ if ($i<16 || $i&1); + vshr.u64 $t0,$e,#@Sigma1[0] @ $i +#if $i<16 + vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned +#endif + vshr.u64 $t1,$e,#@Sigma1[1] +#if $i>0 + vadd.i64 $a,$Maj @ h+=Maj from the past +#endif + vshr.u64 $t2,$e,#@Sigma1[2] +___ +$code.=<<___; + vld1.64 {$K},[$Ktbl,:64]! 
@ K[i++] + vsli.64 $t0,$e,#`64-@Sigma1[0]` + vsli.64 $t1,$e,#`64-@Sigma1[1]` + vmov $Ch,$e + vsli.64 $t2,$e,#`64-@Sigma1[2]` +#if $i<16 && defined(__ARMEL__) + vrev64.8 @X[$i],@X[$i] +#endif + veor $t1,$t0 + vbsl $Ch,$f,$g @ Ch(e,f,g) + vshr.u64 $t0,$a,#@Sigma0[0] + veor $t2,$t1 @ Sigma1(e) + vadd.i64 $T1,$Ch,$h + vshr.u64 $t1,$a,#@Sigma0[1] + vsli.64 $t0,$a,#`64-@Sigma0[0]` + vadd.i64 $T1,$t2 + vshr.u64 $t2,$a,#@Sigma0[2] + vadd.i64 $K,@X[$i%16] + vsli.64 $t1,$a,#`64-@Sigma0[1]` + veor $Maj,$a,$b + vsli.64 $t2,$a,#`64-@Sigma0[2]` + veor $h,$t0,$t1 + vadd.i64 $T1,$K + vbsl $Maj,$c,$b @ Maj(a,b,c) + veor $h,$t2 @ Sigma0(a) + vadd.i64 $d,$T1 + vadd.i64 $Maj,$T1 + @ vadd.i64 $h,$Maj +___ +} + +sub NEON_16_79() { +my $i=shift; + +if ($i&1) { &NEON_00_15($i,@_); return; } + +# 2x-vectorized, therefore runs every 2nd round +my @X=map("q$_",(0..7)); # view @X as 128-bit vector +my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps +my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 +my $e=@_[4]; # $e from NEON_00_15 +$i /= 2; +$code.=<<___; + vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] + vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] + vadd.i64 @_[0],d30 @ h+=Maj from the past + vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] + vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` + vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] + vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` + veor $s1,$t0 + vshr.u64 $t0,$s0,#@sigma0[0] + veor $s1,$t1 @ sigma1(X[i+14]) + vshr.u64 $t1,$s0,#@sigma0[1] + vadd.i64 @X[$i%8],$s1 + vshr.u64 $s1,$s0,#@sigma0[2] + vsli.64 $t0,$s0,#`64-@sigma0[0]` + vsli.64 $t1,$s0,#`64-@sigma0[1]` + vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] + veor $s1,$t0 + vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s0 + vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 + veor $s1,$t1 @ sigma0(X[i+1]) + vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s1 +___ + &NEON_00_15(2*$i,@_); +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function +.align 4 +sha512_block_data_order_neon: +.LNEON: + dmb @ errata #451034 on early Cortex A8 + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + adr $Ktbl,K512 + VFP_ABI_PUSH + vldmia $ctx,{$A-$H} @ load context +.Loop_neon: +___ +for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov $cnt,#4 +.L16_79_neon: + subs $cnt,#1 +___ +for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bne .L16_79_neon + + vadd.i64 $A,d30 @ h+=Maj from the past + vldmia $ctx,{d24-d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia $ctx,{$A-$H} @ save context + teq $inp,$len + sub $Ktbl,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + ret @ bx lr +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon +#endif +___ +} +$code.=<<___; +.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx lr/gm; + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +print $code; +close STDOUT; # enforce flush diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512-armv8.pl b/openssl-1.1.0h/crypto/sha/asm/sha512-armv8.pl new file mode
100644 index 0000000..c1aaf77 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha512-armv8.pl @@ -0,0 +1,446 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256/512 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# SHA256-hw SHA256(*) SHA512 +# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +# Denver 2.01 10.5 (+26%) 6.70 (+8%) +# X-Gene 20.0 (+100%) 12.8 (+300%(***)) +# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +# +# (*) Software SHA256 results are of lesser relevance, presented +# mostly for informational purposes. +# (**) The result is a trade-off: it's possible to improve it by +# 10% (or by 1 cycle per round), but at the cost of 20% loss +# on Cortex-A53 (or by 4 cycles per round). +# (***) Super-impressive coefficients over gcc-generated code are +# indication of some compiler "pathology", most notably code +# generated with -mgeneral-regs-only is significanty faster +# and the gap is only 40-90%. + +$flavour=shift; +$output=shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if ($output =~ /512/) { + $BITS=512; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $reg_t="x"; +} else { + $BITS=256; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $reg_t="w"; +} + +$func="sha${BITS}_block_data_order"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); + $T0=@X[$i+3] if ($i<11); + +$code.=<<___ if ($i<16); +#ifndef __ARMEB__ + rev @X[$i],@X[$i] // $i +#endif +___ +$code.=<<___ if ($i<13 && ($i&1)); + ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___ if ($i==13); + ldp @X[14],@X[15],[$inp] +___ +$code.=<<___ if ($i>=14); + ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___ if ($i>0 && $i<16); + add $a,$a,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=11); + str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. 
This means that such merged instruction is +# not necessarily best choice on critical path... On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. +$code.=<<___ if ($i<15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` + and $t1,$f,$e + bic $t2,$g,$e + add $h,$h,@X[$i&15] // h+=X[i] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) + ror $T0,$a,#$Sigma0[0] + add $h,$h,$t1 // h+=Ch(e,f,g) + eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` + add $h,$h,$t0 // h+=Sigma1(e) + and $t3,$t3,$t2 // (b^c)&=(a^b) + add $d,$d,$h // d+=h + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + //add $h,$h,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + ror $T1,@X[($j+1)&15],#$sigma0[0] + and $t1,$f,$e + ror $T2,@X[($j+14)&15],#$sigma1[0] + bic $t2,$g,$e + ror $T0,$a,#$Sigma0[0] + add $h,$h,@X[$i&15] // h+=X[i] + eor $t0,$t0,$e,ror#$Sigma1[1] + eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) + eor $T0,$T0,$a,ror#$Sigma0[1] + add $h,$h,$t1 // h+=Ch(e,f,g) + and $t3,$t3,$t2 // (b^c)&=(a^b) + eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] + eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) + add $h,$h,$t0 // h+=Sigma1(e) + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) + eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) + add @X[$j],@X[$j],@X[($j+9)&15] + add $d,$d,$h // d+=h + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + add @X[$j],@X[$j],$T1 + add $h,$h,$t1 // h+=Sigma0(a) + add @X[$j],@X[$j],$T2 +___ + ($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#include "arm_arch.h" + +.text + +.extern OPENSSL_armcap_P +.globl $func +.type $func,%function +.align 6 +$func: +___ +$code.=<<___ if ($SZ==4); +#ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +#else + ldr x16,.LOPENSSL_armcap_P +#endif + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA256 + b.ne .Lv8_entry +___ +$code.=<<___; + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*$SZ + + ldp $A,$B,[$ctx] // load context + ldp $C,$D,[$ctx,#2*$SZ] + ldp $E,$F,[$ctx,#4*$SZ] + add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input + ldp $G,$H,[$ctx,#6*$SZ] + adr $Ktbl,.LK$BITS + stp $ctx,$num,[x29,#96] + +.Loop: + ldp @X[0],@X[1],[$inp],#2*$SZ + ldr $t2,[$Ktbl],#$SZ // *K++ + eor $t3,$B,$C // magic seed + str $inp,[x29,#112] +___ +for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + cbnz $t2,.Loop_16_xx + + ldp $ctx,$num,[x29,#96] + ldr $inp,[x29,#112] + sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind + + ldp @X[0],@X[1],[$ctx] + ldp @X[2],@X[3],[$ctx,#2*$SZ] + add $inp,$inp,#14*$SZ // advance input pointer + ldp @X[4],@X[5],[$ctx,#4*$SZ] + add $A,$A,@X[0] + ldp @X[6],@X[7],[$ctx,#6*$SZ] + add $B,$B,@X[1] + add $C,$C,@X[2] + add $D,$D,@X[3] + stp $A,$B,[$ctx] + add $E,$E,@X[4] + add $F,$F,@X[5] + stp $C,$D,[$ctx,#2*$SZ] + add $G,$G,@X[6] + add $H,$H,@X[7] + cmp $inp,$num + stp $E,$F,[$ctx,#4*$SZ] + stp $G,$H,[$ctx,#6*$SZ] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*$SZ + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size $func,.-$func + +.align 6 +.type .LK$BITS,%object +.LK$BITS: +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +___ +$code.=<<___; +.size .LK$BITS,.-.LK$BITS +.align 3 +.LOPENSSL_armcap_P: +#ifdef __ILP32__ + .long OPENSSL_armcap_P-. +#else + .quad OPENSSL_armcap_P-. +#endif +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +.type sha256_block_armv8,%function +.align 6 +sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size sha256_block_armv8,.-sha256_block_armv8 +___ +} + +$code.=<<___; +.comm OPENSSL_armcap_P,4,4 +___ + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/geo; + + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo; + + s/\.\w?32\b//o and s/\.16b/\.4s/go; + m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; + + print $_,"\n"; +} + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512-c64xplus.pl b/openssl-1.1.0h/crypto/sha/asm/sha512-c64xplus.pl new file mode 100644 index 0000000..9ebfc92 --- /dev/null +++ 
b/openssl-1.1.0h/crypto/sha/asm/sha512-c64xplus.pl @@ -0,0 +1,438 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA512 for C64x+. +# +# January 2012 +# +# Performance is 19 cycles per processed byte. Compared to block +# transform function from sha512.c compiled with cl6x with -mv6400+ +# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller. +# Loop unroll won't make it, this implementation, any faster, because +# it's effectively dominated by SHRU||SHL pairs and you can't schedule +# more of them. +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K512="A3"; + +($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi, + $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31)); +($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo, + $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31)); + +($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13)); +($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13)); +($T1hi, $T2hi)= ("A6","A7"); +($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9"); +($Khi,$Klo)=("A9","A8"); +($MAJhi,$MAJlo)=($T2hi,$T2lo); +($t1hi,$t1lo)=($Khi,"B2"); + $CTXB=$t1lo; + +($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .nocmp + .asg sha512_block_data_order,_sha512_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg $Khi,KHI + .asg $Klo,KLO + .else + .asg $Khi,KLO + .asg $Klo,KHI + .endif + + .global _sha512_block_data_order +_sha512_block_data_order: +__sha512_block: + .asmfunc stack_usage(40+128) + MV $NUM,A0 ; reassign $NUM +|| MVK -128,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--(40) ; save frame pointer +|| [A0] MV SP,FP + [A0] STDW B13:B12,*SP[4] +|| [A0] MVK 0x00404,B1 + [A0] STDW B11:B10,*SP[3] +|| [A0] STDW A13:A12,*FP[-3] +|| [A0] MVKH 0x60000,B1 + [A0] STDW A11:A10,*SP[1] +|| [A0] MVC B1,AMR ; setup circular addressing +|| [A0] ADD B0,SP,SP ; alloca(128) + .if __TI_EABI__ + [A0] AND B0,SP,SP ; align stack at 128 bytes +|| [A0] ADDKPC __sha512_block,B1 +|| [A0] MVKL \$PCR_OFFSET(K512,__sha512_block),$K512 + [A0] MVKH \$PCR_OFFSET(K512,__sha512_block),$K512 +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + .else + [A0] AND B0,SP,SP ; align stack at 128 bytes +|| [A0] ADDKPC __sha512_block,B1 +|| [A0] MVKL (K512-__sha512_block),$K512 + [A0] MVKH (K512-__sha512_block),$K512 +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + .endif + ADDAW SP,3,$Xilo + ADDAW SP,2,$Xihi + +|| MV $CTXA,$CTXB + LDW 
*${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx +|| LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo +|| ADD B1,$K512,$K512 + LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi +|| LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo + LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi +|| LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo + LDW *${CTXA}[6^.LITTLE_ENDIAN],$Dhi +|| LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo + LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi +|| LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo + LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi +|| LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo + LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi +|| LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo + LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi +|| LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo + + LDNDW *$INP++,B11:B10 ; pre-fetch input + LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0] +outerloop?: + MVK 15,B0 ; loop counters +|| MVK 64,B1 +|| SUB A0,1,A0 + MV $Ahi,$Actxhi +|| MV $Alo,$Actxlo +|| MV $Bhi,$Bctxhi +|| MV $Blo,$Bctxlo +|| MV $Chi,$Cctxhi +|| MV $Clo,$Cctxlo +|| MVD $Dhi,$Dctxhi +|| MVD $Dlo,$Dctxlo + MV $Ehi,$Ectxhi +|| MV $Elo,$Ectxlo +|| MV $Fhi,$Fctxhi +|| MV $Flo,$Fctxlo +|| MV $Ghi,$Gctxhi +|| MV $Glo,$Gctxlo +|| MVD $Hhi,$Hctxhi +|| MVD $Hlo,$Hctxlo +loop0_15?: + .if .BIG_ENDIAN + MV B11,$T1hi +|| MV B10,$T1lo + .else + SWAP4 B10,$T1hi +|| SWAP4 B11,$T1lo + SWAP2 $T1hi,$T1hi +|| SWAP2 $T1lo,$T1lo + .endif +loop16_79?: + STW $T1hi,*$Xihi++[2] +|| STW $T1lo,*$Xilo++[2] ; X[i] = T1 +|| ADD $Hhi,$T1hi,$T1hi +|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h +|| SHRU $Ehi,14,$S1hi +|| SHL $Ehi,32-14,$S1lo + XOR $Fhi,$Ghi,$CHhi +|| XOR $Flo,$Glo,$CHlo +|| ADD KHI,$T1hi,$T1hi +|| ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i] +|| SHRU $Elo,14,$t0lo +|| SHL $Elo,32-14,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Ehi,$CHhi,$CHhi +|| AND $Elo,$CHlo,$CHlo +|| ROTL $Ghi,0,$Hhi +|| ROTL $Glo,0,$Hlo ; h = g +|| SHRU $Ehi,18,$t0hi +|| SHL $Ehi,32-18,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| XOR $Ghi,$CHhi,$CHhi +|| XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g +|| ROTL $Fhi,0,$Ghi +|| ROTL $Flo,0,$Glo ; g = f +|| SHRU $Elo,18,$t0lo +|| SHL $Elo,32-18,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| OR $Ahi,$Bhi,$MAJhi +|| OR $Alo,$Blo,$MAJlo +|| ROTL $Ehi,0,$Fhi +|| ROTL $Elo,0,$Flo ; f = e +|| SHRU $Ehi,41-32,$t0lo +|| SHL $Ehi,64-41,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Chi,$MAJhi,$MAJhi +|| AND $Clo,$MAJlo,$MAJlo +|| ROTL $Dhi,0,$Ehi +|| ROTL $Dlo,0,$Elo ; e = d +|| SHRU $Elo,41-32,$t0hi +|| SHL $Elo,64-41,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo ; Sigma1(e) +|| AND $Ahi,$Bhi,$t1hi +|| AND $Alo,$Blo,$t1lo +|| ROTL $Chi,0,$Dhi +|| ROTL $Clo,0,$Dlo ; d = c +|| SHRU $Ahi,28,$S0hi +|| SHL $Ahi,32-28,$S0lo + OR $t1hi,$MAJhi,$MAJhi +|| OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g) +|| ROTL $Bhi,0,$Chi +|| ROTL $Blo,0,$Clo ; c = b +|| SHRU $Alo,28,$t0lo +|| SHL $Alo,32-28,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e) +|| ROTL $Ahi,0,$Bhi +|| ROTL $Alo,0,$Blo ; b = a +|| SHRU $Ahi,34-32,$t0lo +|| SHL $Ahi,64-34,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $MAJhi,$T1hi,$T2hi +|| ADDU $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c) +|| SHRU $Alo,34-32,$t0hi +|| SHL $Alo,64-34,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $Ehi,$T1hi,$T1hi +|| ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e +|| [B0] BNOP 
loop0_15? +|| SHRU $Ahi,39-32,$t0lo +|| SHL $Ahi,64-39,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input +||[!B1] BNOP break? +|| SHRU $Alo,39-32,$t0hi +|| SHL $Alo,64-39,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo ; Sigma0(a) +|| ADD $T1carry,$T1hi,$Ehi +|| MV $T1lo,$Elo ; e = T1 +||[!B0] LDW *${Xihi}[28],$T1hi +||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14] + ADD $S0hi,$T2hi,$T2hi +|| ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a) +|| [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i] + NOP ; avoid cross-path stall + ADD $T2carry,$T2hi,$Ahi +|| MV $T2lo,$Alo ; a = T2 +|| [B0] SUB B0,1,B0 +;;===== branch to loop00_15? is taken here + NOP +;;===== branch to break? is taken here + LDW *${Xihi}[2],$T2hi +|| LDW *${Xilo}[2],$T2lo ; X[i+1] +|| SHRU $T1hi,19,$S1hi +|| SHL $T1hi,32-19,$S1lo + SHRU $T1lo,19,$t0lo +|| SHL $T1lo,32-19,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,61-32,$t0lo +|| SHL $T1hi,64-61,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,61-32,$t0hi +|| SHL $T1lo,64-61,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,6,$t0hi +|| SHL $T1hi,32-6,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,6,$t0lo +|| LDW *${Xihi}[18],$T1hi +|| LDW *${Xilo}[18],$T1lo ; X[i+9] + XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14]) + +|| LDW *${Xihi}[0],$CHhi +|| LDW *${Xilo}[0],$CHlo ; X[i] +|| SHRU $T2hi,1,$S0hi +|| SHL $T2hi,32-1,$S0lo + SHRU $T2lo,1,$t0lo +|| SHL $T2lo,32-1,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2hi,8,$t0hi +|| SHL $T2hi,32-8,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2lo,8,$t0lo +|| SHL $T2lo,32-8,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1() +|| [B1] BNOP loop16_79? +|| SHRU $T2hi,7,$t0hi +|| SHL $T2hi,32-7,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i] +|| SHRU $T2lo,7,$t0lo + XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1] + + ADD $S0hi,$T1hi,$T1hi +|| ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0() +|| [B1] SUB B1,1,B1 + NOP ; avoid cross-path stall + ADD $T1carry,$T1hi,$T1hi +;;===== branch to loop16_79? is taken here + +break?: + ADD $Ahi,$Actxhi,$Ahi ; accumulate ctx +|| ADDU $Alo,$Actxlo,$Actxlo:$Alo +|| [A0] LDNDW *$INP++,B11:B10 ; pre-fetch input +|| [A0] ADDK -640,$K512 ; rewind pointer to K512 + ADD $Bhi,$Bctxhi,$Bhi +|| ADDU $Blo,$Bctxlo,$Bctxlo:$Blo +|| [A0] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0] + ADD $Chi,$Cctxhi,$Chi +|| ADDU $Clo,$Cctxlo,$Cctxlo:$Clo +|| ADD $Actxlo,$Ahi,$Ahi +||[!A0] MV $CTXA,$CTXB + ADD $Dhi,$Dctxhi,$Dhi +|| ADDU $Dlo,$Dctxlo,$Dctxlo:$Dlo +|| ADD $Bctxlo,$Bhi,$Bhi +||[!A0] STW $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx +||[!A0] STW $Alo,*${CTXB}[1^.LITTLE_ENDIAN] + ADD $Ehi,$Ectxhi,$Ehi +|| ADDU $Elo,$Ectxlo,$Ectxlo:$Elo +|| ADD $Cctxlo,$Chi,$Chi +|| [A0] BNOP outerloop? 
+||[!A0] STW $Bhi,*${CTXA}[2^.LITTLE_ENDIAN] +||[!A0] STW $Blo,*${CTXB}[3^.LITTLE_ENDIAN] + ADD $Fhi,$Fctxhi,$Fhi +|| ADDU $Flo,$Fctxlo,$Fctxlo:$Flo +|| ADD $Dctxlo,$Dhi,$Dhi +||[!A0] STW $Chi,*${CTXA}[4^.LITTLE_ENDIAN] +||[!A0] STW $Clo,*${CTXB}[5^.LITTLE_ENDIAN] + ADD $Ghi,$Gctxhi,$Ghi +|| ADDU $Glo,$Gctxlo,$Gctxlo:$Glo +|| ADD $Ectxlo,$Ehi,$Ehi +||[!A0] STW $Dhi,*${CTXA}[6^.LITTLE_ENDIAN] +||[!A0] STW $Dlo,*${CTXB}[7^.LITTLE_ENDIAN] + ADD $Hhi,$Hctxhi,$Hhi +|| ADDU $Hlo,$Hctxlo,$Hctxlo:$Hlo +|| ADD $Fctxlo,$Fhi,$Fhi +||[!A0] STW $Ehi,*${CTXA}[8^.LITTLE_ENDIAN] +||[!A0] STW $Elo,*${CTXB}[9^.LITTLE_ENDIAN] + ADD $Gctxlo,$Ghi,$Ghi +||[!A0] STW $Fhi,*${CTXA}[10^.LITTLE_ENDIAN] +||[!A0] STW $Flo,*${CTXB}[11^.LITTLE_ENDIAN] + ADD $Hctxlo,$Hhi,$Hhi +||[!A0] STW $Ghi,*${CTXA}[12^.LITTLE_ENDIAN] +||[!A0] STW $Glo,*${CTXB}[13^.LITTLE_ENDIAN] +;;===== branch to outerloop? is taken here + + STW $Hhi,*${CTXA}[14^.LITTLE_ENDIAN] +|| STW $Hlo,*${CTXB}[15^.LITTLE_ENDIAN] +|| MVK -40,B0 + ADD FP,B0,SP ; destroy circular buffer +|| LDDW *FP[-4],A11:A10 + LDDW *SP[2],A13:A12 +|| LDDW *FP[-2],B11:B10 + LDDW *SP[4],B13:B12 +|| BNOP RA + LDW *++SP(40),FP ; restore frame pointer + MVK 0,B0 + MVC B0,AMR ; clear AMR + NOP 2 ; wait till FP is committed + .endasmfunc + + .if __TI_EABI__ + .sect ".text:sha_asm.const" + .else + .sect ".const:sha_asm" + .endif + .align 128 +K512: + .uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd + .uword 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc + .uword 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 + .uword 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 + .uword 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe + .uword 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 + .uword 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 + .uword 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 + .uword 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 + .uword 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 + .uword 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 + .uword 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 + .uword 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 + .uword 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 + .uword 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 + .uword 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 + .uword 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 + .uword 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df + .uword 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 + .uword 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b + .uword 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 + .uword 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 + .uword 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 + .uword 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 + .uword 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 + .uword 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 + .uword 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb + .uword 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 + .uword 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 + .uword 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec + .uword 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 + .uword 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b + .uword 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 + .uword 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 + .uword 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 + .uword 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b + .uword 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 + .uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c + .uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a + .uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 + .cstring "SHA512 
block transform for C64x+, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512-ia64.pl b/openssl-1.1.0h/crypto/sha/asm/sha512-ia64.pl new file mode 100755 index 0000000..356a46a --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha512-ia64.pl @@ -0,0 +1,692 @@ +#! /usr/bin/env perl +# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256/512_Transform for Itanium. +# +# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50% +# faster than gcc and >60%(!) faster than code generated by HP-UX +# compiler (yes, HP-UX is generating slower code, because unlike gcc, +# it failed to deploy "shift right pair," 'shrp' instruction, which +# substitutes for 64-bit rotate). +# +# 924 cycles long sha256_block outperforms gcc by over factor of 2(!) +# and HP-UX compiler - by >40% (yes, gcc won sha512_block, but lost +# this one big time). Note that "formally" 924 is about 100 cycles +# too much. I mean it's 64 32-bit rounds vs. 80 virtually identical +# 64-bit ones and 1003*64/80 gives 802. Extra cycles, 2 per round, +# are spent on extra work to provide for 32-bit rotations. 32-bit +# rotations are still handled by 'shrp' instruction and for this +# reason lower 32 bits are deposited to upper half of 64-bit register +# prior 'shrp' issue. And in order to minimize the amount of such +# operations, X[16] values are *maintained* with copies of lower +# halves in upper halves, which is why you'll spot such instructions +# as custom 'mux2', "parallel 32-bit add," 'padd4' and "parallel +# 32-bit unsigned right shift," 'pshr4.u' instructions here. +# +# Rules of engagement. +# +# There is only one integer shifter meaning that if I have two rotate, +# deposit or extract instructions in adjacent bundles, they shall +# split [at run-time if they have to]. But note that variable and +# parallel shifts are performed by multi-media ALU and *are* pairable +# with rotates [and alike]. On the backside MMALU is rather slow: it +# takes 2 extra cycles before the result of integer operation is +# available *to* MMALU and 2(*) extra cycles before the result of MM +# operation is available "back" *to* integer ALU, not to mention that +# MMALU itself has 2 cycles latency. However! I explicitly scheduled +# these MM instructions to avoid MM stalls, so that all these extra +# latencies get "hidden" in instruction-level parallelism. +# +# (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule +# for 2 in order to provide for best *overall* performance, +# because on Itanium 1 stall on MM result is accompanied by +# pipeline flush, which takes 6 cycles:-( +# +# June 2012 +# +# Improve performance by 15-20%. Note about "rules of engagement" +# above. Contemporary cores are equipped with additional shifter, +# so that they should perform even better than below, presumably +# by ~10%. 
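The 'shrp'-based handling of 32-bit rotations described above can be illustrated in isolation: once the lower 32 bits of a word are copied into the upper half (which is what the custom 'mux2' usage maintains), a plain 64-bit right shift by n already leaves ROTR32(x,n) in the low 32 bits. Below is a minimal standalone Perl sketch of that identity; it is not part of the module, it assumes a 64-bit perl, and the helper names are illustrative only.

use strict;
use warnings;

# Textbook 32-bit rotate right.
sub rotr32 {
    my ($x, $n) = @_;
    return (($x >> $n) | ($x << (32 - $n))) & 0xffffffff;
}

# The trick described above: keep a copy of the low half in the high
# half (the job of 'mux2' in the 32-bit code path), then a 64-bit
# funnel shift right by $n (what 'shrp' provides) holds the rotation
# in its low word.
sub rotr32_via_shrp {
    my ($x, $n) = @_;
    my $xx = ($x << 32) | $x;           # low half duplicated into high half
    return ($xx >> $n) & 0xffffffff;    # low 32 bits == ROTR32($x,$n)
}

for my $n (1, 7, 14, 18, 25, 31) {
    my $x = 0xdeadbeef;
    die "mismatch for n=$n\n" if rotr32($x, $n) != rotr32_via_shrp($x, $n);
}
print "ROTR32 via duplicated halves and 64-bit shift verified\n";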
+# +###################################################################### +# Current performance in cycles per processed byte for Itanium 2 +# pre-9000 series [little-endian] system: +# +# SHA1(*) 5.7 +# SHA256 12.6 +# SHA512 6.7 +# +# (*) SHA1 result is presented purely for reference purposes. +# +# To generate code, pass the file name with either 256 or 512 in its +# name and compiler flags. + +$output=pop; + +if ($output =~ /512.*\.[s|asm]/) { + $SZ=8; + $BITS=8*$SZ; + $LDW="ld8"; + $STW="st8"; + $ADD="add"; + $SHRU="shr.u"; + $TABLE="K512"; + $func="sha512_block_data_order"; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; +} elsif ($output =~ /256.*\.[s|asm]/) { + $SZ=4; + $BITS=8*$SZ; + $LDW="ld4"; + $STW="st4"; + $ADD="padd4"; + $SHRU="pshr4.u"; + $TABLE="K256"; + $func="sha256_block_data_order"; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; +} else { die "nonsense $output"; } + +open STDOUT,">$output" || die "can't open $output: $!"; + +if ($^O eq "hpux") { + $ADDP="addp4"; + for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } +} else { $ADDP="add"; } +for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); + $big_endian=0 if (/\-DL_ENDIAN/); } +if (!defined($big_endian)) + { $big_endian=(unpack('L',pack('N',1))==1); } + +$code=<<___; +.ident \"$output, version 2.0\" +.ident \"IA-64 ISA artwork by Andy Polyakov \" +.explicit +.text + +pfssave=r2; +lcsave=r3; +prsave=r14; +K=r15; +A_=r16; B_=r17; C_=r18; D_=r19; +E_=r20; F_=r21; G_=r22; H_=r23; +T1=r24; T2=r25; +s0=r26; s1=r27; t0=r28; t1=r29; +Ktbl=r30; +ctx=r31; // 1st arg +input=r56; // 2nd arg +num=r57; // 3rd arg +sgm0=r58; sgm1=r59; // small constants + +// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host]) +.global $func# +.proc $func# +.align 32 +.skip 16 +$func: + .prologue + .save ar.pfs,pfssave +{ .mmi; alloc pfssave=ar.pfs,3,25,0,24 + $ADDP ctx=0,r32 // 1st arg + .save ar.lc,lcsave + mov lcsave=ar.lc } +{ .mmi; $ADDP input=0,r33 // 2nd arg + mov num=r34 // 3rd arg + .save pr,prsave + mov prsave=pr };; + + .body +{ .mib; add r8=0*$SZ,ctx + add r9=1*$SZ,ctx } +{ .mib; add r10=2*$SZ,ctx + add r11=3*$SZ,ctx };; + +// load A-H +.Lpic_point: +{ .mmi; $LDW A_=[r8],4*$SZ + $LDW B_=[r9],4*$SZ + mov Ktbl=ip } +{ .mmi; $LDW C_=[r10],4*$SZ + $LDW D_=[r11],4*$SZ + mov sgm0=$sigma0[2] };; +{ .mmi; $LDW E_=[r8] + $LDW F_=[r9] + add Ktbl=($TABLE#-.Lpic_point),Ktbl } +{ .mmi; $LDW G_=[r10] + $LDW H_=[r11] + cmp.ne p0,p16=0,r0 };; +___ +$code.=<<___ if ($BITS==64); +{ .mii; and r8=7,input + and input=~7,input;; + cmp.eq p9,p0=1,r8 } +{ .mmi; cmp.eq p10,p0=2,r8 + cmp.eq p11,p0=3,r8 + cmp.eq p12,p0=4,r8 } +{ .mmi; cmp.eq p13,p0=5,r8 + cmp.eq p14,p0=6,r8 + cmp.eq p15,p0=7,r8 };; +___ +$code.=<<___; +.L_outer: +.rotr R[8],X[16] +A=R[0]; B=R[1]; C=R[2]; D=R[3]; E=R[4]; F=R[5]; G=R[6]; H=R[7] +{ .mmi; ld1 X[15]=[input],$SZ // eliminated in sha512 + mov A=A_ + mov ar.lc=14 } +{ .mmi; mov B=B_ + mov C=C_ + mov D=D_ } +{ .mmi; mov E=E_ + mov F=F_ + mov ar.ec=2 };; +{ .mmi; mov G=G_ + mov H=H_ + mov sgm1=$sigma1[2] } +{ .mib; mov r8=0 + add r9=1-$SZ,input + brp.loop.imp .L_first16,.L_first16_end-16 };; +___ +$t0="A", $t1="E", $code.=<<___ if ($BITS==64); +// in sha512 case I load whole X[16] at once and take care of alignment... 
+{ .mmi; add r8=1*$SZ,input + add r9=2*$SZ,input + add r10=3*$SZ,input };; +{ .mmb; $LDW X[15]=[input],4*$SZ + $LDW X[14]=[r8],4*$SZ +(p9) br.cond.dpnt.many .L1byte };; +{ .mmb; $LDW X[13]=[r9],4*$SZ + $LDW X[12]=[r10],4*$SZ +(p10) br.cond.dpnt.many .L2byte };; +{ .mmb; $LDW X[11]=[input],4*$SZ + $LDW X[10]=[r8],4*$SZ +(p11) br.cond.dpnt.many .L3byte };; +{ .mmb; $LDW X[ 9]=[r9],4*$SZ + $LDW X[ 8]=[r10],4*$SZ +(p12) br.cond.dpnt.many .L4byte };; +{ .mmb; $LDW X[ 7]=[input],4*$SZ + $LDW X[ 6]=[r8],4*$SZ +(p13) br.cond.dpnt.many .L5byte };; +{ .mmb; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ +(p14) br.cond.dpnt.many .L6byte };; +{ .mmb; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ +(p15) br.cond.dpnt.many .L7byte };; +{ .mmb; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ } +{ .mib; mov r8=0 + mux1 X[15]=X[15],\@rev // eliminated on big-endian + br.many .L_first16 };; +.L1byte: +{ .mmi; $LDW X[13]=[r9],4*$SZ + $LDW X[12]=[r10],4*$SZ + shrp X[15]=X[15],X[14],56 };; +{ .mmi; $LDW X[11]=[input],4*$SZ + $LDW X[10]=[r8],4*$SZ + shrp X[14]=X[14],X[13],56 } +{ .mmi; $LDW X[ 9]=[r9],4*$SZ + $LDW X[ 8]=[r10],4*$SZ + shrp X[13]=X[13],X[12],56 };; +{ .mmi; $LDW X[ 7]=[input],4*$SZ + $LDW X[ 6]=[r8],4*$SZ + shrp X[12]=X[12],X[11],56 } +{ .mmi; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ + shrp X[11]=X[11],X[10],56 };; +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[10]=X[10],X[ 9],56 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[ 9]=X[ 9],X[ 8],56 };; +{ .mii; $LDW T1=[input] + shrp X[ 8]=X[ 8],X[ 7],56 + shrp X[ 7]=X[ 7],X[ 6],56 } +{ .mii; shrp X[ 6]=X[ 6],X[ 5],56 + shrp X[ 5]=X[ 5],X[ 4],56 };; +{ .mii; shrp X[ 4]=X[ 4],X[ 3],56 + shrp X[ 3]=X[ 3],X[ 2],56 } +{ .mii; shrp X[ 2]=X[ 2],X[ 1],56 + shrp X[ 1]=X[ 1],X[ 0],56 } +{ .mib; shrp X[ 0]=X[ 0],T1,56 } +{ .mib; mov r8=0 + mux1 X[15]=X[15],\@rev // eliminated on big-endian + br.many .L_first16 };; +.L2byte: +{ .mmi; $LDW X[11]=[input],4*$SZ + $LDW X[10]=[r8],4*$SZ + shrp X[15]=X[15],X[14],48 } +{ .mmi; $LDW X[ 9]=[r9],4*$SZ + $LDW X[ 8]=[r10],4*$SZ + shrp X[14]=X[14],X[13],48 };; +{ .mmi; $LDW X[ 7]=[input],4*$SZ + $LDW X[ 6]=[r8],4*$SZ + shrp X[13]=X[13],X[12],48 } +{ .mmi; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ + shrp X[12]=X[12],X[11],48 };; +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[11]=X[11],X[10],48 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[10]=X[10],X[ 9],48 };; +{ .mii; $LDW T1=[input] + shrp X[ 9]=X[ 9],X[ 8],48 + shrp X[ 8]=X[ 8],X[ 7],48 } +{ .mii; shrp X[ 7]=X[ 7],X[ 6],48 + shrp X[ 6]=X[ 6],X[ 5],48 };; +{ .mii; shrp X[ 5]=X[ 5],X[ 4],48 + shrp X[ 4]=X[ 4],X[ 3],48 } +{ .mii; shrp X[ 3]=X[ 3],X[ 2],48 + shrp X[ 2]=X[ 2],X[ 1],48 } +{ .mii; shrp X[ 1]=X[ 1],X[ 0],48 + shrp X[ 0]=X[ 0],T1,48 } +{ .mib; mov r8=0 + mux1 X[15]=X[15],\@rev // eliminated on big-endian + br.many .L_first16 };; +.L3byte: +{ .mmi; $LDW X[ 9]=[r9],4*$SZ + $LDW X[ 8]=[r10],4*$SZ + shrp X[15]=X[15],X[14],40 };; +{ .mmi; $LDW X[ 7]=[input],4*$SZ + $LDW X[ 6]=[r8],4*$SZ + shrp X[14]=X[14],X[13],40 } +{ .mmi; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ + shrp X[13]=X[13],X[12],40 };; +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[12]=X[12],X[11],40 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[11]=X[11],X[10],40 };; +{ .mii; $LDW T1=[input] + shrp X[10]=X[10],X[ 9],40 + shrp X[ 9]=X[ 9],X[ 8],40 } +{ .mii; shrp X[ 8]=X[ 8],X[ 7],40 + shrp X[ 7]=X[ 7],X[ 6],40 };; +{ .mii; shrp X[ 6]=X[ 6],X[ 5],40 + shrp X[ 5]=X[ 
5],X[ 4],40 } +{ .mii; shrp X[ 4]=X[ 4],X[ 3],40 + shrp X[ 3]=X[ 3],X[ 2],40 } +{ .mii; shrp X[ 2]=X[ 2],X[ 1],40 + shrp X[ 1]=X[ 1],X[ 0],40 } +{ .mib; shrp X[ 0]=X[ 0],T1,40 } +{ .mib; mov r8=0 + mux1 X[15]=X[15],\@rev // eliminated on big-endian + br.many .L_first16 };; +.L4byte: +{ .mmi; $LDW X[ 7]=[input],4*$SZ + $LDW X[ 6]=[r8],4*$SZ + shrp X[15]=X[15],X[14],32 } +{ .mmi; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ + shrp X[14]=X[14],X[13],32 };; +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[13]=X[13],X[12],32 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[12]=X[12],X[11],32 };; +{ .mii; $LDW T1=[input] + shrp X[11]=X[11],X[10],32 + shrp X[10]=X[10],X[ 9],32 } +{ .mii; shrp X[ 9]=X[ 9],X[ 8],32 + shrp X[ 8]=X[ 8],X[ 7],32 };; +{ .mii; shrp X[ 7]=X[ 7],X[ 6],32 + shrp X[ 6]=X[ 6],X[ 5],32 } +{ .mii; shrp X[ 5]=X[ 5],X[ 4],32 + shrp X[ 4]=X[ 4],X[ 3],32 } +{ .mii; shrp X[ 3]=X[ 3],X[ 2],32 + shrp X[ 2]=X[ 2],X[ 1],32 } +{ .mii; shrp X[ 1]=X[ 1],X[ 0],32 + shrp X[ 0]=X[ 0],T1,32 } +{ .mib; mov r8=0 + mux1 X[15]=X[15],\@rev // eliminated on big-endian + br.many .L_first16 };; +.L5byte: +{ .mmi; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ + shrp X[15]=X[15],X[14],24 };; +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[14]=X[14],X[13],24 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[13]=X[13],X[12],24 };; +{ .mii; $LDW T1=[input] + shrp X[12]=X[12],X[11],24 + shrp X[11]=X[11],X[10],24 } +{ .mii; shrp X[10]=X[10],X[ 9],24 + shrp X[ 9]=X[ 9],X[ 8],24 };; +{ .mii; shrp X[ 8]=X[ 8],X[ 7],24 + shrp X[ 7]=X[ 7],X[ 6],24 } +{ .mii; shrp X[ 6]=X[ 6],X[ 5],24 + shrp X[ 5]=X[ 5],X[ 4],24 } +{ .mii; shrp X[ 4]=X[ 4],X[ 3],24 + shrp X[ 3]=X[ 3],X[ 2],24 } +{ .mii; shrp X[ 2]=X[ 2],X[ 1],24 + shrp X[ 1]=X[ 1],X[ 0],24 } +{ .mib; shrp X[ 0]=X[ 0],T1,24 } +{ .mib; mov r8=0 + mux1 X[15]=X[15],\@rev // eliminated on big-endian + br.many .L_first16 };; +.L6byte: +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[15]=X[15],X[14],16 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[14]=X[14],X[13],16 };; +{ .mii; $LDW T1=[input] + shrp X[13]=X[13],X[12],16 + shrp X[12]=X[12],X[11],16 } +{ .mii; shrp X[11]=X[11],X[10],16 + shrp X[10]=X[10],X[ 9],16 };; +{ .mii; shrp X[ 9]=X[ 9],X[ 8],16 + shrp X[ 8]=X[ 8],X[ 7],16 } +{ .mii; shrp X[ 7]=X[ 7],X[ 6],16 + shrp X[ 6]=X[ 6],X[ 5],16 } +{ .mii; shrp X[ 5]=X[ 5],X[ 4],16 + shrp X[ 4]=X[ 4],X[ 3],16 } +{ .mii; shrp X[ 3]=X[ 3],X[ 2],16 + shrp X[ 2]=X[ 2],X[ 1],16 } +{ .mii; shrp X[ 1]=X[ 1],X[ 0],16 + shrp X[ 0]=X[ 0],T1,16 } +{ .mib; mov r8=0 + mux1 X[15]=X[15],\@rev // eliminated on big-endian + br.many .L_first16 };; +.L7byte: +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[15]=X[15],X[14],8 };; +{ .mii; $LDW T1=[input] + shrp X[14]=X[14],X[13],8 + shrp X[13]=X[13],X[12],8 } +{ .mii; shrp X[12]=X[12],X[11],8 + shrp X[11]=X[11],X[10],8 };; +{ .mii; shrp X[10]=X[10],X[ 9],8 + shrp X[ 9]=X[ 9],X[ 8],8 } +{ .mii; shrp X[ 8]=X[ 8],X[ 7],8 + shrp X[ 7]=X[ 7],X[ 6],8 } +{ .mii; shrp X[ 6]=X[ 6],X[ 5],8 + shrp X[ 5]=X[ 5],X[ 4],8 } +{ .mii; shrp X[ 4]=X[ 4],X[ 3],8 + shrp X[ 3]=X[ 3],X[ 2],8 } +{ .mii; shrp X[ 2]=X[ 2],X[ 1],8 + shrp X[ 1]=X[ 1],X[ 0],8 } +{ .mib; shrp X[ 0]=X[ 0],T1,8 } +{ .mib; mov r8=0 + mux1 X[15]=X[15],\@rev };; // eliminated on big-endian + +.align 32 +.L_first16: +{ .mmi; $LDW K=[Ktbl],$SZ + add A=A,r8 // H+=Sigma(0) from the past + _rotr r10=$t1,$Sigma1[0] } // ROTR(e,14) +{ .mmi; and T1=F,E + andcm r8=G,E + (p16) 
mux1 X[14]=X[14],\@rev };; // eliminated on big-endian +{ .mmi; and T2=A,B + and r9=A,C + _rotr r11=$t1,$Sigma1[1] } // ROTR(e,41) +{ .mmi; xor T1=T1,r8 // T1=((e & f) ^ (~e & g)) + and r8=B,C };; +___ +$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32); +.align 32 +.L_first16: +{ .mmi; add A=A,r8 // H+=Sigma(0) from the past + add r10=2-$SZ,input + add r11=3-$SZ,input };; +{ .mmi; ld1 r9=[r9] + ld1 r10=[r10] + dep.z $t1=E,32,32 } +{ .mmi; ld1 r11=[r11] + $LDW K=[Ktbl],$SZ + zxt4 E=E };; +{ .mii; or $t1=$t1,E + dep X[15]=X[15],r9,8,8 + mux2 $t0=A,0x44 };; // copy lower half to upper +{ .mmi; and T1=F,E + andcm r8=G,E + dep r11=r10,r11,8,8 };; +{ .mmi; and T2=A,B + and r9=A,C + dep X[15]=X[15],r11,16,16 };; +{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch + xor T1=T1,r8 // T1=((e & f) ^ (~e & g)) + _rotr r10=$t1,$Sigma1[0] } // ROTR(e,14) +{ .mmi; and r8=B,C + _rotr r11=$t1,$Sigma1[1] };; // ROTR(e,18) +___ +$code.=<<___; +{ .mmi; add T1=T1,H // T1=Ch(e,f,g)+h + xor r10=r10,r11 + _rotr r11=$t1,$Sigma1[2] } // ROTR(e,41) +{ .mmi; xor T2=T2,r9 + add K=K,X[15] };; +{ .mmi; add T1=T1,K // T1+=K[i]+X[i] + xor T2=T2,r8 // T2=((a & b) ^ (a & c) ^ (b & c)) + _rotr r8=$t0,$Sigma0[0] } // ROTR(a,28) +{ .mmi; xor r11=r11,r10 // Sigma1(e) + _rotr r9=$t0,$Sigma0[1] };; // ROTR(a,34) +{ .mmi; add T1=T1,r11 // T+=Sigma1(e) + xor r8=r8,r9 + _rotr r9=$t0,$Sigma0[2] };; // ROTR(a,39) +{ .mmi; xor r8=r8,r9 // Sigma0(a) + add D=D,T1 + mux2 H=X[15],0x44 } // mov H=X[15] in sha512 +{ .mib; (p16) add r9=1-$SZ,input // not used in sha512 + add X[15]=T1,T2 // H=T1+Maj(a,b,c) + br.ctop.sptk .L_first16 };; +.L_first16_end: + +{ .mib; mov ar.lc=$rounds-17 + brp.loop.imp .L_rest,.L_rest_end-16 } +{ .mib; mov ar.ec=1 + br.many .L_rest };; + +.align 32 +.L_rest: +{ .mmi; $LDW K=[Ktbl],$SZ + add A=A,r8 // H+=Sigma0(a) from the past + _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1) +{ .mmi; add X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF] + $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7 +{ .mib; and T1=F,E + _rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8) +{ .mib; andcm r10=G,E + $SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6 +// Pair of mmi; splits on Itanium 1 and prevents pipeline flush +// upon $SHRU output usage +{ .mmi; xor T1=T1,r10 // T1=((e & f) ^ (~e & g)) + xor r9=r8,r9 + _rotr r10=X[15-14],$sigma1[0] }// ROTR(s1,19) +{ .mmi; and T2=A,B + and r8=A,C + _rotr r11=X[15-14],$sigma1[1] };;// ROTR(s1,61) +___ +$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32); +{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF]) + dep.z $t1=E,32,32 } +{ .mib; xor r10=r11,r10 + zxt4 E=E };; +{ .mii; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF]) + shrp r9=E,$t1,32+$Sigma1[0] // ROTR(e,14) + mux2 $t0=A,0x44 };; // copy lower half to upper +// Pair of mmi; splits on Itanium 1 and prevents pipeline flush +// upon mux2 output usage +{ .mmi; xor T2=T2,r8 + shrp r8=E,$t1,32+$Sigma1[1]} // ROTR(e,18) +{ .mmi; and r10=B,C + add T1=T1,H // T1=Ch(e,f,g)+h + or $t1=$t1,E };; +___ +$t0="A", $t1="E", $code.=<<___ if ($BITS==64); +{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF]) + _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14) +{ .mib; xor r10=r11,r10 + xor T2=T2,r8 };; +{ .mib; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF]) + _rotr r8=$t1,$Sigma1[1] } // ROTR(e,18) +{ .mib; and r10=B,C + add T1=T1,H };; // T1+=H +___ +$code.=<<___; +{ .mib; xor r9=r9,r8 + _rotr r8=$t1,$Sigma1[2] } // ROTR(e,41) +{ .mib; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c)) + add X[15]=X[15],s0 };; // X[i]+=sigma0(X[i+1]) +{ .mmi; xor r9=r9,r8 // Sigma1(e) + add X[15]=X[15],s1 // 
X[i]+=sigma0(X[i+14]) + _rotr r8=$t0,$Sigma0[0] };; // ROTR(a,28) +{ .mmi; add K=K,X[15] + add T1=T1,r9 // T1+=Sigma1(e) + _rotr r9=$t0,$Sigma0[1] };; // ROTR(a,34) +{ .mmi; add T1=T1,K // T1+=K[i]+X[i] + xor r8=r8,r9 + _rotr r9=$t0,$Sigma0[2] };; // ROTR(a,39) +{ .mib; add D=D,T1 + mux2 H=X[15],0x44 } // mov H=X[15] in sha512 +{ .mib; xor r8=r8,r9 // Sigma0(a) + add X[15]=T1,T2 // H=T1+Maj(a,b,c) + br.ctop.sptk .L_rest };; +.L_rest_end: + +{ .mmi; add A=A,r8 };; // H+=Sigma0(a) from the past +{ .mmi; add A_=A_,A + add B_=B_,B + add C_=C_,C } +{ .mmi; add D_=D_,D + add E_=E_,E + cmp.ltu p16,p0=1,num };; +{ .mmi; add F_=F_,F + add G_=G_,G + add H_=H_,H } +{ .mmb; add Ktbl=-$SZ*$rounds,Ktbl +(p16) add num=-1,num +(p16) br.dptk.many .L_outer };; + +{ .mib; add r8=0*$SZ,ctx + add r9=1*$SZ,ctx } +{ .mib; add r10=2*$SZ,ctx + add r11=3*$SZ,ctx };; +{ .mmi; $STW [r8]=A_,4*$SZ + $STW [r9]=B_,4*$SZ + mov ar.lc=lcsave } +{ .mmi; $STW [r10]=C_,4*$SZ + $STW [r11]=D_,4*$SZ + mov pr=prsave,0x1ffff };; +{ .mmb; $STW [r8]=E_ + $STW [r9]=F_ } +{ .mmb; $STW [r10]=G_ + $STW [r11]=H_ + br.ret.sptk.many b0 };; +.endp $func# +___ + +foreach(split($/,$code)) { + s/\`([^\`]*)\`/eval $1/gem; + s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm; + if ($BITS==64) { + s/mux2(\s+)([^=]+)=([^,]+),\S+/mov$1 $2=$3/gm; + s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian); + s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm + if (!$big_endian); + s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm; + } + + print $_,"\n"; +} + +print<<___ if ($BITS==32); +.align 64 +.type K256#,\@object +K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + data4 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + data4 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + data4 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + data4 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + data4 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + data4 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + data4 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + data4 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256#,$SZ*$rounds +stringz "SHA256 block transform for IA64, CRYPTOGAMS by " +___ +print<<___ if ($BITS==64); +.align 64 +.type K512#,\@object +K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd + data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + data8 0x3956c25bf348b538,0x59f111f1b605d019 + data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + data8 0xd807aa98a3030242,0x12835b0145706fbe + data8 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + data8 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + data8 0x9bdc06a725c71235,0xc19bf174cf692694 + data8 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + data8 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + data8 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + data8 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + data8 0x983e5152ee66dfab,0xa831c66d2db43210 + data8 0xb00327c898fb213f,0xbf597fc7beef0ee4 + data8 0xc6e00bf33da88fc2,0xd5a79147930aa725 + data8 0x06ca6351e003826f,0x142929670a0e6e70 + data8 0x27b70a8546d22ffc,0x2e1b21385c26c926 + data8 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + data8 0x650a73548baf63de,0x766a0abb3c77b2a8 + data8 0x81c2c92e47edaee6,0x92722c851482353b + data8 0xa2bfe8a14cf10364,0xa81a664bbc423001 + data8 
0xc24b8b70d0f89791,0xc76c51a30654be30 + data8 0xd192e819d6ef5218,0xd69906245565a910 + data8 0xf40e35855771202a,0x106aa07032bbd1b8 + data8 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + data8 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + data8 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + data8 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + data8 0x748f82ee5defb2fc,0x78a5636f43172f60 + data8 0x84c87814a1f0ab72,0x8cc702081a6439ec + data8 0x90befffa23631e28,0xa4506cebde82bde9 + data8 0xbef9a3f7b2c67915,0xc67178f2e372532b + data8 0xca273eceea26619c,0xd186b8c721c0c207 + data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + data8 0x06f067aa72176fba,0x0a637dc5a2c898a6 + data8 0x113f9804bef90dae,0x1b710b35131c471b + data8 0x28db77f523047d84,0x32caab7b40c72493 + data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.size K512#,$SZ*$rounds +stringz "SHA512 block transform for IA64, CRYPTOGAMS by " +___ diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512-mips.pl b/openssl-1.1.0h/crypto/sha/asm/sha512-mips.pl new file mode 100644 index 0000000..5c2d23f --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha512-mips.pl @@ -0,0 +1,519 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA2 block procedures for MIPS. + +# October 2010. +# +# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc- +# generated code in o32 build and ~55% in n32/64 build. SHA512 [which +# for now can only be compiled for MIPS64 ISA] improvement is modest +# ~17%, but it comes for free, because it's same instruction sequence. +# Improvement coefficients are for aligned input. + +# September 2012. +# +# Add MIPS[32|64]R2 code (>25% less instructions). + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. 
Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_LA="dla"; + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $PTR_SLL="dsll"; # incidentally works even on n32 + $SZREG=8; +} else { + $PTR_LA="la"; + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $PTR_SLL="sll"; + $SZREG=4; +} +$pf = ($flavour =~ /nubi/i) ? $t0 : $t2; +# +# +# +###################################################################### + +$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC}); + +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } + +if ($output =~ /512/) { + $label="512"; + $SZ=8; + $LD="ld"; # load from memory + $ST="sd"; # store to memory + $SLL="dsll"; # shift left logical + $SRL="dsrl"; # shift right logical + $ADDU="daddu"; + $ROTR="drotr"; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=( 7, 1, 8); # right shift first + @sigma1=( 6,19,61); # right shift first + $lastK=0x817; + $rounds=80; +} else { + $label="256"; + $SZ=4; + $LD="lw"; # load from memory + $ST="sw"; # store to memory + $SLL="sll"; # shift left logical + $SRL="srl"; # shift right logical + $ADDU="addu"; + $ROTR="rotr"; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 3, 7,18); # right shift first + @sigma1=(10,17,19); # right shift first + $lastK=0x8f2; + $rounds=64; +} + +$MSB = $big_endian ? 
0 : ($SZ-1); +$LSB = ($SZ-1)&~$MSB; + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31)); +@X=map("\$$_",(8..23)); + +$ctx=$a0; +$inp=$a1; +$len=$a2; $Ktbl=$len; + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]); + +$code.=<<___ if ($i<15); + ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp) + ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp) +___ +$code.=<<___ if (!$big_endian && $i<16 && $SZ==4); +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + wsbh @X[0],@X[0] # byte swap($i) + rotr @X[0],@X[0],16 +#else + srl $tmp0,@X[0],24 # byte swap($i) + srl $tmp1,@X[0],8 + andi $tmp2,@X[0],0xFF00 + sll @X[0],@X[0],24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or @X[0],$tmp0 + or $tmp1,$tmp2 + or @X[0],$tmp1 +#endif +___ +$code.=<<___ if (!$big_endian && $i<16 && $SZ==8); +#if defined(_MIPS_ARCH_MIPS64R2) + dsbh @X[0],@X[0] # byte swap($i) + dshd @X[0],@X[0] +#else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + and $tmp1,@X[0],$tmp0 # byte swap($i) + dsrl $tmp2,@X[0],24 + dsll $tmp1,24 + and $tmp2,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + and $tmp2,@X[0],$tmp0 + dsrl @X[0],8 + dsll $tmp2,8 + and @X[0],$tmp0 + or $tmp1,$tmp2 + or @X[0],$tmp1 + dsrl $tmp1,@X[0],32 + dsll @X[0],32 + or @X[0],$tmp1 +#endif +___ +$code.=<<___; +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + xor $tmp2,$f,$g # $i + $ROTR $tmp0,$e,@Sigma1[0] + $ADDU $T1,$X[0],$h + $ROTR $tmp1,$e,@Sigma1[1] + and $tmp2,$e + $ROTR $h,$e,@Sigma1[2] + xor $tmp0,$tmp1 + $ROTR $tmp1,$a,@Sigma0[0] + xor $tmp2,$g # Ch(e,f,g) + xor $tmp0,$h # Sigma1(e) + + $ROTR $h,$a,@Sigma0[1] + $ADDU $T1,$tmp2 + $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i] + xor $h,$tmp1 + $ROTR $tmp1,$a,@Sigma0[2] + $ADDU $T1,$tmp0 + and $tmp0,$b,$c + xor $h,$tmp1 # Sigma0(a) + xor $tmp1,$b,$c +#else + $ADDU $T1,$X[0],$h # $i + $SRL $h,$e,@Sigma1[0] + xor $tmp2,$f,$g + $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]` + and $tmp2,$e + $SRL $tmp0,$e,@Sigma1[1] + xor $h,$tmp1 + $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]` + xor $h,$tmp0 + $SRL $tmp0,$e,@Sigma1[2] + xor $h,$tmp1 + $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]` + xor $h,$tmp0 + xor $tmp2,$g # Ch(e,f,g) + xor $tmp0,$tmp1,$h # Sigma1(e) + + $SRL $h,$a,@Sigma0[0] + $ADDU $T1,$tmp2 + $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i] + $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]` + $ADDU $T1,$tmp0 + $SRL $tmp0,$a,@Sigma0[1] + xor $h,$tmp1 + $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]` + xor $h,$tmp0 + $SRL $tmp0,$a,@Sigma0[2] + xor $h,$tmp1 + $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]` + xor $h,$tmp0 + and $tmp0,$b,$c + xor $h,$tmp1 # Sigma0(a) + xor $tmp1,$b,$c +#endif + $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer + $ADDU $h,$tmp0 + and $tmp1,$a + $ADDU $T1,$tmp2 # +=K[$i] + $ADDU $h,$tmp1 # +=Maj(a,b,c) + $ADDU $d,$T1 + $ADDU $h,$T1 +___ +$code.=<<___ if ($i>=13); + $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer +___ +} + +sub BODY_16_XX { +my $i=@_[0]; +my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]); + +$code.=<<___; +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i) + $ROTR $tmp0,@X[1],@sigma0[1] + $ADDU @X[0],@X[9] # +=X[i+9] + xor $tmp2,$tmp0 + $ROTR $tmp0,@X[1],@sigma0[2] + + $SRL $tmp3,@X[14],@sigma1[0] + $ROTR $tmp1,@X[14],@sigma1[1] + xor $tmp2,$tmp0 # sigma0(X[i+1]) + $ROTR $tmp0,@X[14],@sigma1[2] + xor $tmp3,$tmp1 + $ADDU @X[0],$tmp2 +#else + $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i) + $ADDU @X[0],@X[9] # +=X[i+9] + $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]` + $SRL $tmp0,@X[1],@sigma0[1] 
+ xor $tmp2,$tmp1 + $SLL $tmp1,`@sigma0[2]-@sigma0[1]` + xor $tmp2,$tmp0 + $SRL $tmp0,@X[1],@sigma0[2] + xor $tmp2,$tmp1 + + $SRL $tmp3,@X[14],@sigma1[0] + xor $tmp2,$tmp0 # sigma0(X[i+1]) + $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]` + $ADDU @X[0],$tmp2 + $SRL $tmp0,@X[14],@sigma1[1] + xor $tmp3,$tmp1 + $SLL $tmp1,`@sigma1[2]-@sigma1[1]` + xor $tmp3,$tmp0 + $SRL $tmp0,@X[14],@sigma1[2] + xor $tmp3,$tmp1 +#endif + xor $tmp3,$tmp0 # sigma1(X[i+14]) + $ADDU @X[0],$tmp3 +___ + &BODY_00_15(@_); +} + +$FRAMESIZE=16*$SZ+16*$SZREG; +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000"; + +$code.=<<___; +#ifdef OPENSSL_FIPSCANISTER +# include +#endif + +#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2) +#define _MIPS_ARCH_MIPS32R2 +#endif + +.text +.set noat +#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__)) +.option pic2 +#endif + +.align 5 +.globl sha${label}_block_data_order +.ent sha${label}_block_data_order +sha${label}_block_data_order: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_S $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_S $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_S $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)` +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Ktbl + .cpsetup $pf,$zero,sha${label}_block_data_order +___ +$code.=<<___; + .set reorder + $PTR_LA $Ktbl,K${label} # PIC-ified 'load address' + + $LD $A,0*$SZ($ctx) # load context + $LD $B,1*$SZ($ctx) + $LD $C,2*$SZ($ctx) + $LD $D,3*$SZ($ctx) + $LD $E,4*$SZ($ctx) + $LD $F,5*$SZ($ctx) + $LD $G,6*$SZ($ctx) + $LD $H,7*$SZ($ctx) + + $PTR_ADD @X[15],$inp # pointer to the end of input + $REG_S @X[15],16*$SZ($sp) + b .Loop + +.align 5 +.Loop: + ${LD}l @X[0],$MSB($inp) + ${LD}r @X[0],$LSB($inp) +___ +for ($i=0;$i<16;$i++) +{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } +$code.=<<___; + b .L16_xx +.align 4 +.L16_xx: +___ +for (;$i<32;$i++) +{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } +$code.=<<___; + and @X[6],0xfff + li @X[7],$lastK + .set noreorder + bne @X[6],@X[7],.L16_xx + $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16 + + $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input + $LD @X[0],0*$SZ($ctx) + $LD @X[1],1*$SZ($ctx) + $LD @X[2],2*$SZ($ctx) + $PTR_ADD $inp,16*$SZ + $LD @X[3],3*$SZ($ctx) + $ADDU $A,@X[0] + $LD @X[4],4*$SZ($ctx) + $ADDU $B,@X[1] + $LD @X[5],5*$SZ($ctx) + $ADDU $C,@X[2] + $LD @X[6],6*$SZ($ctx) + $ADDU $D,@X[3] + $LD @X[7],7*$SZ($ctx) + $ADDU $E,@X[4] + $ST $A,0*$SZ($ctx) + $ADDU $F,@X[5] + $ST $B,1*$SZ($ctx) + $ADDU $G,@X[6] + $ST $C,2*$SZ($ctx) + $ADDU $H,@X[7] + $ST $D,3*$SZ($ctx) + $ST $E,4*$SZ($ctx) + $ST $F,5*$SZ($ctx) + $ST $G,6*$SZ($ctx) + $ST $H,7*$SZ($ctx) + + bne $inp,@X[15],.Loop + $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl + + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + 
$REG_L $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end sha${label}_block_data_order + +.rdata +.align 5 +K${label}: +___ +if ($SZ==4) { +$code.=<<___; + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +___ +} else { +$code.=<<___; + .dword 0x428a2f98d728ae22, 0x7137449123ef65cd + .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .dword 0x3956c25bf348b538, 0x59f111f1b605d019 + .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .dword 0xd807aa98a3030242, 0x12835b0145706fbe + .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 + .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 + .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .dword 0x06ca6351e003826f, 0x142929670a0e6e70 + .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .dword 0x81c2c92e47edaee6, 0x92722c851482353b + .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .dword 0xd192e819d6ef5218, 0xd69906245565a910 + .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 + .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .dword 0x90befffa23631e28, 0xa4506cebde82bde9 + .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .dword 0xca273eceea26619c, 0xd186b8c721c0c207 + .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .dword 0x113f9804bef90dae, 0x1b710b35131c471b + .dword 0x28db77f523047d84, 0x32caab7b40c72493 + .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .dword 0x5fcb6fab3ad6faec, 
0x6c44198c4a475817 +___ +} +$code.=<<___; +.asciiz "SHA${label} for MIPS, CRYPTOGAMS by " +.align 5 + +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512-parisc.pl b/openssl-1.1.0h/crypto/sha/asm/sha512-parisc.pl new file mode 100755 index 0000000..fcb6157 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha512-parisc.pl @@ -0,0 +1,800 @@ +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA256/512 block procedure for PA-RISC. + +# June 2009. +# +# SHA256 performance is >75% better than gcc 3.2 generated code on +# PA-7100LC. Compared to code generated by vendor compiler this +# implementation is almost 70% faster in 64-bit build, but delivers +# virtually same performance in 32-bit build on PA-8600. +# +# SHA512 performance is >2.9x better than gcc 3.2 generated code on +# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the +# code is executed on PA-RISC 2.0 processor and switches to 64-bit +# code path delivering adequate performance even in "blended" 32-bit +# build. Though 64-bit code is not any faster than code generated by +# vendor compiler on PA-8600... +# +# Special thanks to polarhome.com for providing HP-UX account. 
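The @Sigma0/@Sigma1/@sigma0/@sigma1 arrays assigned just below are simply the FIPS 180-4 rotate/shift amounts; every _ror/_shr pair in the round bodies expands to one of these four functions. As a minimal reference sketch, plain Perl rather than anything taken from this module, here is the SHA-256 flavour (the SHA-512 one has the same shape, just 64-bit wide, with amounts 28/34/39, 14/18/41, rotates 1/8 plus shift 7, and rotates 19/61 plus shift 6):

    # Reader's sketch, not module code; assumes a perl with 64-bit integers,
    # with 0xffffffff masking standing in for 32-bit registers.
    sub rotr32 { my ($x, $n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }
    sub Sigma0 { my $x = shift; rotr32($x, 2)  ^ rotr32($x, 13) ^ rotr32($x, 22) }
    sub Sigma1 { my $x = shift; rotr32($x, 6)  ^ rotr32($x, 11) ^ rotr32($x, 25) }
    sub sigma0 { my $x = shift; rotr32($x, 7)  ^ rotr32($x, 18) ^ ($x >> 3)  }
    sub sigma1 { my $x = shift; rotr32($x, 17) ^ rotr32($x, 19) ^ ($x >> 10) }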
+ +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +if ($output =~ /512/) { + $func="sha512_block_data_order"; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $LAST10BITS=0x017; + $LD="ldd"; + $LDM="ldd,ma"; + $ST="std"; +} else { + $func="sha256_block_data_order"; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $LAST10BITS=0x0f2; + $LD="ldw"; + $LDM="ldwm"; + $ST="stw"; +} + +$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker + # [+ argument transfer] +$XOFF=16*$SZ+32; # local variables +$FRAME+=$XOFF; +$XOFF+=$FRAME_MARKER; # distance between %sp and local variables + +$ctx="%r26"; # zapped by $a0 +$inp="%r25"; # zapped by $a1 +$num="%r24"; # zapped by $t0 + +$a0 ="%r26"; +$a1 ="%r25"; +$t0 ="%r24"; +$t1 ="%r29"; +$Tbl="%r31"; + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28"); + +@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp); + +sub ROUND_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +$code.=<<___; + _ror $e,$Sigma1[0],$a0 + and $f,$e,$t0 + _ror $e,$Sigma1[1],$a1 + addl $t1,$h,$h + andcm $g,$e,$t1 + xor $a1,$a0,$a0 + _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1 + or $t0,$t1,$t1 ; Ch(e,f,g) + addl @X[$i%16],$h,$h + xor $a0,$a1,$a1 ; Sigma1(e) + addl $t1,$h,$h + _ror $a,$Sigma0[0],$a0 + addl $a1,$h,$h + + _ror $a,$Sigma0[1],$a1 + and $a,$b,$t0 + and $a,$c,$t1 + xor $a1,$a0,$a0 + _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $t1,$t0,$t0 + and $b,$c,$t1 + xor $a0,$a1,$a1 ; Sigma0(a) + addl $h,$d,$d + xor $t1,$t0,$t0 ; Maj(a,b,c) + `"$LDM $SZ($Tbl),$t1" if ($i<15)` + addl $a1,$h,$h + addl $t0,$h,$h + +___ +} + +sub ROUND_16_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +$i-=16; +$code.=<<___; + _ror @X[($i+1)%16],$sigma0[0],$a0 + _ror @X[($i+1)%16],$sigma0[1],$a1 + addl @X[($i+9)%16],@X[$i],@X[$i] + _ror @X[($i+14)%16],$sigma1[0],$t0 + _ror @X[($i+14)%16],$sigma1[1],$t1 + xor $a1,$a0,$a0 + _shr @X[($i+1)%16],$sigma0[2],$a1 + xor $t1,$t0,$t0 + _shr @X[($i+14)%16],$sigma1[2],$t1 + xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f]) + xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f]) + $LDM $SZ($Tbl),$t1 + addl $a0,@X[$i],@X[$i] + addl $t0,@X[$i],@X[$i] +___ +$code.=<<___ if ($i==15); + extru $t1,31,10,$a1 + comiclr,<> $LAST10BITS,$a1,%r0 + ldo 1($Tbl),$Tbl ; signal end of $Tbl +___ +&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h); +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .ALIGN 64 +L\$table +___ +$code.=<<___ if ($SZ==8); + .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd + .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc + .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019 + .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118 + .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe + .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2 + .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1 + .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694 + .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3 + .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65 + .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483 
+ .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5 + .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210 + .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4 + .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725 + .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70 + .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926 + .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df + .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8 + .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b + .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001 + .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30 + .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910 + .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8 + .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53 + .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8 + .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb + .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3 + .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60 + .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec + .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9 + .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b + .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207 + .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178 + .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6 + .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b + .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493 + .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c + .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a + .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817 +___ +$code.=<<___ if ($SZ==4); + .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +___ +$code.=<<___; + + .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 64 +$func + .PROC + .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) + $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) + + _shl $num,`log(16*$SZ)/log(2)`,$num + addl $inp,$num,$num ; $num to point at the end of $inp + + $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) + $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp) + + blr %r0,$Tbl + ldi 3,$t1 +L\$pic + andcm $Tbl,$t1,$Tbl ; wipe privilege level + ldo 
L\$table-L\$pic($Tbl),$Tbl +___ +$code.=<<___ if ($SZ==8 && $SIZE_T==4); + ldi 31,$t1 + mtctl $t1,%cr11 + extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0 + b L\$parisc1 + nop +___ +$code.=<<___; + $LD `0*$SZ`($ctx),$A ; load context + $LD `1*$SZ`($ctx),$B + $LD `2*$SZ`($ctx),$C + $LD `3*$SZ`($ctx),$D + $LD `4*$SZ`($ctx),$E + $LD `5*$SZ`($ctx),$F + $LD `6*$SZ`($ctx),$G + $LD `7*$SZ`($ctx),$H + + extru $inp,31,`log($SZ)/log(2)`,$t0 + sh3addl $t0,%r0,$t0 + subi `8*$SZ`,$t0,$t0 + mtctl $t0,%cr11 ; load %sar with align factor + +L\$oop + ldi `$SZ-1`,$t0 + $LDM $SZ($Tbl),$t1 + andcm $inp,$t0,$t0 ; align $inp +___ + for ($i=0;$i<15;$i++) { # load input block + $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; } +$code.=<<___; + cmpb,*= $inp,$t0,L\$aligned + $LD `$SZ*15`($t0),@X[15] + $LD `$SZ*16`($t0),@X[16] +___ + for ($i=0;$i<16;$i++) { # align data + $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; } +$code.=<<___; +L\$aligned + nop ; otherwise /usr/ccs/bin/as is confused by below .WORD +___ + +for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +L\$rounds + nop ; otherwise /usr/ccs/bin/as is confused by below .WORD +___ +for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled? + nop + + $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments + $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp + $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num + ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl + + $LD `0*$SZ`($ctx),@X[0] ; load context + $LD `1*$SZ`($ctx),@X[1] + $LD `2*$SZ`($ctx),@X[2] + $LD `3*$SZ`($ctx),@X[3] + $LD `4*$SZ`($ctx),@X[4] + $LD `5*$SZ`($ctx),@X[5] + addl @X[0],$A,$A + $LD `6*$SZ`($ctx),@X[6] + addl @X[1],$B,$B + $LD `7*$SZ`($ctx),@X[7] + ldo `16*$SZ`($inp),$inp ; advance $inp + + $ST $A,`0*$SZ`($ctx) ; save context + addl @X[2],$C,$C + $ST $B,`1*$SZ`($ctx) + addl @X[3],$D,$D + $ST $C,`2*$SZ`($ctx) + addl @X[4],$E,$E + $ST $D,`3*$SZ`($ctx) + addl @X[5],$F,$F + $ST $E,`4*$SZ`($ctx) + addl @X[6],$G,$G + $ST $F,`5*$SZ`($ctx) + addl @X[7],$H,$H + $ST $G,`6*$SZ`($ctx) + $ST $H,`7*$SZ`($ctx) + + cmpb,*<>,n $inp,$num,L\$oop + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp +___ +if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0 +{{ +$code.=<<___; + b L\$done + nop + + .ALIGN 64 +L\$parisc1 +___ + +@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo, + $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) = + ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16"); +$a0 ="%r17"; +$a1 ="%r18"; +$a2 ="%r19"; +$a3 ="%r20"; +$t0 ="%r21"; +$t1 ="%r22"; +$t2 ="%r28"; +$t3 ="%r29"; +$Tbl="%r31"; + +@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx + +sub ROUND_00_15_pa1 { +my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo, + $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_; +my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; + +$code.=<<___ if (!$flag); + ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi + ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] +___ +$code.=<<___; + shd $ehi,$elo,$Sigma1[0],$t0 + add $Xlo,$hlo,$hlo + shd $elo,$ehi,$Sigma1[0],$t1 + addc $Xhi,$hhi,$hhi ; h += X[i] + shd $ehi,$elo,$Sigma1[1],$t2 + ldwm 8($Tbl),$Xhi + shd $elo,$ehi,$Sigma1[1],$t3 + ldw -4($Tbl),$Xlo ; load K[i] + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 + and $flo,$elo,$a0 + and $fhi,$ehi,$a1 + shd $ehi,$elo,$Sigma1[2],$t2 + andcm $glo,$elo,$a2 + shd $elo,$ehi,$Sigma1[2],$t3 + andcm $ghi,$ehi,$a3 + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 ; Sigma1(e) + add $Xlo,$hlo,$hlo + xor $a2,$a0,$a0 + 
addc $Xhi,$hhi,$hhi ; h += K[i] + xor $a3,$a1,$a1 ; Ch(e,f,g) + + add $t0,$hlo,$hlo + shd $ahi,$alo,$Sigma0[0],$t0 + addc $t1,$hhi,$hhi ; h += Sigma1(e) + shd $alo,$ahi,$Sigma0[0],$t1 + add $a0,$hlo,$hlo + shd $ahi,$alo,$Sigma0[1],$t2 + addc $a1,$hhi,$hhi ; h += Ch(e,f,g) + shd $alo,$ahi,$Sigma0[1],$t3 + + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 + shd $ahi,$alo,$Sigma0[2],$t2 + and $alo,$blo,$a0 + shd $alo,$ahi,$Sigma0[2],$t3 + and $ahi,$bhi,$a1 + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 ; Sigma0(a) + + and $alo,$clo,$a2 + and $ahi,$chi,$a3 + xor $a2,$a0,$a0 + add $hlo,$dlo,$dlo + xor $a3,$a1,$a1 + addc $hhi,$dhi,$dhi ; d += h + and $blo,$clo,$a2 + add $t0,$hlo,$hlo + and $bhi,$chi,$a3 + addc $t1,$hhi,$hhi ; h += Sigma0(a) + xor $a2,$a0,$a0 + add $a0,$hlo,$hlo + xor $a3,$a1,$a1 ; Maj(a,b,c) + addc $a1,$hhi,$hhi ; h += Maj(a,b,c) + +___ +$code.=<<___ if ($i==15 && $flag); + extru $Xlo,31,10,$Xlo + comiclr,= $LAST10BITS,$Xlo,%r0 + b L\$rounds_pa1 + nop +___ +push(@X,shift(@X)); push(@X,shift(@X)); +} + +sub ROUND_16_xx_pa1 { +my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; +my ($i)=shift; +$i-=16; +$code.=<<___; + ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi + ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] + ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1 + ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9] + ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3 + ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14] + shd $Xnhi,$Xnlo,$sigma0[0],$t0 + shd $Xnlo,$Xnhi,$sigma0[0],$t1 + add $a0,$Xlo,$Xlo + shd $Xnhi,$Xnlo,$sigma0[1],$t2 + addc $a1,$Xhi,$Xhi + shd $Xnlo,$Xnhi,$sigma0[1],$t3 + xor $t2,$t0,$t0 + shd $Xnhi,$Xnlo,$sigma0[2],$t2 + xor $t3,$t1,$t1 + extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3 + xor $t2,$t0,$t0 + shd $a3,$a2,$sigma1[0],$a0 + xor $t3,$t1,$t1 ; sigma0(X[i+1)&0x0f]) + shd $a2,$a3,$sigma1[0],$a1 + add $t0,$Xlo,$Xlo + shd $a3,$a2,$sigma1[1],$t2 + addc $t1,$Xhi,$Xhi + shd $a2,$a3,$sigma1[1],$t3 + xor $t2,$a0,$a0 + shd $a3,$a2,$sigma1[2],$t2 + xor $t3,$a1,$a1 + extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3 + xor $t2,$a0,$a0 + xor $t3,$a1,$a1 ; sigma0(X[i+14)&0x0f]) + add $a0,$Xlo,$Xlo + addc $a1,$Xhi,$Xhi + + stw $Xhi,`-$XOFF+8*($i%16)`(%sp) + stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp) +___ +&ROUND_00_15_pa1($i,@_,1); +} +$code.=<<___; + ldw `0*4`($ctx),$Ahi ; load context + ldw `1*4`($ctx),$Alo + ldw `2*4`($ctx),$Bhi + ldw `3*4`($ctx),$Blo + ldw `4*4`($ctx),$Chi + ldw `5*4`($ctx),$Clo + ldw `6*4`($ctx),$Dhi + ldw `7*4`($ctx),$Dlo + ldw `8*4`($ctx),$Ehi + ldw `9*4`($ctx),$Elo + ldw `10*4`($ctx),$Fhi + ldw `11*4`($ctx),$Flo + ldw `12*4`($ctx),$Ghi + ldw `13*4`($ctx),$Glo + ldw `14*4`($ctx),$Hhi + ldw `15*4`($ctx),$Hlo + + extru $inp,31,2,$t0 + sh3addl $t0,%r0,$t0 + subi 32,$t0,$t0 + mtctl $t0,%cr11 ; load %sar with align factor + +L\$oop_pa1 + extru $inp,31,2,$a3 + comib,= 0,$a3,L\$aligned_pa1 + sub $inp,$a3,$inp + + ldw `0*4`($inp),$X[0] + ldw `1*4`($inp),$X[1] + ldw `2*4`($inp),$t2 + ldw `3*4`($inp),$t3 + ldw `4*4`($inp),$a0 + ldw `5*4`($inp),$a1 + ldw `6*4`($inp),$a2 + ldw `7*4`($inp),$a3 + vshd $X[0],$X[1],$X[0] + vshd $X[1],$t2,$X[1] + stw $X[0],`-$XOFF+0*4`(%sp) + ldw `8*4`($inp),$t0 + vshd $t2,$t3,$t2 + stw $X[1],`-$XOFF+1*4`(%sp) + ldw `9*4`($inp),$t1 + vshd $t3,$a0,$t3 +___ +{ +my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); +for ($i=2;$i<=(128/4-8);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + ldw `(8+$i)*4`($inp),$t[0] + vshd $t[1],$t[2],$t[1] +___ +push(@t,shift(@t)); +} +for (;$i<(128/4-1);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + vshd $t[1],$t[2],$t[1] +___ +push(@t,shift(@t)); +} +$code.=<<___; + b 
L\$collected_pa1 + stw $t[0],`-$XOFF+$i*4`(%sp) + +___ +} +$code.=<<___; +L\$aligned_pa1 + ldw `0*4`($inp),$X[0] + ldw `1*4`($inp),$X[1] + ldw `2*4`($inp),$t2 + ldw `3*4`($inp),$t3 + ldw `4*4`($inp),$a0 + ldw `5*4`($inp),$a1 + ldw `6*4`($inp),$a2 + ldw `7*4`($inp),$a3 + stw $X[0],`-$XOFF+0*4`(%sp) + ldw `8*4`($inp),$t0 + stw $X[1],`-$XOFF+1*4`(%sp) + ldw `9*4`($inp),$t1 +___ +{ +my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); +for ($i=2;$i<(128/4-8);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + ldw `(8+$i)*4`($inp),$t[0] +___ +push(@t,shift(@t)); +} +for (;$i<128/4;$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) +___ +push(@t,shift(@t)); +} +$code.="L\$collected_pa1\n"; +} + +for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } +$code.="L\$rounds_pa1\n"; +for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } + +$code.=<<___; + $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments + $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp + $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num + ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl + + ldw `0*4`($ctx),$t1 ; update context + ldw `1*4`($ctx),$t0 + ldw `2*4`($ctx),$t3 + ldw `3*4`($ctx),$t2 + ldw `4*4`($ctx),$a1 + ldw `5*4`($ctx),$a0 + ldw `6*4`($ctx),$a3 + add $t0,$Alo,$Alo + ldw `7*4`($ctx),$a2 + addc $t1,$Ahi,$Ahi + ldw `8*4`($ctx),$t1 + add $t2,$Blo,$Blo + ldw `9*4`($ctx),$t0 + addc $t3,$Bhi,$Bhi + ldw `10*4`($ctx),$t3 + add $a0,$Clo,$Clo + ldw `11*4`($ctx),$t2 + addc $a1,$Chi,$Chi + ldw `12*4`($ctx),$a1 + add $a2,$Dlo,$Dlo + ldw `13*4`($ctx),$a0 + addc $a3,$Dhi,$Dhi + ldw `14*4`($ctx),$a3 + add $t0,$Elo,$Elo + ldw `15*4`($ctx),$a2 + addc $t1,$Ehi,$Ehi + stw $Ahi,`0*4`($ctx) + add $t2,$Flo,$Flo + stw $Alo,`1*4`($ctx) + addc $t3,$Fhi,$Fhi + stw $Bhi,`2*4`($ctx) + add $a0,$Glo,$Glo + stw $Blo,`3*4`($ctx) + addc $a1,$Ghi,$Ghi + stw $Chi,`4*4`($ctx) + add $a2,$Hlo,$Hlo + stw $Clo,`5*4`($ctx) + addc $a3,$Hhi,$Hhi + stw $Dhi,`6*4`($ctx) + ldo `16*$SZ`($inp),$inp ; advance $inp + stw $Dlo,`7*4`($ctx) + stw $Ehi,`8*4`($ctx) + stw $Elo,`9*4`($ctx) + stw $Fhi,`10*4`($ctx) + stw $Flo,`11*4`($ctx) + stw $Ghi,`12*4`($ctx) + stw $Glo,`13*4`($ctx) + stw $Hhi,`14*4`($ctx) + comb,= $inp,$num,L\$done + stw $Hlo,`15*4`($ctx) + b L\$oop_pa1 + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp +L\$done +___ +}} +$code.=<<___; + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 + $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by " +___ + +# Explicitly encode PA-RISC 2.0 instructions used in this module, so +# that it can be compiled with .LEVEL 1.0. It should be noted that I +# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 +# directive... 
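The $ldd/$std/$extrd/$shrpd closures that follow pattern-match the textual operands and substitute a raw .WORD, which is what lets the module assemble at .LEVEL 1.0. As a stand-alone sketch of the same idea, the helper below (name and wrapper are mine; the field arithmetic is copied from the $ldd closure below, for a small non-negative displacement) packs a register-plus-displacement ldd into one 32-bit word:

    # Hypothetical helper mirroring the $ldd closure's packing for
    # "ldd disp(%rBASE),%rTARGET"; the returned .WORD line is emitted
    # in place of the mnemonic.
    sub raw_ldd_word {
        my ($disp, $base, $target) = @_;
        my $w = (0x14 << 26) | ($base << 21) | ($target << 16)
              | (($disp & 0x1FF8) << 1) | (($disp >> 13) & 1);
        sprintf ".WORD\t0x%08x", $w;
    }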
+ +my $ldd = sub { + my ($mod,$args) = @_; + my $orig = "ldd$mod\t$args"; + + if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices + { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1); + $opcode|=(1<<3) if ($mod =~ /^,m/); + $opcode|=(1<<2) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $std = sub { + my ($mod,$args) = @_; + my $orig = "std$mod\t$args"; + + if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices + { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $extrd = sub { + my ($mod,$args) = @_; + my $orig = "extrd$mod\t$args"; + + # I only have ",u" completer, it's implicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 + { my $opcode=(0x36<<26)|($1<<21)|($4<<16); + my $len=32-$3; + $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 + { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); + my $len=32-$2; + $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len + $opcode |= (1<<13) if ($mod =~ /,\**=/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $shrpd = sub { + my ($mod,$args) = @_; + my $orig = "shrpd$mod\t$args"; + + if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 + { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; + my $cpos=63-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 + { sprintf "\t.WORD\t0x%08x\t; %s", + (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; + } + else { "\t".$orig; } +}; + +sub assemble { + my ($mnemonic,$mod,$args)=@_; + my $opcode = eval("\$$mnemonic"); + + ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/ + $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32 + : sprintf("shd\t%$1,%$2,%d",$3)/e or + # translate made up instructons: _ror, _shr, _align, _shl + s/_ror(\s+)(%r[0-9]+),/ + ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or + + s/_shr(\s+%r[0-9]+),([0-9]+),/ + $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2) + : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or + + s/_align(\s+%r[0-9]+,%r[0-9]+),/ + ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or + + s/_shl(\s+%r[0-9]+),([0-9]+),/ + $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2) + : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e; + + s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4); + + s/cmpb,\*/comb,/ if ($SIZE_T==4); + + s/\bbv\b/bve/ if ($SIZE_T==8); + + print $_,"\n"; +} + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512-ppc.pl b/openssl-1.1.0h/crypto/sha/asm/sha512-ppc.pl new file mode 100755 index 0000000..fe95b01 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha512-ppc.pl @@ -0,0 +1,799 @@ +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# I let hardware handle unaligned input, except on page boundaries +# (see below for details). Otherwise straightforward implementation +# with X vector in register bank. + +# sha256 | sha512 +# -m64 -m32 | -m64 -m32 +# --------------------------------------+----------------------- +# PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*) +# Power6,xlc-7 +150% +90% | +100% +430%(*) +# +# (*) 64-bit code in 32-bit application context, which actually is +# on TODO list. It should be noted that for safe deployment in +# 32-bit *mutli-threaded* context asyncronous signals should be +# blocked upon entry to SHA512 block routine. This is because +# 32-bit signaling procedure invalidates upper halves of GPRs. +# Context switch procedure preserves them, but not signaling:-( + +# Second version is true multi-thread safe. Trouble with the original +# version was that it was using thread local storage pointer register. +# Well, it scrupulously preserved it, but the problem would arise the +# moment asynchronous signal was delivered and signal handler would +# dereference the TLS pointer. While it's never the case in openssl +# application or test suite, we have to respect this scenario and not +# use TLS pointer register. Alternative would be to require caller to +# block signals prior calling this routine. For the record, in 32-bit +# context R2 serves as TLS pointer, while in 64-bit context - R13. + +$flavour=shift; +$output =shift; + +if ($flavour =~ /64/) { + $SIZE_T=8; + $LRSAVE=2*$SIZE_T; + $STU="stdu"; + $UCMP="cmpld"; + $SHL="sldi"; + $POP="ld"; + $PUSH="std"; +} elsif ($flavour =~ /32/) { + $SIZE_T=4; + $LRSAVE=$SIZE_T; + $STU="stwu"; + $UCMP="cmplw"; + $SHL="slwi"; + $POP="lwz"; + $PUSH="stw"; +} else { die "nonsense $flavour"; } + +$LITTLE_ENDIAN = ($flavour=~/le$/) ? 
$SIZE_T : 0; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; + +if ($output =~ /512/) { + $func="sha512_block_ppc"; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $LD="ld"; + $ST="std"; + $ROR="rotrdi"; + $SHR="srdi"; +} else { + $func="sha256_block_ppc"; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $LD="lwz"; + $ST="stw"; + $ROR="rotrwi"; + $SHR="srwi"; +} + +$FRAME=32*$SIZE_T+16*$SZ; +$LOCALS=6*$SIZE_T; + +$sp ="r1"; +$toc="r2"; +$ctx="r3"; # zapped by $a0 +$inp="r4"; # zapped by $a1 +$num="r5"; # zapped by $t0 + +$T ="r0"; +$a0 ="r3"; +$a1 ="r4"; +$t0 ="r5"; +$t1 ="r6"; +$Tbl="r7"; + +$A ="r8"; +$B ="r9"; +$C ="r10"; +$D ="r11"; +$E ="r12"; +$F =$t1; $t1 = "r0"; # stay away from "r13"; +$G ="r14"; +$H ="r15"; + +@V=($A,$B,$C,$D,$E,$F,$G,$H); +@X=("r16","r17","r18","r19","r20","r21","r22","r23", + "r24","r25","r26","r27","r28","r29","r30","r31"); + +$inp="r31" if($SZ==4 || $SIZE_T==8); # reassigned $inp! aliases with @X[15] + +sub ROUND_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +$code.=<<___; + $ROR $a0,$e,$Sigma1[0] + $ROR $a1,$e,$Sigma1[1] + and $t0,$f,$e + xor $a0,$a0,$a1 + add $h,$h,$t1 + andc $t1,$g,$e + $ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]` + or $t0,$t0,$t1 ; Ch(e,f,g) + add $h,$h,@X[$i%16] + xor $a0,$a0,$a1 ; Sigma1(e) + add $h,$h,$t0 + add $h,$h,$a0 + + $ROR $a0,$a,$Sigma0[0] + $ROR $a1,$a,$Sigma0[1] + and $t0,$a,$b + and $t1,$a,$c + xor $a0,$a0,$a1 + $ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]` + xor $t0,$t0,$t1 + and $t1,$b,$c + xor $a0,$a0,$a1 ; Sigma0(a) + add $d,$d,$h + xor $t0,$t0,$t1 ; Maj(a,b,c) +___ +$code.=<<___ if ($i<15); + $LD $t1,`($i+1)*$SZ`($Tbl) +___ +$code.=<<___; + add $h,$h,$a0 + add $h,$h,$t0 + +___ +} + +sub ROUND_16_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +$i-=16; +$code.=<<___; + $ROR $a0,@X[($i+1)%16],$sigma0[0] + $ROR $a1,@X[($i+1)%16],$sigma0[1] + $ROR $t0,@X[($i+14)%16],$sigma1[0] + $ROR $t1,@X[($i+14)%16],$sigma1[1] + xor $a0,$a0,$a1 + $SHR $a1,@X[($i+1)%16],$sigma0[2] + xor $t0,$t0,$t1 + $SHR $t1,@X[($i+14)%16],$sigma1[2] + add @X[$i],@X[$i],@X[($i+9)%16] + xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f]) + xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f]) + $LD $t1,`$i*$SZ`($Tbl) + add @X[$i],@X[$i],$a0 + add @X[$i],@X[$i],$t0 +___ +&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h); +} + +$code=<<___; +.machine "any" +.text + +.globl $func +.align 6 +$func: + $STU $sp,-$FRAME($sp) + mflr r0 + $SHL $num,$num,`log(16*$SZ)/log(2)` + + $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp) + + $PUSH r14,`$FRAME-$SIZE_T*18`($sp) + $PUSH r15,`$FRAME-$SIZE_T*17`($sp) + $PUSH r16,`$FRAME-$SIZE_T*16`($sp) + $PUSH r17,`$FRAME-$SIZE_T*15`($sp) + $PUSH r18,`$FRAME-$SIZE_T*14`($sp) + $PUSH r19,`$FRAME-$SIZE_T*13`($sp) + $PUSH r20,`$FRAME-$SIZE_T*12`($sp) + $PUSH r21,`$FRAME-$SIZE_T*11`($sp) + $PUSH r22,`$FRAME-$SIZE_T*10`($sp) + $PUSH r23,`$FRAME-$SIZE_T*9`($sp) + $PUSH r24,`$FRAME-$SIZE_T*8`($sp) + $PUSH r25,`$FRAME-$SIZE_T*7`($sp) + $PUSH r26,`$FRAME-$SIZE_T*6`($sp) + $PUSH r27,`$FRAME-$SIZE_T*5`($sp) + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) +___ + +if ($SZ==4 || $SIZE_T==8) { +$code.=<<___; + $LD $A,`0*$SZ`($ctx) + mr $inp,r4 
; incarnate $inp + $LD $B,`1*$SZ`($ctx) + $LD $C,`2*$SZ`($ctx) + $LD $D,`3*$SZ`($ctx) + $LD $E,`4*$SZ`($ctx) + $LD $F,`5*$SZ`($ctx) + $LD $G,`6*$SZ`($ctx) + $LD $H,`7*$SZ`($ctx) +___ +} else { + for ($i=16;$i<32;$i++) { + $code.=<<___; + lwz r$i,`$LITTLE_ENDIAN^(4*($i-16))`($ctx) +___ + } +} + +$code.=<<___; + bl LPICmeup +LPICedup: + andi. r0,$inp,3 + bne Lunaligned +Laligned: + add $num,$inp,$num + $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer + $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer + bl Lsha2_block_private + b Ldone + +; PowerPC specification allows an implementation to be ill-behaved +; upon unaligned access which crosses page boundary. "Better safe +; than sorry" principle makes me treat it specially. But I don't +; look for particular offending word, but rather for the input +; block which crosses the boundary. Once found that block is aligned +; and hashed separately... +.align 4 +Lunaligned: + subfic $t1,$inp,4096 + andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary + beq Lcross_page + $UCMP $num,$t1 + ble Laligned ; didn't cross the page boundary + subfc $num,$t1,$num + add $t1,$inp,$t1 + $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num + $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer + $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer + bl Lsha2_block_private + ; $inp equals to the intermediate end pointer here + $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num +Lcross_page: + li $t1,`16*$SZ/4` + mtctr $t1 +___ +if ($SZ==4 || $SIZE_T==8) { +$code.=<<___; + addi r20,$sp,$LOCALS ; aligned spot below the frame +Lmemcpy: + lbz r16,0($inp) + lbz r17,1($inp) + lbz r18,2($inp) + lbz r19,3($inp) + addi $inp,$inp,4 + stb r16,0(r20) + stb r17,1(r20) + stb r18,2(r20) + stb r19,3(r20) + addi r20,r20,4 + bdnz Lmemcpy +___ +} else { +$code.=<<___; + addi r12,$sp,$LOCALS ; aligned spot below the frame +Lmemcpy: + lbz r8,0($inp) + lbz r9,1($inp) + lbz r10,2($inp) + lbz r11,3($inp) + addi $inp,$inp,4 + stb r8,0(r12) + stb r9,1(r12) + stb r10,2(r12) + stb r11,3(r12) + addi r12,r12,4 + bdnz Lmemcpy +___ +} + +$code.=<<___; + $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp + addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer + addi $inp,$sp,$LOCALS ; fictitious inp pointer + $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num + $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer + $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer + bl Lsha2_block_private + $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp + $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num + addic. 
$num,$num,`-16*$SZ` ; num-- + bne Lunaligned + +Ldone: + $POP r0,`$FRAME+$LRSAVE`($sp) + $POP r14,`$FRAME-$SIZE_T*18`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 +___ + +if ($SZ==4 || $SIZE_T==8) { +$code.=<<___; +.align 4 +Lsha2_block_private: + $LD $t1,0($Tbl) +___ +for($i=0;$i<16;$i++) { +$code.=<<___ if ($SZ==4 && !$LITTLE_ENDIAN); + lwz @X[$i],`$i*$SZ`($inp) +___ +$code.=<<___ if ($SZ==4 && $LITTLE_ENDIAN); + lwz $a0,`$i*$SZ`($inp) + rotlwi @X[$i],$a0,8 + rlwimi @X[$i],$a0,24,0,7 + rlwimi @X[$i],$a0,24,16,23 +___ +# 64-bit loads are split to 2x32-bit ones, as CPU can't handle +# unaligned 64-bit loads, only 32-bit ones... +$code.=<<___ if ($SZ==8 && !$LITTLE_ENDIAN); + lwz $t0,`$i*$SZ`($inp) + lwz @X[$i],`$i*$SZ+4`($inp) + insrdi @X[$i],$t0,32,0 +___ +$code.=<<___ if ($SZ==8 && $LITTLE_ENDIAN); + lwz $a0,`$i*$SZ`($inp) + lwz $a1,`$i*$SZ+4`($inp) + rotlwi $t0,$a0,8 + rotlwi @X[$i],$a1,8 + rlwimi $t0,$a0,24,0,7 + rlwimi @X[$i],$a1,24,0,7 + rlwimi $t0,$a0,24,16,23 + rlwimi @X[$i],$a1,24,16,23 + insrdi @X[$i],$t0,32,0 +___ + &ROUND_00_15($i,@V); + unshift(@V,pop(@V)); +} +$code.=<<___; + li $t0,`$rounds/16-1` + mtctr $t0 +.align 4 +Lrounds: + addi $Tbl,$Tbl,`16*$SZ` +___ +for(;$i<32;$i++) { + &ROUND_16_xx($i,@V); + unshift(@V,pop(@V)); +} +$code.=<<___; + bdnz Lrounds + + $POP $ctx,`$FRAME-$SIZE_T*22`($sp) + $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer + $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer + subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl + + $LD r16,`0*$SZ`($ctx) + $LD r17,`1*$SZ`($ctx) + $LD r18,`2*$SZ`($ctx) + $LD r19,`3*$SZ`($ctx) + $LD r20,`4*$SZ`($ctx) + $LD r21,`5*$SZ`($ctx) + $LD r22,`6*$SZ`($ctx) + addi $inp,$inp,`16*$SZ` ; advance inp + $LD r23,`7*$SZ`($ctx) + add $A,$A,r16 + add $B,$B,r17 + $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) + add $C,$C,r18 + $ST $A,`0*$SZ`($ctx) + add $D,$D,r19 + $ST $B,`1*$SZ`($ctx) + add $E,$E,r20 + $ST $C,`2*$SZ`($ctx) + add $F,$F,r21 + $ST $D,`3*$SZ`($ctx) + add $G,$G,r22 + $ST $E,`4*$SZ`($ctx) + add $H,$H,r23 + $ST $F,`5*$SZ`($ctx) + $ST $G,`6*$SZ`($ctx) + $UCMP $inp,$num + $ST $H,`7*$SZ`($ctx) + bne Lsha2_block_private + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.size $func,.-$func +___ +} else { +######################################################################## +# SHA512 for PPC32, X vector is off-loaded to stack... 
+# +# | sha512 +# | -m32 +# ----------------------+----------------------- +# PPC74x0,gcc-4.0.1 | +48% +# POWER6,gcc-4.4.6 | +124%(*) +# POWER7,gcc-4.4.6 | +79%(*) +# e300,gcc-4.1.0 | +167% +# +# (*) ~1/3 of -m64 result [and ~20% better than -m32 code generated +# by xlc-12.1] + +my $XOFF=$LOCALS; + +my @V=map("r$_",(16..31)); # A..H + +my ($s0,$s1,$t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("r$_",(0,5,6,8..12,14,15)); +my ($x0,$x1)=("r3","r4"); # zaps $ctx and $inp + +sub ROUND_00_15_ppc32 { +my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo, + $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_; + +$code.=<<___; + lwz $t2,`$SZ*($i%16)+($LITTLE_ENDIAN^4)`($Tbl) + xor $a0,$flo,$glo + lwz $t3,`$SZ*($i%16)+($LITTLE_ENDIAN^0)`($Tbl) + xor $a1,$fhi,$ghi + addc $hlo,$hlo,$t0 ; h+=x[i] + stw $t0,`$XOFF+0+$SZ*($i%16)`($sp) ; save x[i] + + srwi $s0,$elo,$Sigma1[0] + srwi $s1,$ehi,$Sigma1[0] + and $a0,$a0,$elo + adde $hhi,$hhi,$t1 + and $a1,$a1,$ehi + stw $t1,`$XOFF+4+$SZ*($i%16)`($sp) + srwi $t0,$elo,$Sigma1[1] + srwi $t1,$ehi,$Sigma1[1] + addc $hlo,$hlo,$t2 ; h+=K512[i] + insrwi $s0,$ehi,$Sigma1[0],0 + insrwi $s1,$elo,$Sigma1[0],0 + xor $a0,$a0,$glo ; Ch(e,f,g) + adde $hhi,$hhi,$t3 + xor $a1,$a1,$ghi + insrwi $t0,$ehi,$Sigma1[1],0 + insrwi $t1,$elo,$Sigma1[1],0 + addc $hlo,$hlo,$a0 ; h+=Ch(e,f,g) + srwi $t2,$ehi,$Sigma1[2]-32 + srwi $t3,$elo,$Sigma1[2]-32 + xor $s0,$s0,$t0 + xor $s1,$s1,$t1 + insrwi $t2,$elo,$Sigma1[2]-32,0 + insrwi $t3,$ehi,$Sigma1[2]-32,0 + xor $a0,$alo,$blo ; a^b, b^c in next round + adde $hhi,$hhi,$a1 + xor $a1,$ahi,$bhi + xor $s0,$s0,$t2 ; Sigma1(e) + xor $s1,$s1,$t3 + + srwi $t0,$alo,$Sigma0[0] + and $a2,$a2,$a0 + addc $hlo,$hlo,$s0 ; h+=Sigma1(e) + and $a3,$a3,$a1 + srwi $t1,$ahi,$Sigma0[0] + srwi $s0,$ahi,$Sigma0[1]-32 + adde $hhi,$hhi,$s1 + srwi $s1,$alo,$Sigma0[1]-32 + insrwi $t0,$ahi,$Sigma0[0],0 + insrwi $t1,$alo,$Sigma0[0],0 + xor $a2,$a2,$blo ; Maj(a,b,c) + addc $dlo,$dlo,$hlo ; d+=h + xor $a3,$a3,$bhi + insrwi $s0,$alo,$Sigma0[1]-32,0 + insrwi $s1,$ahi,$Sigma0[1]-32,0 + adde $dhi,$dhi,$hhi + srwi $t2,$ahi,$Sigma0[2]-32 + srwi $t3,$alo,$Sigma0[2]-32 + xor $s0,$s0,$t0 + addc $hlo,$hlo,$a2 ; h+=Maj(a,b,c) + xor $s1,$s1,$t1 + insrwi $t2,$alo,$Sigma0[2]-32,0 + insrwi $t3,$ahi,$Sigma0[2]-32,0 + adde $hhi,$hhi,$a3 +___ +$code.=<<___ if ($i>=15); + lwz $t0,`$XOFF+0+$SZ*(($i+2)%16)`($sp) + lwz $t1,`$XOFF+4+$SZ*(($i+2)%16)`($sp) +___ +$code.=<<___ if ($i<15 && !$LITTLE_ENDIAN); + lwz $t1,`$SZ*($i+1)+0`($inp) + lwz $t0,`$SZ*($i+1)+4`($inp) +___ +$code.=<<___ if ($i<15 && $LITTLE_ENDIAN); + lwz $a2,`$SZ*($i+1)+0`($inp) + lwz $a3,`$SZ*($i+1)+4`($inp) + rotlwi $t1,$a2,8 + rotlwi $t0,$a3,8 + rlwimi $t1,$a2,24,0,7 + rlwimi $t0,$a3,24,0,7 + rlwimi $t1,$a2,24,16,23 + rlwimi $t0,$a3,24,16,23 +___ +$code.=<<___; + xor $s0,$s0,$t2 ; Sigma0(a) + xor $s1,$s1,$t3 + addc $hlo,$hlo,$s0 ; h+=Sigma0(a) + adde $hhi,$hhi,$s1 +___ +$code.=<<___ if ($i==15); + lwz $x0,`$XOFF+0+$SZ*(($i+1)%16)`($sp) + lwz $x1,`$XOFF+4+$SZ*(($i+1)%16)`($sp) +___ +} +sub ROUND_16_xx_ppc32 { +my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo, + $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_; + +$code.=<<___; + srwi $s0,$t0,$sigma0[0] + srwi $s1,$t1,$sigma0[0] + srwi $t2,$t0,$sigma0[1] + srwi $t3,$t1,$sigma0[1] + insrwi $s0,$t1,$sigma0[0],0 + insrwi $s1,$t0,$sigma0[0],0 + srwi $a0,$t0,$sigma0[2] + insrwi $t2,$t1,$sigma0[1],0 + insrwi $t3,$t0,$sigma0[1],0 + insrwi $a0,$t1,$sigma0[2],0 + xor $s0,$s0,$t2 + lwz $t2,`$XOFF+0+$SZ*(($i+14)%16)`($sp) + srwi $a1,$t1,$sigma0[2] + xor $s1,$s1,$t3 + lwz $t3,`$XOFF+4+$SZ*(($i+14)%16)`($sp) + xor 
$a0,$a0,$s0 + srwi $s0,$t2,$sigma1[0] + xor $a1,$a1,$s1 + srwi $s1,$t3,$sigma1[0] + addc $x0,$x0,$a0 ; x[i]+=sigma0(x[i+1]) + srwi $a0,$t3,$sigma1[1]-32 + insrwi $s0,$t3,$sigma1[0],0 + insrwi $s1,$t2,$sigma1[0],0 + adde $x1,$x1,$a1 + srwi $a1,$t2,$sigma1[1]-32 + + insrwi $a0,$t2,$sigma1[1]-32,0 + srwi $t2,$t2,$sigma1[2] + insrwi $a1,$t3,$sigma1[1]-32,0 + insrwi $t2,$t3,$sigma1[2],0 + xor $s0,$s0,$a0 + lwz $a0,`$XOFF+0+$SZ*(($i+9)%16)`($sp) + srwi $t3,$t3,$sigma1[2] + xor $s1,$s1,$a1 + lwz $a1,`$XOFF+4+$SZ*(($i+9)%16)`($sp) + xor $s0,$s0,$t2 + addc $x0,$x0,$a0 ; x[i]+=x[i+9] + xor $s1,$s1,$t3 + adde $x1,$x1,$a1 + addc $x0,$x0,$s0 ; x[i]+=sigma1(x[i+14]) + adde $x1,$x1,$s1 +___ + ($t0,$t1,$x0,$x1) = ($x0,$x1,$t0,$t1); + &ROUND_00_15_ppc32(@_); +} + +$code.=<<___; +.align 4 +Lsha2_block_private: +___ +$code.=<<___ if (!$LITTLE_ENDIAN); + lwz $t1,0($inp) + xor $a2,@V[3],@V[5] ; B^C, magic seed + lwz $t0,4($inp) + xor $a3,@V[2],@V[4] +___ +$code.=<<___ if ($LITTLE_ENDIAN); + lwz $a1,0($inp) + xor $a2,@V[3],@V[5] ; B^C, magic seed + lwz $a0,4($inp) + xor $a3,@V[2],@V[4] + rotlwi $t1,$a1,8 + rotlwi $t0,$a0,8 + rlwimi $t1,$a1,24,0,7 + rlwimi $t0,$a0,24,0,7 + rlwimi $t1,$a1,24,16,23 + rlwimi $t0,$a0,24,16,23 +___ +for($i=0;$i<16;$i++) { + &ROUND_00_15_ppc32($i,@V); + unshift(@V,pop(@V)); unshift(@V,pop(@V)); + ($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1); +} +$code.=<<___; + li $a0,`$rounds/16-1` + mtctr $a0 +.align 4 +Lrounds: + addi $Tbl,$Tbl,`16*$SZ` +___ +for(;$i<32;$i++) { + &ROUND_16_xx_ppc32($i,@V); + unshift(@V,pop(@V)); unshift(@V,pop(@V)); + ($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1); +} +$code.=<<___; + bdnz Lrounds + + $POP $ctx,`$FRAME-$SIZE_T*22`($sp) + $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer + $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer + subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl + + lwz $t0,`$LITTLE_ENDIAN^0`($ctx) + lwz $t1,`$LITTLE_ENDIAN^4`($ctx) + lwz $t2,`$LITTLE_ENDIAN^8`($ctx) + lwz $t3,`$LITTLE_ENDIAN^12`($ctx) + lwz $a0,`$LITTLE_ENDIAN^16`($ctx) + lwz $a1,`$LITTLE_ENDIAN^20`($ctx) + lwz $a2,`$LITTLE_ENDIAN^24`($ctx) + addc @V[1],@V[1],$t1 + lwz $a3,`$LITTLE_ENDIAN^28`($ctx) + adde @V[0],@V[0],$t0 + lwz $t0,`$LITTLE_ENDIAN^32`($ctx) + addc @V[3],@V[3],$t3 + lwz $t1,`$LITTLE_ENDIAN^36`($ctx) + adde @V[2],@V[2],$t2 + lwz $t2,`$LITTLE_ENDIAN^40`($ctx) + addc @V[5],@V[5],$a1 + lwz $t3,`$LITTLE_ENDIAN^44`($ctx) + adde @V[4],@V[4],$a0 + lwz $a0,`$LITTLE_ENDIAN^48`($ctx) + addc @V[7],@V[7],$a3 + lwz $a1,`$LITTLE_ENDIAN^52`($ctx) + adde @V[6],@V[6],$a2 + lwz $a2,`$LITTLE_ENDIAN^56`($ctx) + addc @V[9],@V[9],$t1 + lwz $a3,`$LITTLE_ENDIAN^60`($ctx) + adde @V[8],@V[8],$t0 + stw @V[0],`$LITTLE_ENDIAN^0`($ctx) + stw @V[1],`$LITTLE_ENDIAN^4`($ctx) + addc @V[11],@V[11],$t3 + stw @V[2],`$LITTLE_ENDIAN^8`($ctx) + stw @V[3],`$LITTLE_ENDIAN^12`($ctx) + adde @V[10],@V[10],$t2 + stw @V[4],`$LITTLE_ENDIAN^16`($ctx) + stw @V[5],`$LITTLE_ENDIAN^20`($ctx) + addc @V[13],@V[13],$a1 + stw @V[6],`$LITTLE_ENDIAN^24`($ctx) + stw @V[7],`$LITTLE_ENDIAN^28`($ctx) + adde @V[12],@V[12],$a0 + stw @V[8],`$LITTLE_ENDIAN^32`($ctx) + stw @V[9],`$LITTLE_ENDIAN^36`($ctx) + addc @V[15],@V[15],$a3 + stw @V[10],`$LITTLE_ENDIAN^40`($ctx) + stw @V[11],`$LITTLE_ENDIAN^44`($ctx) + adde @V[14],@V[14],$a2 + stw @V[12],`$LITTLE_ENDIAN^48`($ctx) + stw @V[13],`$LITTLE_ENDIAN^52`($ctx) + stw @V[14],`$LITTLE_ENDIAN^56`($ctx) + stw @V[15],`$LITTLE_ENDIAN^60`($ctx) + + addi $inp,$inp,`16*$SZ` ; advance inp + $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) + $UCMP $inp,$num + bne Lsha2_block_private + blr + .long 0 + .byte 
0,12,0x14,0,0,0,0,0 +.size $func,.-$func +___ +} + +# Ugly hack here, because PPC assembler syntax seem to vary too +# much from platforms to platform... +$code.=<<___; +.align 6 +LPICmeup: + mflr r0 + bcl 20,31,\$+4 + mflr $Tbl ; vvvvvv "distance" between . and 1st data entry + addi $Tbl,$Tbl,`64-8` + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512-s390x.pl b/openssl-1.1.0h/crypto/sha/asm/sha512-s390x.pl new file mode 100644 index 0000000..427d6f8 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha512-s390x.pl @@ -0,0 +1,322 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. 
+# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA256/512 block procedures for s390x. + +# April 2007. +# +# sha256_block_data_order is reportedly >3 times faster than gcc 3.3 +# generated code (must be a bug in compiler, as improvement is +# "pathologically" high, in particular in comparison to other SHA +# modules). But the real twist is that it detects if hardware support +# for SHA256 is available and in such case utilizes it. Then the +# performance can reach >6.5x of assembler one for larger chunks. +# +# sha512_block_data_order is ~70% faster than gcc 3.3 generated code. + +# January 2009. +# +# Add support for hardware SHA512 and reschedule instructions to +# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster +# than software. + +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 SHA256 was measured to +# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3. 
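For orientation, BODY_00_15 below, like its counterparts in the other ports, is one FIPS 180-4 compression round: T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i], then h becomes Sigma0(a) + Maj(a,b,c) + T1 while d gains T1, and the eight-word state rotates. A minimal plain-Perl sketch of the SHA-256 case, with helper names and explicit 32-bit masking of my own (assumes a perl built with 64-bit integers):

    sub rotr32 { my ($x, $n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }
    sub Sigma0 { my $x = shift; rotr32($x, 2) ^ rotr32($x, 13) ^ rotr32($x, 22) }
    sub Sigma1 { my $x = shift; rotr32($x, 6) ^ rotr32($x, 11) ^ rotr32($x, 25) }
    sub Ch     { my ($e, $f, $g) = @_; (($e & $f) ^ (~$e & $g)) & 0xffffffff }
    sub Maj    { my ($a, $b, $c) = @_; ($a & $b) ^ ($a & $c) ^ ($b & $c) }

    sub round256 {                       # one round over state (a..h)
        my ($K, $X, $a, $b, $c, $d, $e, $f, $g, $h) = @_;
        my $T1 = ($h + Sigma1($e) + Ch($e, $f, $g) + $K + $X) & 0xffffffff;
        my $T2 = (Sigma0($a) + Maj($a, $b, $c)) & 0xffffffff;
        # rotate the tuple: a' = T1+T2, e' = d+T1, the rest shifts down one slot
        return ((($T1 + $T2) & 0xffffffff), $a, $b, $c,
                (($d + $T1) & 0xffffffff), $e, $f, $g);
    }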
+ +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +$t0="%r0"; +$t1="%r1"; +$ctx="%r2"; $t2="%r2"; +$inp="%r3"; +$len="%r4"; # used as index in inner loop + +$A="%r5"; +$B="%r6"; +$C="%r7"; +$D="%r8"; +$E="%r9"; +$F="%r10"; +$G="%r11"; +$H="%r12"; @V=($A,$B,$C,$D,$E,$F,$G,$H); +$tbl="%r13"; +$T1="%r14"; +$sp="%r15"; + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +if ($output =~ /512/) { + $label="512"; + $SZ=8; + $LD="lg"; # load from memory + $ST="stg"; # store to memory + $ADD="alg"; # add with memory operand + $ROT="rllg"; # rotate left + $SHR="srlg"; # logical right shift [see even at the end] + @Sigma0=(25,30,36); + @Sigma1=(23,46,50); + @sigma0=(56,63, 7); + @sigma1=( 3,45, 6); + $rounds=80; + $kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled +} else { + $label="256"; + $SZ=4; + $LD="llgf"; # load from memory + $ST="st"; # store to memory + $ADD="al"; # add with memory operand + $ROT="rll"; # rotate left + $SHR="srl"; # logical right shift + @Sigma0=(10,19,30); + @Sigma1=( 7,21,26); + @sigma0=(14,25, 3); + @sigma1=(13,15,10); + $rounds=64; + $kimdfunc=2; # magic function code for kimd instruction +} +$Func="sha${label}_block_data_order"; +$Table="K${label}"; +$stdframe=16*$SIZE_T+4*8; +$frame=$stdframe+16*$SZ; + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___ if ($i<16); + $LD $T1,`$i*$SZ`($inp) ### $i +___ +$code.=<<___; + $ROT $t0,$e,$Sigma1[0] + $ROT $t1,$e,$Sigma1[1] + lgr $t2,$f + xgr $t0,$t1 + $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` + xgr $t2,$g + $ST $T1,`$stdframe+$SZ*($i%16)`($sp) + xgr $t0,$t1 # Sigma1(e) + algr $T1,$h # T1+=h + ngr $t2,$e + lgr $t1,$a + algr $T1,$t0 # T1+=Sigma1(e) + $ROT $h,$a,$Sigma0[0] + xgr $t2,$g # Ch(e,f,g) + $ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i] + $ROT $t0,$a,$Sigma0[1] + algr $T1,$t2 # T1+=Ch(e,f,g) + ogr $t1,$b + xgr $h,$t0 + lgr $t2,$a + ngr $t1,$c + $ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]` + xgr $h,$t0 # h=Sigma0(a) + ngr $t2,$b + algr $h,$T1 # h+=T1 + ogr $t2,$t1 # Maj(a,b,c) + algr $d,$T1 # d+=T1 + algr $h,$t2 # h+=Maj(a,b,c) +___ +} + +sub BODY_16_XX { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i + $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp) + $ROT $t0,$T1,$sigma0[0] + $SHR $T1,$sigma0[2] + $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]` + xgr $T1,$t0 + $ROT $t0,$t1,$sigma1[0] + xgr $T1,$t2 # sigma0(X[i+1]) + $SHR $t1,$sigma1[2] + $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i] + xgr $t1,$t0 + $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` + $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9] + xgr $t1,$t0 # sigma1(X[i+14]) + algr $T1,$t1 # +=sigma1(X[i+14]) +___ + &BODY_00_15(@_); +} + +$code.=<<___; +.text +.align 64 +.type $Table,\@object +$Table: +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 
0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +___ +$code.=<<___; +.size $Table,.-$Table +.globl $Func +.type $Func,\@function +$Func: + sllg $len,$len,`log(16*$SZ)/log(2)` +___ +$code.=<<___ if ($kimdfunc); + larl %r1,OPENSSL_s390xcap_P + lg %r0,16(%r1) # check kimd capabilities + tmhh %r0,`0x8000>>$kimdfunc` + jz .Lsoftware + lghi %r0,$kimdfunc + lgr %r1,$ctx + lgr %r2,$inp + lgr %r3,$len + .long 0xb93e0002 # kimd %r0,%r2 + brc 1,.-4 # pay attention to "partial completion" + br %r14 +.align 16 +.Lsoftware: +___ +$code.=<<___; + lghi %r1,-$frame + la $len,0($len,$inp) + stm${g} $ctx,%r15,`2*$SIZE_T`($sp) + lgr %r0,$sp + la $sp,0(%r1,$sp) + st${g} %r0,0($sp) + + larl $tbl,$Table + $LD $A,`0*$SZ`($ctx) + $LD $B,`1*$SZ`($ctx) + $LD $C,`2*$SZ`($ctx) + $LD $D,`3*$SZ`($ctx) + $LD $E,`4*$SZ`($ctx) + $LD $F,`5*$SZ`($ctx) + $LD $G,`6*$SZ`($ctx) + $LD $H,`7*$SZ`($ctx) + +.Lloop: + lghi $len,0 +___ +for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=".Lrounds_16_xx:\n"; +for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + aghi $len,`16*$SZ` + lghi $t0,`($rounds-16)*$SZ` + clgr $len,$t0 + jne .Lrounds_16_xx + + l${g} $ctx,`$frame+2*$SIZE_T`($sp) + la $inp,`16*$SZ`($inp) + $ADD $A,`0*$SZ`($ctx) + $ADD $B,`1*$SZ`($ctx) + $ADD $C,`2*$SZ`($ctx) + $ADD $D,`3*$SZ`($ctx) + $ADD $E,`4*$SZ`($ctx) + $ADD $F,`5*$SZ`($ctx) + $ADD $G,`6*$SZ`($ctx) + $ADD $H,`7*$SZ`($ctx) + $ST $A,`0*$SZ`($ctx) + $ST $B,`1*$SZ`($ctx) + $ST $C,`2*$SZ`($ctx) + $ST $D,`3*$SZ`($ctx) + $ST $E,`4*$SZ`($ctx) + $ST $F,`5*$SZ`($ctx) + $ST 
$G,`6*$SZ`($ctx) + $ST $H,`7*$SZ`($ctx) + cl${g} $inp,`$frame+4*$SIZE_T`($sp) + jne .Lloop + + lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) + br %r14 +.size $Func,.-$Func +.string "SHA${label} block transform for s390x, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +# unlike 32-bit shift 64-bit one takes three arguments +$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm; + +print $code; +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512-sparcv9.pl b/openssl-1.1.0h/crypto/sha/asm/sha512-sparcv9.pl new file mode 100644 index 0000000..4a1ce5f --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha512-sparcv9.pl @@ -0,0 +1,857 @@ +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# +# Hardware SPARC T4 support by David S. Miller . +# ==================================================================== + +# SHA256 performance improvement over compiler generated code varies +# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit +# build]. Just like in SHA1 module I aim to ensure scalability on +# UltraSPARC T1 by packing X[16] to 8 64-bit registers. + +# SHA512 on pre-T1 UltraSPARC. +# +# Performance is >75% better than 64-bit code generated by Sun C and +# over 2x than 32-bit code. X[16] resides on stack, but access to it +# is scheduled for L2 latency and staged through 32 least significant +# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI +# duality. Nevetheless it's ~40% faster than SHA256, which is pretty +# good [optimal coefficient is 50%]. +# +# SHA512 on UltraSPARC T1. +# +# It's not any faster than 64-bit code generated by Sun C 5.8. This is +# because 64-bit code generator has the advantage of using 64-bit +# loads(*) to access X[16], which I consciously traded for 32-/64-bit +# ABI duality [as per above]. But it surpasses 32-bit Sun C generated +# code by 60%, not to mention that it doesn't suffer from severe decay +# when running 4 times physical cores threads and that it leaves gcc +# [3.4] behind by over 4x factor! If compared to SHA256, single thread +# performance is only 10% better, but overall throughput for maximum +# amount of threads for given CPU exceeds corresponding one of SHA256 +# by 30% [again, optimal coefficient is 50%]. +# +# (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly +# in-order, i.e. load instruction has to complete prior next +# instruction in given thread is executed, even if the latter is +# not dependent on load result! This means that on T1 two 32-bit +# loads are always slower than one 64-bit load. Once again this +# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately, +# 2x32-bit loads can be as fast as 1x64-bit ones. +# +# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte, +# which is 9.3x/11.1x faster than software. Multi-process benchmark +# saturates at 11.5x single-process result on 8-core processor, or +# ~11/16GBps per 2.85GHz socket. 
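The 32-/64-bit ABI duality described above comes down to keeping two 32-bit quantities in one 64-bit register and extracting whichever half a given round needs. A minimal, hypothetical Perl sketch of that packing (not part of the generated module; it mirrors what the sllx/srlx/or sequences in the SHA256 path below do, with even-indexed words in the upper half and odd-indexed words in the lower half, and it assumes a Perl built with 64-bit integers):

use strict;
use warnings;

# pack X[2j] (even index) into the upper 32 bits, X[2j+1] into the lower 32
sub pack_pair {
    my ($even, $odd) = @_;
    return (($even & 0xffffffff) << 32) | ($odd & 0xffffffff);
}

# recover either word; the assembly uses "srlx reg,32,tmp" for the even one
# and works on the low 32 bits directly for the odd one
sub unpack_pair {
    my ($pair) = @_;
    return (($pair >> 32) & 0xffffffff, $pair & 0xffffffff);
}

my $pair = pack_pair(0x6a09e667, 0xbb67ae85);
my ($even, $odd) = unpack_pair($pair);
printf "packed=%016x even=%08x odd=%08x\n", $pair, $even, $odd;

Packing halves the number of live registers the message schedule needs, which is what lets X[16] stay register-resident in the SHA256 case while remaining usable under a 32-bit ABI.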
+ +$output=pop; +open STDOUT,">$output"; + +if ($output =~ /512/) { + $label="512"; + $SZ=8; + $LD="ldx"; # load from memory + $ST="stx"; # store to memory + $SLL="sllx"; # shift left logical + $SRL="srlx"; # shift right logical + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=( 7, 1, 8); # right shift first + @sigma1=( 6,19,61); # right shift first + $lastK=0x817; + $rounds=80; + $align=4; + + $locals=16*$SZ; # X[16] + + $A="%o0"; + $B="%o1"; + $C="%o2"; + $D="%o3"; + $E="%o4"; + $F="%o5"; + $G="%g1"; + $H="%o7"; + @V=($A,$B,$C,$D,$E,$F,$G,$H); +} else { + $label="256"; + $SZ=4; + $LD="ld"; # load from memory + $ST="st"; # store to memory + $SLL="sll"; # shift left logical + $SRL="srl"; # shift right logical + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 3, 7,18); # right shift first + @sigma1=(10,17,19); # right shift first + $lastK=0x8f2; + $rounds=64; + $align=8; + + $locals=0; # X[16] is register resident + @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7"); + + $A="%l0"; + $B="%l1"; + $C="%l2"; + $D="%l3"; + $E="%l4"; + $F="%l5"; + $G="%l6"; + $H="%l7"; + @V=($A,$B,$C,$D,$E,$F,$G,$H); +} +$T1="%g2"; +$tmp0="%g3"; +$tmp1="%g4"; +$tmp2="%g5"; + +$ctx="%i0"; +$inp="%i1"; +$len="%i2"; +$Ktbl="%i3"; +$tmp31="%i4"; +$tmp32="%i5"; + +########### SHA256 +$Xload = sub { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; + + if ($i==0) { +$code.=<<___; + ldx [$inp+0],@X[0] + ldx [$inp+16],@X[2] + ldx [$inp+32],@X[4] + ldx [$inp+48],@X[6] + ldx [$inp+8],@X[1] + ldx [$inp+24],@X[3] + subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too + ldx [$inp+40],@X[5] + bz,pt %icc,.Laligned + ldx [$inp+56],@X[7] + + sllx @X[0],$tmp31,@X[0] + ldx [$inp+64],$T1 +___ +for($j=0;$j<7;$j++) +{ $code.=<<___; + srlx @X[$j+1],$tmp32,$tmp1 + sllx @X[$j+1],$tmp31,@X[$j+1] + or $tmp1,@X[$j],@X[$j] +___ +} +$code.=<<___; + srlx $T1,$tmp32,$T1 + or $T1,@X[7],@X[7] +.Laligned: +___ + } + + if ($i&1) { + $code.="\tadd @X[$i/2],$h,$T1\n"; + } else { + $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n"; + } +} if ($SZ==4); + +########### SHA512 +$Xload = sub { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8)); + +$code.=<<___ if ($i==0); + ld [$inp+0],%l0 + ld [$inp+4],%l1 + ld [$inp+8],%l2 + ld [$inp+12],%l3 + ld [$inp+16],%l4 + ld [$inp+20],%l5 + ld [$inp+24],%l6 + cmp $tmp31,0 + ld [$inp+28],%l7 +___ +$code.=<<___ if ($i<15); + sllx @pair[1],$tmp31,$tmp2 ! Xload($i) + add $tmp31,32,$tmp0 + sllx @pair[0],$tmp0,$tmp1 + `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)` + srlx @pair[2],$tmp32,@pair[1] + or $tmp1,$tmp2,$tmp2 + or @pair[1],$tmp2,$tmp2 + `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)` + add $h,$tmp2,$T1 + $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`] +___ +$code.=<<___ if ($i==12); + bnz,a,pn %icc,.+8 + ld [$inp+128],%l0 +___ +$code.=<<___ if ($i==15); + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2 + sllx @pair[1],$tmp31,$tmp2 ! 
Xload($i) + add $tmp31,32,$tmp0 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3 + sllx @pair[0],$tmp0,$tmp1 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4 + srlx @pair[2],$tmp32,@pair[1] + or $tmp1,$tmp2,$tmp2 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5 + or @pair[1],$tmp2,$tmp2 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6 + add $h,$tmp2,$T1 + $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`] + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1 +___ +} if ($SZ==8); + +########### common +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; + + if ($i<16) { + &$Xload(@_); + } else { + $code.="\tadd $h,$T1,$T1\n"; + } + +$code.=<<___; + $SRL $e,@Sigma1[0],$h !! $i + xor $f,$g,$tmp2 + $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1 + and $e,$tmp2,$tmp2 + $SRL $e,@Sigma1[1],$tmp0 + xor $tmp1,$h,$h + $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1 + xor $tmp0,$h,$h + $SRL $e,@Sigma1[2],$tmp0 + xor $tmp1,$h,$h + $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1 + xor $tmp0,$h,$h + xor $g,$tmp2,$tmp2 ! Ch(e,f,g) + xor $tmp1,$h,$tmp0 ! Sigma1(e) + + $SRL $a,@Sigma0[0],$h + add $tmp2,$T1,$T1 + $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i] + $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1 + add $tmp0,$T1,$T1 + $SRL $a,@Sigma0[1],$tmp0 + xor $tmp1,$h,$h + $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1 + xor $tmp0,$h,$h + $SRL $a,@Sigma0[2],$tmp0 + xor $tmp1,$h,$h + $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1 + xor $tmp0,$h,$h + xor $tmp1,$h,$h ! Sigma0(a) + + or $a,$b,$tmp0 + and $a,$b,$tmp1 + and $c,$tmp0,$tmp0 + or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c) + add $tmp2,$T1,$T1 ! +=K[$i] + add $tmp1,$h,$h + + add $T1,$d,$d + add $T1,$h,$h +___ +} + +########### SHA256 +$BODY_16_XX = sub { +my $i=@_[0]; +my $xi; + + if ($i&1) { + $xi=$tmp32; + $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n"; + } else { + $xi=@X[(($i+1)/2)%8]; + } +$code.=<<___; + srl $xi,@sigma0[0],$T1 !! Xupdate($i) + sll $xi,`32-@sigma0[2]`,$tmp1 + srl $xi,@sigma0[1],$tmp0 + xor $tmp1,$T1,$T1 + sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1 + xor $tmp0,$T1,$T1 + srl $xi,@sigma0[2],$tmp0 + xor $tmp1,$T1,$T1 +___ + if ($i&1) { + $xi=@X[(($i+14)/2)%8]; + } else { + $xi=$tmp32; + $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n"; + } +$code.=<<___; + srl $xi,@sigma1[0],$tmp2 + xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1]) + sll $xi,`32-@sigma1[2]`,$tmp1 + srl $xi,@sigma1[1],$tmp0 + xor $tmp1,$tmp2,$tmp2 + sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1 + xor $tmp0,$tmp2,$tmp2 + srl $xi,@sigma1[2],$tmp0 + xor $tmp1,$tmp2,$tmp2 +___ + if ($i&1) { + $xi=@X[($i/2)%8]; +$code.=<<___; + srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9] + xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) + srl @X[($i/2)%8],0,$tmp0 + add $tmp2,$tmp1,$tmp1 + add $xi,$T1,$T1 ! +=X[i] + xor $tmp0,@X[($i/2)%8],@X[($i/2)%8] + add $tmp1,$T1,$T1 + + srl $T1,0,$T1 + or $T1,@X[($i/2)%8],@X[($i/2)%8] +___ + } else { + $xi=@X[(($i+9)/2)%8]; +$code.=<<___; + srlx @X[($i/2)%8],32,$tmp1 ! X[i] + xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) + add $xi,$T1,$T1 ! +=X[i+9] + add $tmp2,$tmp1,$tmp1 + srl @X[($i/2)%8],0,@X[($i/2)%8] + add $tmp1,$T1,$T1 + + sllx $T1,32,$tmp0 + or $tmp0,@X[($i/2)%8],@X[($i/2)%8] +___ + } + &BODY_00_15(@_); +} if ($SZ==4); + +########### SHA512 +$BODY_16_XX = sub { +my $i=@_[0]; +my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1)); + +$code.=<<___; + sllx %l2,32,$tmp0 !! 
Xupdate($i) + or %l3,$tmp0,$tmp0 + + srlx $tmp0,@sigma0[0],$T1 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2 + sllx $tmp0,`64-@sigma0[2]`,$tmp1 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3 + srlx $tmp0,@sigma0[1],$tmp0 + xor $tmp1,$T1,$T1 + sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1 + xor $tmp0,$T1,$T1 + srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0 + xor $tmp1,$T1,$T1 + sllx %l6,32,$tmp2 + xor $tmp0,$T1,$T1 ! sigma0(X[$i+1]) + or %l7,$tmp2,$tmp2 + + srlx $tmp2,@sigma1[0],$tmp1 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6 + sllx $tmp2,`64-@sigma1[2]`,$tmp0 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7 + srlx $tmp2,@sigma1[1],$tmp2 + xor $tmp0,$tmp1,$tmp1 + sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0 + xor $tmp2,$tmp1,$tmp1 + srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2 + xor $tmp0,$tmp1,$tmp1 + sllx %l4,32,$tmp0 + xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14]) + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4 + or %l5,$tmp0,$tmp0 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5 + + sllx %l0,32,$tmp2 + add $tmp1,$T1,$T1 + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0 + or %l1,$tmp2,$tmp2 + add $tmp0,$T1,$T1 ! +=X[$i+9] + ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1 + add $tmp2,$T1,$T1 ! +=X[$i] + $ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`] +___ + &BODY_00_15(@_); +} if ($SZ==8); + +$code.=<<___; +#include "sparc_arch.h" + +#ifdef __arch64__ +.register %g2,#scratch +.register %g3,#scratch +#endif + +.section ".text",#alloc,#execinstr + +.align 64 +K${label}: +.type K${label},#object +___ +if ($SZ==4) { +$code.=<<___; + .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +___ +} else { +$code.=<<___; + .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd + .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc + .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 + .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 + .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe + .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 + .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 + .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 + .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 + .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 + .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 + .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 + .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 + .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 + .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 + .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 + .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 + .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df + .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 + 
.long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b + .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 + .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 + .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 + .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 + .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 + .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 + .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb + .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 + .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 + .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec + .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 + .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b + .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 + .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 + .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 + .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b + .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 + .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c + .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a + .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 +___ +} +$code.=<<___; +.size K${label},.-K${label} + +#ifdef __PIC__ +SPARC_PIC_THUNK(%g1) +#endif + +.globl sha${label}_block_data_order +.align 32 +sha${label}_block_data_order: + SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) + ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1] + + andcc %g1, CFR_SHA${label}, %g0 + be .Lsoftware + nop +___ +$code.=<<___ if ($SZ==8); # SHA512 + ldd [%o0 + 0x00], %f0 ! load context + ldd [%o0 + 0x08], %f2 + ldd [%o0 + 0x10], %f4 + ldd [%o0 + 0x18], %f6 + ldd [%o0 + 0x20], %f8 + ldd [%o0 + 0x28], %f10 + andcc %o1, 0x7, %g0 + ldd [%o0 + 0x30], %f12 + bne,pn %icc, .Lhwunaligned + ldd [%o0 + 0x38], %f14 + +.Lhwaligned_loop: + ldd [%o1 + 0x00], %f16 + ldd [%o1 + 0x08], %f18 + ldd [%o1 + 0x10], %f20 + ldd [%o1 + 0x18], %f22 + ldd [%o1 + 0x20], %f24 + ldd [%o1 + 0x28], %f26 + ldd [%o1 + 0x30], %f28 + ldd [%o1 + 0x38], %f30 + ldd [%o1 + 0x40], %f32 + ldd [%o1 + 0x48], %f34 + ldd [%o1 + 0x50], %f36 + ldd [%o1 + 0x58], %f38 + ldd [%o1 + 0x60], %f40 + ldd [%o1 + 0x68], %f42 + ldd [%o1 + 0x70], %f44 + subcc %o2, 1, %o2 ! done yet? + ldd [%o1 + 0x78], %f46 + add %o1, 0x80, %o1 + prefetch [%o1 + 63], 20 + prefetch [%o1 + 64+63], 20 + + .word 0x81b02860 ! SHA512 + + bne,pt SIZE_T_CC, .Lhwaligned_loop + nop + +.Lhwfinish: + std %f0, [%o0 + 0x00] ! store context + std %f2, [%o0 + 0x08] + std %f4, [%o0 + 0x10] + std %f6, [%o0 + 0x18] + std %f8, [%o0 + 0x20] + std %f10, [%o0 + 0x28] + std %f12, [%o0 + 0x30] + retl + std %f14, [%o0 + 0x38] + +.align 16 +.Lhwunaligned: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f18 +.Lhwunaligned_loop: + ldd [%o1 + 0x08], %f20 + ldd [%o1 + 0x10], %f22 + ldd [%o1 + 0x18], %f24 + ldd [%o1 + 0x20], %f26 + ldd [%o1 + 0x28], %f28 + ldd [%o1 + 0x30], %f30 + ldd [%o1 + 0x38], %f32 + ldd [%o1 + 0x40], %f34 + ldd [%o1 + 0x48], %f36 + ldd [%o1 + 0x50], %f38 + ldd [%o1 + 0x58], %f40 + ldd [%o1 + 0x60], %f42 + ldd [%o1 + 0x68], %f44 + ldd [%o1 + 0x70], %f46 + ldd [%o1 + 0x78], %f48 + subcc %o2, 1, %o2 ! done yet? 
+ ldd [%o1 + 0x80], %f50 + add %o1, 0x80, %o1 + prefetch [%o1 + 63], 20 + prefetch [%o1 + 64+63], 20 + + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + faligndata %f26, %f28, %f24 + faligndata %f28, %f30, %f26 + faligndata %f30, %f32, %f28 + faligndata %f32, %f34, %f30 + faligndata %f34, %f36, %f32 + faligndata %f36, %f38, %f34 + faligndata %f38, %f40, %f36 + faligndata %f40, %f42, %f38 + faligndata %f42, %f44, %f40 + faligndata %f44, %f46, %f42 + faligndata %f46, %f48, %f44 + faligndata %f48, %f50, %f46 + + .word 0x81b02860 ! SHA512 + + bne,pt SIZE_T_CC, .Lhwunaligned_loop + for %f50, %f50, %f18 ! %f18=%f50 + + ba .Lhwfinish + nop +___ +$code.=<<___ if ($SZ==4); # SHA256 + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + ld [%o0 + 0x08], %f2 + ld [%o0 + 0x0c], %f3 + ld [%o0 + 0x10], %f4 + ld [%o0 + 0x14], %f5 + andcc %o1, 0x7, %g0 + ld [%o0 + 0x18], %f6 + bne,pn %icc, .Lhwunaligned + ld [%o0 + 0x1c], %f7 + +.Lhwloop: + ldd [%o1 + 0x00], %f8 + ldd [%o1 + 0x08], %f10 + ldd [%o1 + 0x10], %f12 + ldd [%o1 + 0x18], %f14 + ldd [%o1 + 0x20], %f16 + ldd [%o1 + 0x28], %f18 + ldd [%o1 + 0x30], %f20 + subcc %o2, 1, %o2 ! done yet? + ldd [%o1 + 0x38], %f22 + add %o1, 0x40, %o1 + prefetch [%o1 + 63], 20 + + .word 0x81b02840 ! SHA256 + + bne,pt SIZE_T_CC, .Lhwloop + nop + +.Lhwfinish: + st %f0, [%o0 + 0x00] ! store context + st %f1, [%o0 + 0x04] + st %f2, [%o0 + 0x08] + st %f3, [%o0 + 0x0c] + st %f4, [%o0 + 0x10] + st %f5, [%o0 + 0x14] + st %f6, [%o0 + 0x18] + retl + st %f7, [%o0 + 0x1c] + +.align 8 +.Lhwunaligned: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f10 +.Lhwunaligned_loop: + ldd [%o1 + 0x08], %f12 + ldd [%o1 + 0x10], %f14 + ldd [%o1 + 0x18], %f16 + ldd [%o1 + 0x20], %f18 + ldd [%o1 + 0x28], %f20 + ldd [%o1 + 0x30], %f22 + ldd [%o1 + 0x38], %f24 + subcc %o2, 1, %o2 ! done yet? + ldd [%o1 + 0x40], %f26 + add %o1, 0x40, %o1 + prefetch [%o1 + 63], 20 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + .word 0x81b02840 ! SHA256 + + bne,pt SIZE_T_CC, .Lhwunaligned_loop + for %f26, %f26, %f10 ! %f10=%f26 + + ba .Lhwfinish + nop +___ +$code.=<<___; +.align 16 +.Lsoftware: + save %sp,-STACK_FRAME-$locals,%sp + and $inp,`$align-1`,$tmp31 + sllx $len,`log(16*$SZ)/log(2)`,$len + andn $inp,`$align-1`,$inp + sll $tmp31,3,$tmp31 + add $inp,$len,$len +___ +$code.=<<___ if ($SZ==8); # SHA512 + mov 32,$tmp32 + sub $tmp32,$tmp31,$tmp32 +___ +$code.=<<___; +.Lpic: call .+8 + add %o7,K${label}-.Lpic,$Ktbl + + $LD [$ctx+`0*$SZ`],$A + $LD [$ctx+`1*$SZ`],$B + $LD [$ctx+`2*$SZ`],$C + $LD [$ctx+`3*$SZ`],$D + $LD [$ctx+`4*$SZ`],$E + $LD [$ctx+`5*$SZ`],$F + $LD [$ctx+`6*$SZ`],$G + $LD [$ctx+`7*$SZ`],$H + +.Lloop: +___ +for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=".L16_xx:\n"; +for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + and $tmp2,0xfff,$tmp2 + cmp $tmp2,$lastK + bne .L16_xx + add $Ktbl,`16*$SZ`,$Ktbl ! 
Ktbl+=16 + +___ +$code.=<<___ if ($SZ==4); # SHA256 + $LD [$ctx+`0*$SZ`],@X[0] + $LD [$ctx+`1*$SZ`],@X[1] + $LD [$ctx+`2*$SZ`],@X[2] + $LD [$ctx+`3*$SZ`],@X[3] + $LD [$ctx+`4*$SZ`],@X[4] + $LD [$ctx+`5*$SZ`],@X[5] + $LD [$ctx+`6*$SZ`],@X[6] + $LD [$ctx+`7*$SZ`],@X[7] + + add $A,@X[0],$A + $ST $A,[$ctx+`0*$SZ`] + add $B,@X[1],$B + $ST $B,[$ctx+`1*$SZ`] + add $C,@X[2],$C + $ST $C,[$ctx+`2*$SZ`] + add $D,@X[3],$D + $ST $D,[$ctx+`3*$SZ`] + add $E,@X[4],$E + $ST $E,[$ctx+`4*$SZ`] + add $F,@X[5],$F + $ST $F,[$ctx+`5*$SZ`] + add $G,@X[6],$G + $ST $G,[$ctx+`6*$SZ`] + add $H,@X[7],$H + $ST $H,[$ctx+`7*$SZ`] +___ +$code.=<<___ if ($SZ==8); # SHA512 + ld [$ctx+`0*$SZ+0`],%l0 + ld [$ctx+`0*$SZ+4`],%l1 + ld [$ctx+`1*$SZ+0`],%l2 + ld [$ctx+`1*$SZ+4`],%l3 + ld [$ctx+`2*$SZ+0`],%l4 + ld [$ctx+`2*$SZ+4`],%l5 + ld [$ctx+`3*$SZ+0`],%l6 + + sllx %l0,32,$tmp0 + ld [$ctx+`3*$SZ+4`],%l7 + sllx %l2,32,$tmp1 + or %l1,$tmp0,$tmp0 + or %l3,$tmp1,$tmp1 + add $tmp0,$A,$A + add $tmp1,$B,$B + $ST $A,[$ctx+`0*$SZ`] + sllx %l4,32,$tmp2 + $ST $B,[$ctx+`1*$SZ`] + sllx %l6,32,$T1 + or %l5,$tmp2,$tmp2 + or %l7,$T1,$T1 + add $tmp2,$C,$C + $ST $C,[$ctx+`2*$SZ`] + add $T1,$D,$D + $ST $D,[$ctx+`3*$SZ`] + + ld [$ctx+`4*$SZ+0`],%l0 + ld [$ctx+`4*$SZ+4`],%l1 + ld [$ctx+`5*$SZ+0`],%l2 + ld [$ctx+`5*$SZ+4`],%l3 + ld [$ctx+`6*$SZ+0`],%l4 + ld [$ctx+`6*$SZ+4`],%l5 + ld [$ctx+`7*$SZ+0`],%l6 + + sllx %l0,32,$tmp0 + ld [$ctx+`7*$SZ+4`],%l7 + sllx %l2,32,$tmp1 + or %l1,$tmp0,$tmp0 + or %l3,$tmp1,$tmp1 + add $tmp0,$E,$E + add $tmp1,$F,$F + $ST $E,[$ctx+`4*$SZ`] + sllx %l4,32,$tmp2 + $ST $F,[$ctx+`5*$SZ`] + sllx %l6,32,$T1 + or %l5,$tmp2,$tmp2 + or %l7,$T1,$T1 + add $tmp2,$G,$G + $ST $G,[$ctx+`6*$SZ`] + add $T1,$H,$H + $ST $H,[$ctx+`7*$SZ`] +___ +$code.=<<___; + add $inp,`16*$SZ`,$inp ! advance inp + cmp $inp,$len + bne SIZE_T_CC,.Lloop + sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl + + ret + restore +.type sha${label}_block_data_order,#function +.size sha${label}_block_data_order,(.-sha${label}_block_data_order) +.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by " +.align 4 +___ + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. 
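As a concrete illustration of the run-time encoding described above, here is a minimal standalone sketch (illustrative only; the helper name is hypothetical, and it assumes FP registers below %f32, so the upper-bank re-encoding that the real unvis() below also handles is not needed). It applies the same bit layout and the same faligndata opf value used by the code that follows:

use strict;
use warnings;

sub encode_faligndata {
    my ($rs1, $rs2, $rd) = @_;   # plain FP register numbers, e.g. 18, 20, 16
    my $opf = 0x048;             # VIS1 faligndata, as in the %visopf table below
    return 0x81b00000 | ($rd << 25) | ($rs1 << 14) | ($opf << 5) | $rs2;
}

# faligndata %f18,%f20,%f16 emitted as a raw word, no VIS assembler support needed
printf ".word\t0x%08x ! faligndata %%f18,%%f20,%%f16\n",
       encode_faligndata(18, 20, 16);

Because the instruction is emitted as a plain .word, the module assembles even when the toolchain is not told about VIS, and whether it is ever executed is left to a run-time capability check (here, the CFR_SHA${label} test at the top of sha${label}_block_data_order).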
+sub unvis { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my $ref,$opf; +my %visopf = ( "faligndata" => 0x048, + "for" => 0x07c ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} +sub unalignaddr { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my $ref="$mnemonic\t$rs1,$rs2,$rd"; + + foreach ($rs1,$rs2,$rd) { + if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; } + else { return $ref; } + } + return sprintf ".word\t0x%08x !%s", + 0x81b00300|$rd<<25|$rs1<<14|$rs2, + $ref; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/ + &unvis($1,$2,$3,$4) + /ge; + s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unalignaddr($1,$2,$3,$4) + /ge; + + print $_,"\n"; +} + +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512-x86_64.pl b/openssl-1.1.0h/crypto/sha/asm/sha512-x86_64.pl new file mode 100755 index 0000000..c9b7b28 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha512-x86_64.pl @@ -0,0 +1,2407 @@ +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. Rights for redistribution and usage in source and binary +# forms are granted according to the OpenSSL license. +# ==================================================================== +# +# sha256/512_block procedure for x86_64. +# +# 40% improvement over compiler-generated code on Opteron. On EM64T +# sha256 was observed to run >80% faster and sha512 - >40%. No magical +# tricks, just straight implementation... I really wonder why gcc +# [being armed with inline assembler] fails to generate as fast code. +# The only thing which is cool about this module is that it's very +# same instruction sequence used for both SHA-256 and SHA-512. In +# former case the instructions operate on 32-bit operands, while in +# latter - on 64-bit ones. All I had to do is to get one flavor right, +# the other one passed the test right away:-) +# +# sha256_block runs in ~1005 cycles on Opteron, which gives you +# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock +# frequency in GHz. sha512_block runs in ~1275 cycles, which results +# in 128*1000/1275=100MBps per GHz. Is there room for improvement? +# Well, if you compare it to IA-64 implementation, which maintains +# X[16] in register bank[!], tends to 4 instructions per CPU clock +# cycle and runs in 1003 cycles, 1275 is very good result for 3-way +# issue Opteron pipeline and X[16] maintained in memory. So that *if* +# there is a way to improve it, *then* the only way would be to try to +# offload X[16] updates to SSE unit, but that would require "deeper" +# loop unroll, which in turn would naturally cause size blow-up, not +# to mention increased complexity! 
And once again, only *if* it's +# actually possible to noticeably improve overall ILP, instruction +# level parallelism, on a given CPU implementation in this case. +# +# Special note on Intel EM64T. While Opteron CPU exhibits perfect +# performance ratio of 1.5 between 64- and 32-bit flavors [see above], +# [currently available] EM64T CPUs apparently are far from it. On the +# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit +# sha256_block:-( This is presumably because 64-bit shifts/rotates +# apparently are not atomic instructions, but implemented in microcode. +# +# May 2012. +# +# Optimization including one of Pavel Semjanov's ideas, alternative +# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and +# unfortunately -2% SHA512 on P4 [which nobody should care about +# that much]. +# +# June 2012. +# +# Add SIMD code paths, see below for improvement coefficients. SSSE3 +# code path was not attempted for SHA512, because improvement is not +# estimated to be high enough, noticeably less than 9%, to justify +# the effort, not on pre-AVX processors. [Obviously with exclusion +# for VIA Nano, but it has SHA512 instruction that is faster and +# should be used instead.] For reference, corresponding estimated +# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that +# higher coefficients are observed on VIA Nano and Bulldozer has more +# to do with specifics of their architecture [which is topic for +# separate discussion]. +# +# November 2012. +# +# Add AVX2 code path. Two consecutive input blocks are loaded to +# 256-bit %ymm registers, with data from first block to least +# significant 128-bit halves and data from second to most significant. +# The data is then processed with same SIMD instruction sequence as +# for AVX, but with %ymm as operands. Side effect is increased stack +# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB +# code size increase. +# +# March 2014. +# +# Add support for Intel SHA Extensions. 
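The "alternative Maj" from the May 2012 note relies on the bitwise identity Maj(a,b,c) = Ch(a^b,c,b): in that form the a^b value computed in one round doubles as the b^c needed by the next (see the "# a^b, b^c in next round" comments in the round bodies below). A quick illustrative Perl check of the identity over all one-bit inputs, which suffices because both sides operate bitwise:

use strict;
use warnings;

sub Maj { my ($a,$b,$c) = @_; return ($a & $b) ^ ($a & $c) ^ ($b & $c); }
sub Ch  { my ($x,$y,$z) = @_; return ($x & $y) ^ ((~$x & 1) & $z); }  # 1-bit NOT of $x

for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
    die "mismatch at ($a,$b,$c)\n" if Maj($a,$b,$c) != Ch($a ^ $b, $c, $b);
}}}
print "Maj(a,b,c) == Ch(a^b,c,b) holds for every bit pattern\n";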
+ +###################################################################### +# Current performance in cycles per processed byte (less is better): +# +# SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) +# +# AMD K8 14.9 - - 9.57 - +# P4 17.3 - - 30.8 - +# Core 2 15.6 13.8(+13%) - 9.97 - +# Westmere 14.8 12.3(+19%) - 9.58 - +# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) +# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) +# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) +# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) +# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) +# VIA Nano 23.0 16.5(+39%) - 14.7 - +# Atom 23.0 18.9(+22%) - 14.7 - +# Silvermont 27.4 20.6(+33%) - 17.5 - +# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - +# +# (*) whichever best applicable, including SHAEXT; +# (**) switch from ror to shrd stands for fair share of improvement; +# (***) execution time is fully determined by remaining integer-only +# part, body_00_15; reducing the amount of SIMD instructions +# below certain limit makes no difference/sense; to conserve +# space SHA256 XOP code path is therefore omitted; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +$shaext=1; ### set to zero if compiling for 1.0.1 +$avx=1 if (!$shaext && $avx); + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +if ($output =~ /512/) { + $func="sha512_block_data_order"; + $TABLE="K512"; + $SZ=8; + @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", + "%r8", "%r9", "%r10","%r11"); + ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; +} else { + $func="sha256_block_data_order"; + $TABLE="K256"; + $SZ=4; + @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); + ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; +} + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$_rsp="16*$SZ+3*8(%rsp)"; +$framesz="16*$SZ+4*8"; + + +sub ROUND_00_15() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + my $STRIDE=$SZ; + $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); + +$code.=<<___; + ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 + mov $f,$a2 + + xor $e,$a0 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $g,$a2 # f^g + + mov $T1,`$SZ*($i&0xf)`(%rsp) + xor $a,$a1 + and $e,$a2 # (f^g)&e + + ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 + add $h,$T1 
# T1+=h + xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g + + ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 + xor $e,$a0 + add $a2,$T1 # T1+=Ch(e,f,g) + + mov $a,$a2 + add ($Tbl),$T1 # T1+=K[round] + xor $a,$a1 + + xor $b,$a2 # a^b, b^c in next round + ror \$$Sigma1[0],$a0 # Sigma1(e) + mov $b,$h + + and $a2,$a3 + ror \$$Sigma0[0],$a1 # Sigma0(a) + add $a0,$T1 # T1+=Sigma1(e) + + xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + add $T1,$d # d+=T1 + add $T1,$h # h+=T1 + + lea $STRIDE($Tbl),$Tbl # round++ +___ +$code.=<<___ if ($i<15); + add $a1,$h # h+=Sigma0(a) +___ + ($a2,$a3) = ($a3,$a2); +} + +sub ROUND_16_XX() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 + + mov $a0,$T1 + ror \$`$sigma0[1]-$sigma0[0]`,$a0 + add $a1,$a # modulo-scheduled h+=Sigma0(a) + mov $a2,$a1 + ror \$`$sigma1[1]-$sigma1[0]`,$a2 + + xor $T1,$a0 + shr \$$sigma0[2],$T1 + ror \$$sigma0[0],$a0 + xor $a1,$a2 + shr \$$sigma1[2],$a1 + + ror \$$sigma1[0],$a2 + xor $a0,$T1 # sigma0(X[(i+1)&0xf]) + xor $a1,$a2 # sigma1(X[(i+14)&0xf]) + add `$SZ*(($i+9)&0xf)`(%rsp),$T1 + + add `$SZ*($i&0xf)`(%rsp),$T1 + mov $e,$a0 + add $a2,$T1 + mov $a,$a1 +___ + &ROUND_00_15(@_); +} + +$code=<<___; +.text + +.extern OPENSSL_ia32cap_P +.globl $func +.type $func,\@function,3 +.align 16 +$func: +___ +$code.=<<___ if ($SZ==4 || $avx); + lea OPENSSL_ia32cap_P(%rip),%r11 + mov 0(%r11),%r9d + mov 4(%r11),%r10d + mov 8(%r11),%r11d +___ +$code.=<<___ if ($SZ==4 && $shaext); + test \$`1<<29`,%r11d # check for SHA + jnz _shaext_shortcut +___ +$code.=<<___ if ($avx && $SZ==8); + test \$`1<<11`,%r10d # check for XOP + jnz .Lxop_shortcut +___ +$code.=<<___ if ($avx>1); + and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 + cmp \$`1<<8|1<<5|1<<3`,%r11d + je .Lavx2_shortcut +___ +$code.=<<___ if ($avx); + and \$`1<<30`,%r9d # mask "Intel CPU" bit + and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits + or %r9d,%r10d + cmp \$`1<<28|1<<9|1<<30`,%r10d + je .Lavx_shortcut +___ +$code.=<<___ if ($SZ==4); + test \$`1<<9`,%r10d + jnz .Lssse3_shortcut +___ +$code.=<<___; + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + and \$-64,%rsp # align stack frame + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %r11,$_rsp # save copy of %rsp +.Lprologue: + + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H + jmp .Lloop + +.align 16 +.Lloop: + mov $B,$a3 + lea $TABLE(%rip),$Tbl + xor $C,$a3 # magic +___ + for($i=0;$i<16;$i++) { + $code.=" mov $SZ*$i($inp),$T1\n"; + $code.=" mov @ROT[4],$a0\n"; + $code.=" mov @ROT[0],$a1\n"; + $code.=" bswap $T1\n"; + &ROUND_00_15($i,@ROT); + unshift(@ROT,pop(@ROT)); + } +$code.=<<___; + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: +___ + for(;$i<32;$i++) { + &ROUND_16_XX($i,@ROT); + unshift(@ROT,pop(@ROT)); + } + +$code.=<<___; + cmpb \$0,`$SZ-1`($Tbl) + jnz .Lrounds_16_xx + + mov $_ctx,$ctx + add $a1,$A # modulo-scheduled h+=Sigma0(a) + lea 16*$SZ($inp),$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov 
$F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop + + mov $_rsp,%rsi + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue: + ret +.size $func,.-$func +___ + +if ($SZ==4) { +$code.=<<___; +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by " +___ +} else { +$code.=<<___; +.align 64 +.type $TABLE,\@object +$TABLE: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 
0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + + .quad 0x0001020304050607,0x08090a0b0c0d0e0f + .quad 0x0001020304050607,0x08090a0b0c0d0e0f + .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by " +___ +} + +###################################################################### +# SIMD code paths +# +if ($SZ==4 && $shaext) {{{ +###################################################################### +# Intel SHA Extensions implementation of SHA256 update function. 
+# +my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.type sha256_block_data_order_shaext,\@function,3 +.align 64 +sha256_block_data_order_shaext: +_shaext_shortcut: +___ +$code.=<<___ if ($win64); + lea `-8-5*16`(%rsp),%rsp + movaps %xmm6,-8-5*16(%rax) + movaps %xmm7,-8-4*16(%rax) + movaps %xmm8,-8-3*16(%rax) + movaps %xmm9,-8-2*16(%rax) + movaps %xmm10,-8-1*16(%rax) +.Lprologue_shaext: +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x200-0x80($Tbl),$TMP # byte swap mask + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi + nop + movdqa $ABEF,$ABEF_SAVE # offload + sha256rnds2 $CDGH,$ABEF + + movdqa 1*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi + lea 0x40($inp),$inp + sha256msg1 @MSG[1],@MSG[0] + sha256rnds2 $CDGH,$ABEF + + movdqa 2*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + nop + paddd $TMP,@MSG[0] + sha256msg1 @MSG[2],@MSG[1] + sha256rnds2 $CDGH,$ABEF + + movdqa 3*32-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + nop + paddd $TMP,@MSG[1] + sha256msg1 @MSG[3],@MSG[2] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { +$code.=<<___; + movdqa $i*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 16-19... 
+ pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + nop + paddd $TMP,@MSG[2] + sha256msg1 @MSG[0],@MSG[3] + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + sha256rnds2 $CDGH,$ABEF + paddd $TMP,@MSG[2] + + movdqa 14*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP + sha256rnds2 $CDGH,$ABEF + + movdqa 15*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + nop + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi + dec $num + nop + sha256rnds2 $CDGH,$ABEF + + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -8-5*16(%rax),%xmm6 + movaps -8-4*16(%rax),%xmm7 + movaps -8-3*16(%rax),%xmm8 + movaps -8-2*16(%rax),%xmm9 + movaps -8-1*16(%rax),%xmm10 + mov %rax,%rsp +.Lepilogue_shaext: +___ +$code.=<<___; + ret +.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext +___ +}}} +{{{ + +my $a4=$T1; +my ($a,$b,$c,$d,$e,$f,$g,$h); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&ror ($a0,$Sigma1[2]-$Sigma1[1])', + '&mov ($a,$a1)', + '&mov ($a4,$f)', + + '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a0,$e)', + '&xor ($a4,$g)', # f^g + + '&ror ($a0,$Sigma1[1]-$Sigma1[0])', + '&xor ($a1,$a)', + '&and ($a4,$e)', # (f^g)&e + + '&xor ($a0,$e)', + '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] + '&mov ($a2,$a)', + + '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', + '&xor ($a2,$b)', # a^b, b^c in next round + + '&add ($h,$a4)', # h+=Ch(e,f,g) + '&ror ($a0,$Sigma1[0])', # Sigma1(e) + '&and ($a3,$a2)', # (b^c)&(a^b) + + '&xor ($a1,$a)', + '&add ($h,$a0)', # h+=Sigma1(e) + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + + '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($d,$h)', # d+=h + '&add ($h,$a3)', # h+=Maj(a,b,c) + + '&mov ($a0,$d)', + '&add ($a1,$h);'. 
# h+=Sigma0(a) + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); +} + +###################################################################### +# SSSE3 code path +# +if ($SZ==4) { # SHA256 only +my @X = map("%xmm$_",(0..3)); +my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; +.type ${func}_ssse3,\@function,3 +.align 64 +${func}_ssse3: +.Lssse3_shortcut: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + shl \$4,%rdx # num*16 + sub \$`$framesz+$win64*16*4`,%rsp + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + and \$-64,%rsp # align stack frame + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %r11,$_rsp # save copy of %rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,16*$SZ+32(%rsp) + movaps %xmm7,16*$SZ+48(%rsp) + movaps %xmm8,16*$SZ+64(%rsp) + movaps %xmm9,16*$SZ+80(%rsp) +___ +$code.=<<___; +.Lprologue_ssse3: + + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + +$code.=<<___; + #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + movdqu 0x00($inp),@X[0] + movdqu 0x10($inp),@X[1] + movdqu 0x20($inp),@X[2] + pshufb $t3,@X[0] + movdqu 0x30($inp),@X[3] + lea $TABLE(%rip),$Tbl + pshufb $t3,@X[1] + movdqa 0x00($Tbl),$t0 + movdqa 0x20($Tbl),$t1 + pshufb $t3,@X[2] + paddd @X[0],$t0 + movdqa 0x40($Tbl),$t2 + pshufb $t3,@X[3] + movdqa 0x60($Tbl),$t3 + paddd @X[1],$t1 + paddd @X[2],$t2 + paddd @X[3],$t3 + movdqa $t0,0x00(%rsp) + mov $A,$a1 + movdqa $t1,0x10(%rsp) + mov $B,$a3 + movdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + movdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + sub \$`-16*2*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_SSSE3 () { + ( + '&movdqa ($t0,@X[1]);', + '&movdqa ($t3,@X[3])', + '&palignr ($t0,@X[0],$SZ)', # X[1..4] + '&palignr ($t3,@X[2],$SZ);', # X[9..12] + '&movdqa ($t1,$t0)', + '&movdqa ($t2,$t0);', + '&psrld ($t0,$sigma0[2])', + '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] + '&psrld ($t2,$sigma0[0])', + '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&pslld ($t1,8*$SZ-$sigma0[1]);'. + '&pxor ($t0,$t2)', + '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. + '&pxor ($t0,$t1)', + '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
+ '&pxor ($t0,$t2);', + '&movdqa ($t2,$t3)', + '&pxor ($t0,$t1);', # sigma0(X[1..4]) + '&psrld ($t3,$sigma1[2])', + '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2)', + '&pshufb ($t3,$t4)', # sigma1(X[14..15]) + '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&movdqa ($t2,$t3);', + '&psrld ($t3,$sigma1[2])', + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2);', + '&movdqa ($t2,16*2*$j."($Tbl)")', + '&pshufb ($t3,$t5)', + '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub SSSE3_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + if (0) { + foreach (Xupdate_256_SSSE3()) { # 36 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } else { # squeeze extra 4% on Westmere and 19% on Atom + eval(shift(@insns)); #@ + &movdqa ($t0,@X[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t3,@X[3]); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &palignr ($t0,@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &palignr ($t3,@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t1,$t0); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,$t0); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t2,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[3],0b11111010); # X[4..15] + eval(shift(@insns)); + eval(shift(@insns)); #@ + &pslld ($t1,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrld ($t2,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + &pxor ($t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + &pslld ($t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + #&pshufb ($t3,$t4); # sigma1(X[14..15]) + &pshufd ($t3,$t3,0b10000000); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + 
eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + #&pshufb ($t3,$t5); + &pshufd ($t3,$t3,0b00001000); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,16*2*$j."($Tbl)"); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &pslldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + } + &paddd ($t2,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &movdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &SSSE3_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); + &jne (".Lssse3_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + + add $SZ*0($ctx),$A + lea 16*$SZ($inp),$inp + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_ssse3 + + mov $_rsp,%rsi +___ +$code.=<<___ if ($win64); + movaps 16*$SZ+32(%rsp),%xmm6 + movaps 16*$SZ+48(%rsp),%xmm7 + movaps 16*$SZ+64(%rsp),%xmm8 + movaps 16*$SZ+80(%rsp),%xmm9 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_ssse3: + ret +.size ${func}_ssse3,.-${func}_ssse3 +___ +} + +if ($avx) {{ +###################################################################### +# XOP code path +# +if ($SZ==8) { # SHA512 only +$code.=<<___; +.type ${func}_xop,\@function,3 +.align 64 +${func}_xop: +.Lxop_shortcut: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + shl \$4,%rdx # num*16 + sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + and \$-64,%rsp # align stack frame + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %r11,$_rsp # save copy of %rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,16*$SZ+32(%rsp) + movaps %xmm7,16*$SZ+48(%rsp) + movaps %xmm8,16*$SZ+64(%rsp) + movaps %xmm9,16*$SZ+80(%rsp) +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps %xmm10,16*$SZ+96(%rsp) + movaps %xmm11,16*$SZ+112(%rsp) +___ +$code.=<<___; +.Lprologue_xop: + + vzeroupper + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H + jmp .Lloop_xop +___ + if ($SZ==4) { # SHA256 + my @X = map("%xmm$_",(0..3)); + my 
($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); + +$code.=<<___; +.align 16 +.Lloop_xop: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00($inp),@X[0] + vmovdqu 0x10($inp),@X[1] + vmovdqu 0x20($inp),@X[2] + vmovdqu 0x30($inp),@X[3] + vpshufb $t3,@X[0],@X[0] + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[1],@X[1] + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + mov $A,$a1 + vmovdqa $t1,0x10(%rsp) + mov $B,$a3 + vmovdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lxop_00_47 + +.align 16 +.Lxop_00_47: + sub \$`-16*2*$SZ`,$Tbl # size optimization +___ +sub XOP_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrld ($t0,$t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t0,$t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrld ($t2,@X[3],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrldq ($t3,$t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrld ($t2,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpslldq ($t3,$t3,8); # 22 instructions + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &XOP_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # 
rotate(@X) + } + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); + &jne (".Lxop_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } + + } else { # SHA512 + my @X = map("%xmm$_",(0..7)); + my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); + +$code.=<<___; +.align 16 +.Lloop_xop: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00($inp),@X[0] + lea $TABLE+0x80(%rip),$Tbl # size optimization + vmovdqu 0x10($inp),@X[1] + vmovdqu 0x20($inp),@X[2] + vpshufb $t3,@X[0],@X[0] + vmovdqu 0x30($inp),@X[3] + vpshufb $t3,@X[1],@X[1] + vmovdqu 0x40($inp),@X[4] + vpshufb $t3,@X[2],@X[2] + vmovdqu 0x50($inp),@X[5] + vpshufb $t3,@X[3],@X[3] + vmovdqu 0x60($inp),@X[6] + vpshufb $t3,@X[4],@X[4] + vmovdqu 0x70($inp),@X[7] + vpshufb $t3,@X[5],@X[5] + vpaddq -0x80($Tbl),@X[0],$t0 + vpshufb $t3,@X[6],@X[6] + vpaddq -0x60($Tbl),@X[1],$t1 + vpshufb $t3,@X[7],@X[7] + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + vpaddq 0x00($Tbl),@X[4],$t0 + vmovdqa $t1,0x10(%rsp) + vpaddq 0x20($Tbl),@X[5],$t1 + vmovdqa $t2,0x20(%rsp) + vpaddq 0x40($Tbl),@X[6],$t2 + vmovdqa $t3,0x30(%rsp) + vpaddq 0x60($Tbl),@X[7],$t3 + vmovdqa $t0,0x40(%rsp) + mov $A,$a1 + vmovdqa $t1,0x50(%rsp) + mov $B,$a3 + vmovdqa $t2,0x60(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x70(%rsp) + mov $E,$a0 + jmp .Lxop_00_47 + +.align 16 +.Lxop_00_47: + add \$`16*2*$SZ`,$Tbl +___ +sub XOP_512_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body); # 52 instructions + + &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] + eval(shift(@insns)); + eval(shift(@insns)); + &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] + eval(shift(@insns)); + eval(shift(@insns)); + &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrlq ($t0,$t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t0,$t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrlq ($t2,@X[7],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) + eval(shift(@insns)); + eval(shift(@insns)); + &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<8; $j++) { + &XOP_512_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); + &jne (".Lxop_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +} +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + + add $SZ*0($ctx),$A + lea 16*$SZ($inp),$inp + add $SZ*1($ctx),$B + add 
$SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_xop + + mov $_rsp,%rsi + vzeroupper +___ +$code.=<<___ if ($win64); + movaps 16*$SZ+32(%rsp),%xmm6 + movaps 16*$SZ+48(%rsp),%xmm7 + movaps 16*$SZ+64(%rsp),%xmm8 + movaps 16*$SZ+80(%rsp),%xmm9 +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps 16*$SZ+96(%rsp),%xmm10 + movaps 16*$SZ+112(%rsp),%xmm11 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_xop: + ret +.size ${func}_xop,.-${func}_xop +___ +} +###################################################################### +# AVX+shrd code path +# +local *ror = sub { &shrd(@_[0],@_) }; + +$code.=<<___; +.type ${func}_avx,\@function,3 +.align 64 +${func}_avx: +.Lavx_shortcut: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + shl \$4,%rdx # num*16 + sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + and \$-64,%rsp # align stack frame + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %r11,$_rsp # save copy of %rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,16*$SZ+32(%rsp) + movaps %xmm7,16*$SZ+48(%rsp) + movaps %xmm8,16*$SZ+64(%rsp) + movaps %xmm9,16*$SZ+80(%rsp) +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps %xmm10,16*$SZ+96(%rsp) + movaps %xmm11,16*$SZ+112(%rsp) +___ +$code.=<<___; +.Lprologue_avx: + + vzeroupper + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + if ($SZ==4) { # SHA256 + my @X = map("%xmm$_",(0..3)); + my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; + vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00($inp),@X[0] + vmovdqu 0x10($inp),@X[1] + vmovdqu 0x20($inp),@X[2] + vmovdqu 0x30($inp),@X[3] + vpshufb $t3,@X[0],@X[0] + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[1],@X[1] + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + mov $A,$a1 + vmovdqa $t1,0x10(%rsp) + mov $B,$a3 + vmovdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + sub \$`-16*2*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_AVX () { + ( + '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] + '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] + '&vpsrld ($t2,$t0,$sigma0[0]);', + '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] + '&vpsrld ($t3,$t0,$sigma0[2])', + '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', + '&vpxor ($t0,$t3,$t2)', + '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t1)', + '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t2)', + '&vpsrld ($t2,$t3,$sigma1[2]);', + '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) + '&vpsrlq ($t3,$t3,$sigma1[0]);', + '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += 
sigma0(X[1..4]) + '&vpxor ($t2,$t2,$t3);', + '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', + '&vpxor ($t2,$t2,$t3)', + '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) + '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) + '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&vpsrld ($t2,$t3,$sigma1[2])', + '&vpsrlq ($t3,$t3,$sigma1[0])', + '&vpxor ($t2,$t2,$t3);', + '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', + '&vpxor ($t2,$t2,$t3)', + '&vpshufb ($t2,$t2,$t5)', + '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub AVX_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + foreach (Xupdate_256_AVX()) { # 29 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &AVX_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); + &jne (".Lavx_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } + + } else { # SHA512 + my @X = map("%xmm$_",(0..7)); + my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); + +$code.=<<___; + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu 0x00($inp),@X[0] + lea $TABLE+0x80(%rip),$Tbl # size optimization + vmovdqu 0x10($inp),@X[1] + vmovdqu 0x20($inp),@X[2] + vpshufb $t3,@X[0],@X[0] + vmovdqu 0x30($inp),@X[3] + vpshufb $t3,@X[1],@X[1] + vmovdqu 0x40($inp),@X[4] + vpshufb $t3,@X[2],@X[2] + vmovdqu 0x50($inp),@X[5] + vpshufb $t3,@X[3],@X[3] + vmovdqu 0x60($inp),@X[6] + vpshufb $t3,@X[4],@X[4] + vmovdqu 0x70($inp),@X[7] + vpshufb $t3,@X[5],@X[5] + vpaddq -0x80($Tbl),@X[0],$t0 + vpshufb $t3,@X[6],@X[6] + vpaddq -0x60($Tbl),@X[1],$t1 + vpshufb $t3,@X[7],@X[7] + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + vpaddq 0x00($Tbl),@X[4],$t0 + vmovdqa $t1,0x10(%rsp) + vpaddq 0x20($Tbl),@X[5],$t1 + vmovdqa $t2,0x20(%rsp) + vpaddq 0x40($Tbl),@X[6],$t2 + vmovdqa $t3,0x30(%rsp) + vpaddq 0x60($Tbl),@X[7],$t3 + vmovdqa $t0,0x40(%rsp) + mov $A,$a1 + vmovdqa $t1,0x50(%rsp) + mov $B,$a3 + vmovdqa $t2,0x60(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x70(%rsp) + mov $E,$a0 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + add \$`16*2*$SZ`,$Tbl +___ +sub Xupdate_512_AVX () { + ( + '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] + '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] + '&vpsrlq ($t2,$t0,$sigma0[0])', + '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] + '&vpsrlq ($t3,$t0,$sigma0[2])', + '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', + '&vpxor ($t0,$t3,$t2)', + '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t1)', + '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', + '&vpxor ($t0,$t0,$t2)', + '&vpsrlq ($t3,@X[7],$sigma1[2]);', + '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) + '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', + '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) + '&vpsrlq ($t1,@X[7],$sigma1[0]);', + '&vpxor ($t3,$t3,$t2)', + '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', + '&vpxor ($t3,$t3,$t1)', + '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', + '&vpxor ($t3,$t3,$t2)', + '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) + '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + ); +} + +sub AVX_512_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body); # 52 instructions + + foreach (Xupdate_512_AVX()) { # 23 
instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<8; $j++) { + &AVX_512_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); + &jne (".Lavx_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +} +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + + add $SZ*0($ctx),$A + lea 16*$SZ($inp),$inp + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_avx + + mov $_rsp,%rsi + vzeroupper +___ +$code.=<<___ if ($win64); + movaps 16*$SZ+32(%rsp),%xmm6 + movaps 16*$SZ+48(%rsp),%xmm7 + movaps 16*$SZ+64(%rsp),%xmm8 + movaps 16*$SZ+80(%rsp),%xmm9 +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps 16*$SZ+96(%rsp),%xmm10 + movaps 16*$SZ+112(%rsp),%xmm11 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_avx: + ret +.size ${func}_avx,.-${func}_avx +___ + +if ($avx>1) {{ +###################################################################### +# AVX2+BMI code path +# +my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp +my $PUSH8=8*2*$SZ; +use integer; + +sub bodyx_00_15 () { + # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] + '&and ($a4,$e)', # f&e + '&rorx ($a0,$e,$Sigma1[2])', + '&rorx ($a2,$e,$Sigma1[1])', + + '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past + '&lea ($h,"($h,$a4)")', + '&andn ($a4,$e,$g)', # ~e&g + '&xor ($a0,$a2)', + + '&rorx ($a1,$e,$Sigma1[0])', + '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) + '&xor ($a0,$a1)', # Sigma1(e) + '&mov ($a2,$a)', + + '&rorx ($a4,$a,$Sigma0[2])', + '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) + '&xor ($a2,$b)', # a^b, b^c in next round + '&rorx ($a1,$a,$Sigma0[1])', + + '&rorx ($a0,$a,$Sigma0[0])', + '&lea ($d,"($d,$h)")', # d+=h + '&and ($a3,$a2)', # (b^c)&(a^b) + '&xor ($a1,$a4)', + + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + '&xor ($a1,$a0)', # Sigma0(a) + '&lea ($h,"($h,$a3)");'. 
# h+=Maj(a,b,c) + '&mov ($a4,$e)', # copy of f in future + + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); + # and at the finish one has to $a+=$a1 +} + +$code.=<<___; +.type ${func}_avx2,\@function,3 +.align 64 +${func}_avx2: +.Lavx2_shortcut: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp + shl \$4,%rdx # num*16 + and \$-256*$SZ,%rsp # align stack frame + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + add \$`2*$SZ*($rounds-8)`,%rsp + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %r11,$_rsp # save copy of %rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,16*$SZ+32(%rsp) + movaps %xmm7,16*$SZ+48(%rsp) + movaps %xmm8,16*$SZ+64(%rsp) + movaps %xmm9,16*$SZ+80(%rsp) +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps %xmm10,16*$SZ+96(%rsp) + movaps %xmm11,16*$SZ+112(%rsp) +___ +$code.=<<___; +.Lprologue_avx2: + + vzeroupper + sub \$-16*$SZ,$inp # inp++, size optimization + mov $SZ*0($ctx),$A + mov $inp,%r12 # borrow $T1 + mov $SZ*1($ctx),$B + cmp %rdx,$inp # $_end + mov $SZ*2($ctx),$C + cmove %rsp,%r12 # next block or random data + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + if ($SZ==4) { # SHA256 + my @X = map("%ymm$_",(0..3)); + my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); + +$code.=<<___; + vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu -16*$SZ+0($inp),%xmm0 + vmovdqu -16*$SZ+16($inp),%xmm1 + vmovdqu -16*$SZ+32($inp),%xmm2 + vmovdqu -16*$SZ+48($inp),%xmm3 + #mov $inp,$_inp # offload $inp + vinserti128 \$1,(%r12),@X[0],@X[0] + vinserti128 \$1,16(%r12),@X[1],@X[1] + vpshufb $t3,@X[0],@X[0] + vinserti128 \$1,32(%r12),@X[2],@X[2] + vpshufb $t3,@X[1],@X[1] + vinserti128 \$1,48(%r12),@X[3],@X[3] + + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + xor $a1,$a1 + vmovdqa $t1,0x20(%rsp) + lea -$PUSH8(%rsp),%rsp + mov $B,$a3 + vmovdqa $t2,0x00(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x20(%rsp) + mov $F,$a4 + sub \$-16*2*$SZ,$Tbl # size optimization + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: +___ + +sub AVX2_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 96 instructions +my $base = "+2*$PUSH8(%rsp)"; + + &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); + foreach (Xupdate_256_AVX()) { # 29 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &AVX2_256_00_47($j,\&bodyx_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &lea ($Tbl,16*2*$SZ."($Tbl)"); + &cmpb (($SZ-1)."($Tbl)",0); + &jne (".Lavx2_00_47"); + + for ($i=0; $i<16; ) { + my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; + foreach(bodyx_00_15()) { eval; } + } + } else { # SHA512 + my @X = map("%ymm$_",(0..7)); + my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); + +$code.=<<___; + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqu -16*$SZ($inp),%xmm0 + vmovdqu -16*$SZ+16($inp),%xmm1 + vmovdqu 
-16*$SZ+32($inp),%xmm2 + lea $TABLE+0x80(%rip),$Tbl # size optimization + vmovdqu -16*$SZ+48($inp),%xmm3 + vmovdqu -16*$SZ+64($inp),%xmm4 + vmovdqu -16*$SZ+80($inp),%xmm5 + vmovdqu -16*$SZ+96($inp),%xmm6 + vmovdqu -16*$SZ+112($inp),%xmm7 + #mov $inp,$_inp # offload $inp + vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 + vinserti128 \$1,(%r12),@X[0],@X[0] + vinserti128 \$1,16(%r12),@X[1],@X[1] + vpshufb $t2,@X[0],@X[0] + vinserti128 \$1,32(%r12),@X[2],@X[2] + vpshufb $t2,@X[1],@X[1] + vinserti128 \$1,48(%r12),@X[3],@X[3] + vpshufb $t2,@X[2],@X[2] + vinserti128 \$1,64(%r12),@X[4],@X[4] + vpshufb $t2,@X[3],@X[3] + vinserti128 \$1,80(%r12),@X[5],@X[5] + vpshufb $t2,@X[4],@X[4] + vinserti128 \$1,96(%r12),@X[6],@X[6] + vpshufb $t2,@X[5],@X[5] + vinserti128 \$1,112(%r12),@X[7],@X[7] + + vpaddq -0x80($Tbl),@X[0],$t0 + vpshufb $t2,@X[6],@X[6] + vpaddq -0x60($Tbl),@X[1],$t1 + vpshufb $t2,@X[7],@X[7] + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + vpaddq 0x00($Tbl),@X[4],$t0 + vmovdqa $t1,0x20(%rsp) + vpaddq 0x20($Tbl),@X[5],$t1 + vmovdqa $t2,0x40(%rsp) + vpaddq 0x40($Tbl),@X[6],$t2 + vmovdqa $t3,0x60(%rsp) + lea -$PUSH8(%rsp),%rsp + vpaddq 0x60($Tbl),@X[7],$t3 + vmovdqa $t0,0x00(%rsp) + xor $a1,$a1 + vmovdqa $t1,0x20(%rsp) + mov $B,$a3 + vmovdqa $t2,0x40(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x60(%rsp) + mov $F,$a4 + add \$16*2*$SZ,$Tbl + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: +___ + +sub AVX2_512_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body); # 48 instructions +my $base = "+2*$PUSH8(%rsp)"; + + &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0); + foreach (Xupdate_512_AVX()) { # 23 instructions + eval; + if ($_ !~ /\;$/) { + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<8; $j++) { + &AVX2_512_00_47($j,\&bodyx_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &lea ($Tbl,16*2*$SZ."($Tbl)"); + &cmpb (($SZ-1-0x80)."($Tbl)",0); + &jne (".Lavx2_00_47"); + + for ($i=0; $i<16; ) { + my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; + foreach(bodyx_00_15()) { eval; } + } +} +$code.=<<___; + mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx + add $a1,$A + #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp + lea `2*$SZ*($rounds-8)`(%rsp),$Tbl + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + + cmp `$PUSH8+2*8`($Tbl),$inp # $_end + je .Ldone_avx2 + + xor $a1,$a1 + mov $B,$a3 + xor $C,$a3 # magic + mov $F,$a4 + jmp .Lower_avx2 +.align 16 +.Lower_avx2: +___ + for ($i=0; $i<8; ) { + my $base="+16($Tbl)"; + foreach(bodyx_00_15()) { eval; } + } +$code.=<<___; + lea -$PUSH8($Tbl),$Tbl + cmp %rsp,$Tbl + jae .Lower_avx2 + + mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx + add $a1,$A + #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp + lea `2*$SZ*($rounds-8)`(%rsp),%rsp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + lea `2*16*$SZ`($inp),$inp # inp+=2 + add $SZ*6($ctx),$G + mov $inp,%r12 + add $SZ*7($ctx),$H + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + cmove %rsp,%r12 # next block or stale data + mov $B,$SZ*1($ctx) + mov 
$C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + + jbe .Loop_avx2 + lea (%rsp),$Tbl + +.Ldone_avx2: + lea ($Tbl),%rsp + mov $_rsp,%rsi + vzeroupper +___ +$code.=<<___ if ($win64); + movaps 16*$SZ+32(%rsp),%xmm6 + movaps 16*$SZ+48(%rsp),%xmm7 + movaps 16*$SZ+64(%rsp),%xmm8 + movaps 16*$SZ+80(%rsp),%xmm9 +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps 16*$SZ+96(%rsp),%xmm10 + movaps 16*$SZ+112(%rsp),%xmm11 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_avx2: + ret +.size ${func}_avx2,.-${func}_avx2 +___ +}} +}}}}} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HanderlData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue +___ +$code.=<<___ if ($avx>1); + lea .Lavx2_shortcut(%rip),%r10 + cmp %r10,%rbx # context->RipRbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + + lea .Lepilogue(%rip),%r10 + cmp %r10,%rbx + jb .Lin_prologue # non-AVX code + + lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area + lea 512($context),%rdi # &context.Xmm6 + mov \$`$SZ==4?8:12`,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler +___ + +$code.=<<___ if ($SZ==4 && $shaext); +.type shaext_handler,\@abi-omnipotent +.align 16 +shaext_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lprologue_shaext(%rip),%r10 + cmp %r10,%rbx # 
context->Rip<.Lprologue + jb .Lin_prologue + + lea .Lepilogue_shaext(%rip),%r10 + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lin_prologue + + lea -8-5*16(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$10,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lin_prologue +.size shaext_handler,.-shaext_handler +___ + +$code.=<<___; +.section .pdata +.align 4 + .rva .LSEH_begin_$func + .rva .LSEH_end_$func + .rva .LSEH_info_$func +___ +$code.=<<___ if ($SZ==4 && $shaext); + .rva .LSEH_begin_${func}_shaext + .rva .LSEH_end_${func}_shaext + .rva .LSEH_info_${func}_shaext +___ +$code.=<<___ if ($SZ==4); + .rva .LSEH_begin_${func}_ssse3 + .rva .LSEH_end_${func}_ssse3 + .rva .LSEH_info_${func}_ssse3 +___ +$code.=<<___ if ($avx && $SZ==8); + .rva .LSEH_begin_${func}_xop + .rva .LSEH_end_${func}_xop + .rva .LSEH_info_${func}_xop +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_${func}_avx + .rva .LSEH_end_${func}_avx + .rva .LSEH_info_${func}_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_${func}_avx2 + .rva .LSEH_end_${func}_avx2 + .rva .LSEH_info_${func}_avx2 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_$func: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue,.Lepilogue # HandlerData[] +___ +$code.=<<___ if ($SZ==4 && $shaext); +.LSEH_info_${func}_shaext: + .byte 9,0,0,0 + .rva shaext_handler +___ +$code.=<<___ if ($SZ==4); +.LSEH_info_${func}_ssse3: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] +___ +$code.=<<___ if ($avx && $SZ==8); +.LSEH_info_${func}_xop: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_${func}_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] +___ +$code.=<<___ if ($avx>1); +.LSEH_info_${func}_avx2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] +___ +} + +sub sha256op38 { + my $instr = shift; + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { + my @opcode=(0x0f,0x38); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; + + print $_,"\n"; +} +close STDOUT; diff --git a/openssl-1.1.0h/crypto/sha/asm/sha512p8-ppc.pl b/openssl-1.1.0h/crypto/sha/asm/sha512p8-ppc.pl new file mode 100755 index 0000000..4d3d3b2 --- /dev/null +++ b/openssl-1.1.0h/crypto/sha/asm/sha512p8-ppc.pl @@ -0,0 +1,431 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA256/512 for PowerISA v2.07. 
+# +# Accurate performance measurements are problematic, because it's +# always virtualized setup with possibly throttled processor. +# Relative comparison is therefore more informative. This module is +# ~60% faster than integer-only sha512-ppc.pl. To anchor to something +# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than +# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than +# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting +# result is degree of computational resources' utilization. POWER8 is +# "massively multi-threaded chip" and difference between single- and +# maximum multi-process benchmark results tells that utlization is +# whooping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and +# for sha1-ppc.pl - 73%. 100% means that multi-process result equals +# to single-process one, given that all threads end up on the same +# physical core. + +$flavour=shift; +$output =shift; + +if ($flavour =~ /64/) { + $SIZE_T=8; + $LRSAVE=2*$SIZE_T; + $STU="stdu"; + $POP="ld"; + $PUSH="std"; +} elsif ($flavour =~ /32/) { + $SIZE_T=4; + $LRSAVE=$SIZE_T; + $STU="stwu"; + $POP="lwz"; + $PUSH="stw"; +} else { die "nonsense $flavour"; } + +$LENDIAN=($flavour=~/le/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; + +if ($output =~ /512/) { + $bits=512; + $SZ=8; + $sz="d"; + $rounds=80; +} else { + $bits=256; + $SZ=4; + $sz="w"; + $rounds=64; +} + +$func="sha${bits}_block_p8"; +$FRAME=8*$SIZE_T; + +$sp ="r1"; +$toc="r2"; +$ctx="r3"; +$inp="r4"; +$num="r5"; +$Tbl="r6"; +$idx="r7"; +$lrsave="r8"; +$offload="r11"; +$vrsave="r12"; +($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31)); + $x00=0 if ($flavour =~ /osx/); + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7)); +@X=map("v$_",(8..23)); +($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31)); + +sub ROUND { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)%16; + +$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1)); + lvx_u @X[$i+1],0,$inp ; load X[i] in advance + addi $inp,$inp,16 +___ +$code.=<<___ if ($i<16 && ($i%(16/$SZ))); + vsldoi @X[$i],@X[$i-1],@X[$i-1],$SZ +___ +$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0); + vperm @X[$i],@X[$i],@X[$i],$lemask +___ +$code.=<<___; + `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)` + vsel $Func,$g,$f,$e ; Ch(e,f,g) + vshasigma${sz} $S1,$e,1,15 ; Sigma1(e) + vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i] + vshasigma${sz} $S0,$a,1,0 ; Sigma0(a) + `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)` + vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g) + vxor $Func,$a,$b + `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)` + vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e) + vsel $Func,$b,$c,$Func ; Maj(a,b,c) + vaddu${sz}m $g,$g,$Ki ; future h+=K[i] + vaddu${sz}m $d,$d,$h ; d+=h + vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c) + `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)` + lvx $Ki,$idx,$Tbl ; load next K[i] + addi $idx,$idx,16 + vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c) + `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)` +___ +} + +$code=<<___; +.machine "any" +.text + +.globl $func +.align 6 +$func: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + mflr $lrsave + li r10,`$FRAME+8*16+15` + li r11,`$FRAME+8*16+31` + stvx v20,r10,$sp # ABI says so + addi r10,r10,32 + mfspr $vrsave,256 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + 
addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + li r11,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + mtspr 256,r11 + + bl LPICmeup + addi $offload,$sp,$FRAME+15 +___ +$code.=<<___ if ($LENDIAN); + li $idx,8 + lvsl $lemask,0,$idx + vspltisb $Ki,0x0f + vxor $lemask,$lemask,$Ki +___ +$code.=<<___ if ($SZ==4); + lvx_4w $A,$x00,$ctx + lvx_4w $E,$x10,$ctx + vsldoi $B,$A,$A,4 # unpack + vsldoi $C,$A,$A,8 + vsldoi $D,$A,$A,12 + vsldoi $F,$E,$E,4 + vsldoi $G,$E,$E,8 + vsldoi $H,$E,$E,12 +___ +$code.=<<___ if ($SZ==8); + lvx_u $A,$x00,$ctx + lvx_u $C,$x10,$ctx + lvx_u $E,$x20,$ctx + vsldoi $B,$A,$A,8 # unpack + lvx_u $G,$x30,$ctx + vsldoi $D,$C,$C,8 + vsldoi $F,$E,$E,8 + vsldoi $H,$G,$G,8 +___ +$code.=<<___; + li r0,`($rounds-16)/16` # inner loop counter + b Loop +.align 5 +Loop: + lvx $Ki,$x00,$Tbl + li $idx,16 + lvx_u @X[0],0,$inp + addi $inp,$inp,16 + stvx $A,$x00,$offload # offload $A-$H + stvx $B,$x10,$offload + stvx $C,$x20,$offload + stvx $D,$x30,$offload + stvx $E,$x40,$offload + stvx $F,$x50,$offload + stvx $G,$x60,$offload + stvx $H,$x70,$offload + vaddu${sz}m $H,$H,$Ki # h+K[i] + lvx $Ki,$idx,$Tbl + addi $idx,$idx,16 +___ +for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mtctr r0 + b L16_xx +.align 5 +L16_xx: +___ +for (;$i<32;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bdnz L16_xx + + lvx @X[2],$x00,$offload + subic. 
$num,$num,1 + lvx @X[3],$x10,$offload + vaddu${sz}m $A,$A,@X[2] + lvx @X[4],$x20,$offload + vaddu${sz}m $B,$B,@X[3] + lvx @X[5],$x30,$offload + vaddu${sz}m $C,$C,@X[4] + lvx @X[6],$x40,$offload + vaddu${sz}m $D,$D,@X[5] + lvx @X[7],$x50,$offload + vaddu${sz}m $E,$E,@X[6] + lvx @X[8],$x60,$offload + vaddu${sz}m $F,$F,@X[7] + lvx @X[9],$x70,$offload + vaddu${sz}m $G,$G,@X[8] + vaddu${sz}m $H,$H,@X[9] + bne Loop +___ +$code.=<<___ if ($SZ==4); + lvx @X[0],$idx,$Tbl + addi $idx,$idx,16 + vperm $A,$A,$B,$Ki # pack the answer + lvx @X[1],$idx,$Tbl + vperm $E,$E,$F,$Ki + vperm $A,$A,$C,@X[0] + vperm $E,$E,$G,@X[0] + vperm $A,$A,$D,@X[1] + vperm $E,$E,$H,@X[1] + stvx_4w $A,$x00,$ctx + stvx_4w $E,$x10,$ctx +___ +$code.=<<___ if ($SZ==8); + vperm $A,$A,$B,$Ki # pack the answer + vperm $C,$C,$D,$Ki + vperm $E,$E,$F,$Ki + vperm $G,$G,$H,$Ki + stvx_u $A,$x00,$ctx + stvx_u $C,$x10,$ctx + stvx_u $E,$x20,$ctx + stvx_u $G,$x30,$ctx +___ +$code.=<<___; + li r10,`$FRAME+8*16+15` + mtlr $lrsave + li r11,`$FRAME+8*16+31` + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,4,1,0x80,6,3,0 + .long 0 +.size $func,.-$func +___ + +# Ugly hack here, because PPC assembler syntax seem to vary too +# much from platforms to platform... +$code.=<<___; +.align 6 +LPICmeup: + mflr r0 + bcl 20,31,\$+4 + mflr $Tbl ; vvvvvv "distance" between . 
and 1st data entry + addi $Tbl,$Tbl,`64-8` + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` +___ + +if ($SZ==8) { + local *table = sub { + foreach(@_) { $code.=".quad $_,$_\n"; } + }; + table( + "0x428a2f98d728ae22","0x7137449123ef65cd", + "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc", + "0x3956c25bf348b538","0x59f111f1b605d019", + "0x923f82a4af194f9b","0xab1c5ed5da6d8118", + "0xd807aa98a3030242","0x12835b0145706fbe", + "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2", + "0x72be5d74f27b896f","0x80deb1fe3b1696b1", + "0x9bdc06a725c71235","0xc19bf174cf692694", + "0xe49b69c19ef14ad2","0xefbe4786384f25e3", + "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65", + "0x2de92c6f592b0275","0x4a7484aa6ea6e483", + "0x5cb0a9dcbd41fbd4","0x76f988da831153b5", + "0x983e5152ee66dfab","0xa831c66d2db43210", + "0xb00327c898fb213f","0xbf597fc7beef0ee4", + "0xc6e00bf33da88fc2","0xd5a79147930aa725", + "0x06ca6351e003826f","0x142929670a0e6e70", + "0x27b70a8546d22ffc","0x2e1b21385c26c926", + "0x4d2c6dfc5ac42aed","0x53380d139d95b3df", + "0x650a73548baf63de","0x766a0abb3c77b2a8", + "0x81c2c92e47edaee6","0x92722c851482353b", + "0xa2bfe8a14cf10364","0xa81a664bbc423001", + "0xc24b8b70d0f89791","0xc76c51a30654be30", + "0xd192e819d6ef5218","0xd69906245565a910", + "0xf40e35855771202a","0x106aa07032bbd1b8", + "0x19a4c116b8d2d0c8","0x1e376c085141ab53", + "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8", + "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb", + "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3", + "0x748f82ee5defb2fc","0x78a5636f43172f60", + "0x84c87814a1f0ab72","0x8cc702081a6439ec", + "0x90befffa23631e28","0xa4506cebde82bde9", + "0xbef9a3f7b2c67915","0xc67178f2e372532b", + "0xca273eceea26619c","0xd186b8c721c0c207", + "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178", + "0x06f067aa72176fba","0x0a637dc5a2c898a6", + "0x113f9804bef90dae","0x1b710b35131c471b", + "0x28db77f523047d84","0x32caab7b40c72493", + "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c", + "0x4cc5d4becb3e42b6","0x597f299cfc657e2a", + "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0"); +$code.=<<___ if (!$LENDIAN); +.quad 0x0001020304050607,0x1011121314151617 +___ +$code.=<<___ if ($LENDIAN); # quad-swapped +.quad 0x1011121314151617,0x0001020304050607 +___ +} else { + local *table = sub { + foreach(@_) { $code.=".long $_,$_,$_,$_\n"; } + }; + table( + "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5", + "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5", + "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3", + "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174", + "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc", + "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da", + "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7", + "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967", + "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13", + "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85", + "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3", + "0xd192e819","0xd6990624","0xf40e3585","0x106aa070", + "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5", + "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3", + "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208", + "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0"); +$code.=<<___ if (!$LENDIAN); +.long 0x00010203,0x10111213,0x10111213,0x10111213 +.long 0x00010203,0x04050607,0x10111213,0x10111213 +.long 0x00010203,0x04050607,0x08090a0b,0x10111213 +___ +$code.=<<___ if ($LENDIAN); # word-swapped +.long 0x10111213,0x10111213,0x10111213,0x00010203 +.long 0x10111213,0x10111213,0x04050607,0x00010203 +.long 
0x10111213,0x08090a0b,0x04050607,0x00010203 +___ +} +$code.=<<___; +.asciz "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; -- cgit v1.2.3