From aa4d426b4d3527d7e166df1a05058c9a4a0f6683 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Fri, 30 Apr 2021 00:33:56 +0200 Subject: initial/final commit --- .../crypto/poly1305/asm/poly1305-s390x.pl | 227 +++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100755 openssl-1.1.0h/crypto/poly1305/asm/poly1305-s390x.pl (limited to 'openssl-1.1.0h/crypto/poly1305/asm/poly1305-s390x.pl') diff --git a/openssl-1.1.0h/crypto/poly1305/asm/poly1305-s390x.pl b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-s390x.pl new file mode 100755 index 0000000..82d757d --- /dev/null +++ b/openssl-1.1.0h/crypto/poly1305/asm/poly1305-s390x.pl @@ -0,0 +1,227 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for s390x. +# +# June 2015 +# +# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated +# code. For older compiler improvement coefficient is >3x, because +# then base 2^64 and base 2^32 implementations are compared. +# +# On side note, z13 enables vector base 2^26 implementation... + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$sp="%r15"; + +my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5)); + +$code.=<<___; +.text + +.globl poly1305_init +.type poly1305_init,\@function +.align 16 +poly1305_init: + lghi %r0,0 + lghi %r1,-1 + stg %r0,0($ctx) # zero hash value + stg %r0,8($ctx) + stg %r0,16($ctx) + + cl${g}r $inp,%r0 + je .Lno_key + + lrvg %r4,0($inp) # load little-endian key + lrvg %r5,8($inp) + + nihl %r1,0xffc0 # 0xffffffc0ffffffff + srlg %r0,%r1,4 # 0x0ffffffc0fffffff + srlg %r1,%r1,4 + nill %r1,0xfffc # 0x0ffffffc0ffffffc + + ngr %r4,%r0 + ngr %r5,%r1 + + stg %r4,32($ctx) + stg %r5,40($ctx) + +.Lno_key: + lghi %r2,0 + br %r14 +.size poly1305_init,.-poly1305_init +___ +{ +my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14)); +my ($r0,$r1,$s1) = map("%r$_",(0..2)); + +$code.=<<___; +.globl poly1305_blocks +.type poly1305_blocks,\@function +.align 16 +poly1305_blocks: + srl${g} $len,4 # fixed-up in 64-bit build + lghi %r0,0 + cl${g}r $len,%r0 + je .Lno_data + + stm${g} %r6,%r14,`6*$SIZE_T`($sp) + + llgfr $padbit,$padbit # clear upper half, much needed with + # non-64-bit ABI + lg $r0,32($ctx) # load key + lg $r1,40($ctx) + + lg $h0,0($ctx) # load hash value + lg $h1,8($ctx) + lg $h2,16($ctx) + + st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx + srlg $s1,$r1,2 + algr $s1,$r1 # s1 = r1 + r1>>2 + j .Loop + +.align 16 +.Loop: + lrvg $d0lo,0($inp) # load little-endian input + lrvg $d1lo,8($inp) + la $inp,16($inp) + + algr $d0lo,$h0 # accumulate input + alcgr $d1lo,$h1 + + lgr $h0,$d0lo + mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo + lgr $h1,$d1lo + mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo + + mlgr $t0,$r1 # h0*r1 -> $t0:$h0 + mlgr $t1,$r0 # h1*r0 -> $t1:$h1 + alcgr $h2,$padbit + + algr $d0lo,$d1lo + lgr $d1lo,$h2 + alcgr $d0hi,$d1hi + lghi $d1hi,0 + + algr $h1,$h0 + alcgr $t1,$t0 + + msgr $d1lo,$s1 # h2*s1 + msgr $h2,$r0 # h2*r0 + + algr $h1,$d1lo + alcgr $t1,$d1hi # $d1hi is zero + + algr $h1,$d0hi + alcgr $h2,$t1 + + lghi $h0,-4 # final reduction step + ngr $h0,$h2 + srlg $t0,$h2,2 + algr $h0,$t0 + lghi $t1,3 + ngr $h2,$t1 + + algr $h0,$d0lo + alcgr $h1,$d1hi # $d1hi is still zero + alcgr $h2,$d1hi # $d1hi is still zero + + brct$g $len,.Loop + + l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx + + stg $h0,0($ctx) # store hash value + stg $h1,8($ctx) + stg $h2,16($ctx) + + lm${g} %r6,%r14,`6*$SIZE_T`($sp) +.Lno_data: + br %r14 +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($mac,$nonce)=($inp,$len); +my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9)); + +$code.=<<___; +.globl poly1305_emit +.type poly1305_emit,\@function +.align 16 +poly1305_emit: + stm${g} %r6,%r9,`6*$SIZE_T`($sp) + + lg $h0,0($ctx) + lg $h1,8($ctx) + lg $h2,16($ctx) + + lghi %r0,5 + lghi %r1,0 + lgr $d0,$h0 + lgr $d1,$h1 + + algr $h0,%r0 # compare to modulus + alcgr $h1,%r1 + alcgr $h2,%r1 + + srlg $h2,$h2,2 # did it borrow/carry? + slgr %r1,$h2 # 0-$h2>>2 + lg $h2,0($nonce) # load nonce + lghi %r0,-1 + lg $ctx,8($nonce) + xgr %r0,%r1 # ~%r1 + + ngr $h0,%r1 + ngr $d0,%r0 + ngr $h1,%r1 + ngr $d1,%r0 + ogr $h0,$d0 + rllg $d0,$h2,32 # flip nonce words + ogr $h1,$d1 + rllg $d1,$ctx,32 + + algr $h0,$d0 # accumulate nonce + alcgr $h1,$d1 + + strvg $h0,0($mac) # write little-endian result + strvg $h1,8($mac) + + lm${g} %r6,%r9,`6*$SIZE_T`($sp) + br %r14 +.size poly1305_emit,.-poly1305_emit + +.string "Poly1305 for s390x, CRYPTOGAMS by " +___ +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm; + +print $code; +close STDOUT; -- cgit v1.2.3