From aa4d426b4d3527d7e166df1a05058c9a4a0f6683 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Fri, 30 Apr 2021 00:33:56 +0200 Subject: initial/final commit --- openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl | 670 +++++++++++++++++++++++++ 1 file changed, 670 insertions(+) create mode 100755 openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl (limited to 'openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl') diff --git a/openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl b/openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl new file mode 100755 index 0000000..f0598cb --- /dev/null +++ b/openssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl @@ -0,0 +1,670 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for for PowerISA v2.07. +# +# July 2014 +# +# Accurate performance measurements are problematic, because it's +# always virtualized setup with possibly throttled processor. +# Relative comparison is therefore more informative. This initial +# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x +# faster than "4-bit" integer-only compiler-generated 64-bit code. +# "Initial version" means that there is room for futher improvement. + +# May 2016 +# +# 2x aggregated reduction improves performance by 50% (resulting +# performance on POWER8 is 1 cycle per processed byte), and 4x +# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb). + +$flavour=shift; +$output =shift; + +if ($flavour =~ /64/) { + $SIZE_T=8; + $LRSAVE=2*$SIZE_T; + $STU="stdu"; + $POP="ld"; + $PUSH="std"; + $UCMP="cmpld"; + $SHRI="srdi"; +} elsif ($flavour =~ /32/) { + $SIZE_T=4; + $LRSAVE=$SIZE_T; + $STU="stwu"; + $POP="lwz"; + $PUSH="stw"; + $UCMP="cmplw"; + $SHRI="srwi"; +} else { die "nonsense $flavour"; } + +$sp="r1"; +$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; + +my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block + +my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3)); +my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12)); +my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19)); +my $vrsave="r12"; + +$code=<<___; +.machine "any" + +.text + +.globl .gcm_init_p8 +.align 5 +.gcm_init_p8: + li r0,-4096 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $H,0,r4 # load H + + vspltisb $xC2,-16 # 0xf0 + vspltisb $t0,1 # one + vaddubm $xC2,$xC2,$xC2 # 0xe0 + vxor $zero,$zero,$zero + vor $xC2,$xC2,$t0 # 0xe1 + vsldoi $xC2,$xC2,$zero,15 # 0xe1... + vsldoi $t1,$zero,$t0,1 # ...1 + vaddubm $xC2,$xC2,$xC2 # 0xc2... + vspltisb $t2,7 + vor $xC2,$xC2,$t1 # 0xc2....01 + vspltb $t1,$H,0 # most significant byte + vsl $H,$H,$t0 # H<<=1 + vsrab $t1,$t1,$t2 # broadcast carry bit + vand $t1,$t1,$xC2 + vxor $IN,$H,$t1 # twisted H + + vsldoi $H,$IN,$IN,8 # twist even more ... + vsldoi $xC2,$zero,$xC2,8 # 0xc2.0 + vsldoi $Hl,$zero,$H,8 # ... and split + vsldoi $Hh,$H,$zero,8 + + stvx_u $xC2,0,r3 # save pre-computed table + stvx_u $Hl,r8,r3 + li r8,0x40 + stvx_u $H, r9,r3 + li r9,0x50 + stvx_u $Hh,r10,r3 + li r10,0x60 + + vpmsumd $Xl,$IN,$Hl # H.lo·H.lo + vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi + vpmsumd $Xh,$IN,$Hh # H.hi·H.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $IN1,$Xl,$t1 + + vsldoi $H2,$IN1,$IN1,8 + vsldoi $H2l,$zero,$H2,8 + vsldoi $H2h,$H2,$zero,8 + + stvx_u $H2l,r8,r3 # save H^2 + li r8,0x70 + stvx_u $H2,r9,r3 + li r9,0x80 + stvx_u $H2h,r10,r3 + li r10,0x90 +___ +{ +my ($t4,$t5,$t6) = ($Hl,$H,$Hh); +$code.=<<___; + vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo + vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo + vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi + vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi + vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi + vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vsldoi $t4,$Xm1,$zero,8 + vsldoi $t5,$zero,$Xm1,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + vxor $Xl1,$Xl1,$t4 + vxor $Xh1,$Xh1,$t5 + + vsldoi $Xl,$Xl,$Xl,8 + vsldoi $Xl1,$Xl1,$Xl1,8 + vxor $Xl,$Xl,$t2 + vxor $Xl1,$Xl1,$t6 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vpmsumd $Xl1,$Xl1,$xC2 + vxor $t1,$t1,$Xh + vxor $t5,$t5,$Xh1 + vxor $Xl,$Xl,$t1 + vxor $Xl1,$Xl1,$t5 + + vsldoi $H,$Xl,$Xl,8 + vsldoi $H2,$Xl1,$Xl1,8 + vsldoi $Hl,$zero,$H,8 + vsldoi $Hh,$H,$zero,8 + vsldoi $H2l,$zero,$H2,8 + vsldoi $H2h,$H2,$zero,8 + + stvx_u $Hl,r8,r3 # save H^3 + li r8,0xa0 + stvx_u $H,r9,r3 + li r9,0xb0 + stvx_u $Hh,r10,r3 + li r10,0xc0 + stvx_u $H2l,r8,r3 # save H^4 + stvx_u $H2,r9,r3 + stvx_u $H2h,r10,r3 + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_init_p8,.-.gcm_init_p8 +___ +} +$code.=<<___; +.globl .gcm_gmult_p8 +.align 5 +.gcm_gmult_p8: + lis r0,0xfff8 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $IN,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $IN,$IN,$IN,$lemask + vxor $zero,$zero,$zero + + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $Xl,$Xl,$t1 + + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_gmult_p8,.-.gcm_gmult_p8 + +.globl .gcm_ghash_p8 +.align 5 +.gcm_ghash_p8: + li r0,-4096 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $Xl,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + li r8,0x40 + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + li r9,0x50 + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + li r10,0x60 + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $Xl,$Xl,$Xl,$lemask + vxor $zero,$zero,$zero + + ${UCMP}i $len,64 + bge Lgcm_ghash_p8_4x + + lvx_u $IN,0,$inp + addi $inp,$inp,16 + subic. $len,$len,16 + le?vperm $IN,$IN,$IN,$lemask + vxor $IN,$IN,$Xl + beq Lshort + + lvx_u $H2l,r8,$Htbl # load H^2 + li r8,16 + lvx_u $H2, r9,$Htbl + add r9,$inp,$len # end of input + lvx_u $H2h,r10,$Htbl + be?b Loop_2x + +.align 5 +Loop_2x: + lvx_u $IN1,0,$inp + le?vperm $IN1,$IN1,$IN1,$lemask + + subic $len,$len,32 + vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo + vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo + subfe r0,r0,r0 # borrow?-1:0 + vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi + vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi + and r0,r0,$len + vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi + vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi + add $inp,$inp,r0 + + vxor $Xl,$Xl,$Xl1 + vxor $Xm,$Xm,$Xm1 + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xh,$Xh,$Xh1 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + lvx_u $IN,r8,$inp + addi $inp,$inp,32 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + le?vperm $IN,$IN,$IN,$lemask + vxor $t1,$t1,$Xh + vxor $IN,$IN,$t1 + vxor $IN,$IN,$Xl + $UCMP r9,$inp + bgt Loop_2x # done yet? + + cmplwi $len,0 + bne Leven + +Lshort: + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + +Leven: + vxor $Xl,$Xl,$t1 + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,4,0 + .long 0 +___ +{ +my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h, + $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31)); +my $IN0=$IN; +my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h); + +$code.=<<___; +.align 5 +.gcm_ghash_p8_4x: +Lgcm_ghash_p8_4x: + $STU $sp,-$FRAME($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + li r10,0x60 + stvx v31,r11,$sp + li r0,-1 + stw $vrsave,`$FRAME-4`($sp) # save vrsave + mtspr 256,r0 # preserve all AltiVec registers + + lvsl $t0,0,r8 # 0x0001..0e0f + #lvx_u $H2l,r8,$Htbl # load H^2 + li r8,0x70 + lvx_u $H2, r9,$Htbl + li r9,0x80 + vspltisb $t1,8 # 0x0808..0808 + #lvx_u $H2h,r10,$Htbl + li r10,0x90 + lvx_u $H3l,r8,$Htbl # load H^3 + li r8,0xa0 + lvx_u $H3, r9,$Htbl + li r9,0xb0 + lvx_u $H3h,r10,$Htbl + li r10,0xc0 + lvx_u $H4l,r8,$Htbl # load H^4 + li r8,0x10 + lvx_u $H4, r9,$Htbl + li r9,0x20 + lvx_u $H4h,r10,$Htbl + li r10,0x30 + + vsldoi $t2,$zero,$t1,8 # 0x0000..0808 + vaddubm $hiperm,$t0,$t2 # 0x0001..1617 + vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f + + $SHRI $len,$len,4 # this allows to use sign bit + # as carry + lvx_u $IN0,0,$inp # load input + lvx_u $IN1,r8,$inp + subic. $len,$len,8 + lvx_u $IN2,r9,$inp + lvx_u $IN3,r10,$inp + addi $inp,$inp,0x40 + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + le?vperm $IN3,$IN3,$IN3,$lemask + + vxor $Xh,$IN0,$Xl + + vpmsumd $Xl1,$IN1,$H3l + vpmsumd $Xm1,$IN1,$H3 + vpmsumd $Xh1,$IN1,$H3h + + vperm $H21l,$H2,$H,$hiperm + vperm $t0,$IN2,$IN3,$loperm + vperm $H21h,$H2,$H,$loperm + vperm $t1,$IN2,$IN3,$hiperm + vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo + vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo + vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi + vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi + + vxor $Xm2,$Xm2,$Xm1 + vxor $Xl3,$Xl3,$Xl1 + vxor $Xm3,$Xm3,$Xm2 + vxor $Xh3,$Xh3,$Xh1 + + blt Ltail_4x + +Loop_4x: + lvx_u $IN0,0,$inp + lvx_u $IN1,r8,$inp + subic. $len,$len,4 + lvx_u $IN2,r9,$inp + lvx_u $IN3,r10,$inp + addi $inp,$inp,0x40 + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + le?vperm $IN3,$IN3,$IN3,$lemask + le?vperm $IN0,$IN0,$IN0,$lemask + + vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo + vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi + vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi + vpmsumd $Xl1,$IN1,$H3l + vpmsumd $Xm1,$IN1,$H3 + vpmsumd $Xh1,$IN1,$H3h + + vxor $Xl,$Xl,$Xl3 + vxor $Xm,$Xm,$Xm3 + vxor $Xh,$Xh,$Xh3 + vperm $t0,$IN2,$IN3,$loperm + vperm $t1,$IN2,$IN3,$hiperm + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo + vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi + vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi + vpmsumd $Xl,$Xl,$xC2 + + vxor $Xl3,$Xl3,$Xl1 + vxor $Xh3,$Xh3,$Xh1 + vxor $Xh,$Xh,$IN0 + vxor $Xm2,$Xm2,$Xm1 + vxor $Xh,$Xh,$t1 + vxor $Xm3,$Xm3,$Xm2 + vxor $Xh,$Xh,$Xl + bge Loop_4x + +Ltail_4x: + vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo + vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi + vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi + + vxor $Xl,$Xl,$Xl3 + vxor $Xm,$Xm,$Xm3 + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xh,$Xh,$Xh3 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $Xl,$Xl,$t1 + + addic. $len,$len,4 + beq Ldone_4x + + lvx_u $IN0,0,$inp + ${UCMP}i $len,2 + li $len,-4 + blt Lone + lvx_u $IN1,r8,$inp + beq Ltwo + +Lthree: + lvx_u $IN2,r9,$inp + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + + vxor $Xh,$IN0,$Xl + vmr $H4l,$H3l + vmr $H4, $H3 + vmr $H4h,$H3h + + vperm $t0,$IN1,$IN2,$loperm + vperm $t1,$IN1,$IN2,$hiperm + vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo + vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi + vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo + vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi + + vxor $Xm3,$Xm3,$Xm2 + b Ltail_4x + +.align 4 +Ltwo: + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + + vxor $Xh,$IN0,$Xl + vperm $t0,$zero,$IN1,$loperm + vperm $t1,$zero,$IN1,$hiperm + + vsldoi $H4l,$zero,$H2,8 + vmr $H4, $H2 + vsldoi $H4h,$H2,$zero,8 + + vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo + vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi + vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi + + b Ltail_4x + +.align 4 +Lone: + le?vperm $IN0,$IN0,$IN0,$lemask + + vsldoi $H4l,$zero,$H,8 + vmr $H4, $H + vsldoi $H4h,$H,$zero,8 + + vxor $Xh,$IN0,$Xl + vxor $Xl3,$Xl3,$Xl3 + vxor $Xm3,$Xm3,$Xm3 + vxor $Xh3,$Xh3,$Xh3 + + b Ltail_4x + +Ldone_4x: + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mtspr 256,$vrsave + lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,0x04,0,0x80,0,4,0 + .long 0 +___ +} +$code.=<<___; +.size .gcm_ghash_p8,.-.gcm_ghash_p8 + +.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by " +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + if ($flavour =~ /le$/o) { # little-endian + s/le\?//o or + s/be\?/#be#/o; + } else { + s/le\?/#le#/o or + s/be\?//o; + } + print $_,"\n"; +} + +close STDOUT; # enforce flush -- cgit v1.2.3