From aa4d426b4d3527d7e166df1a05058c9a4a0f6683 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Fri, 30 Apr 2021 00:33:56 +0200 Subject: initial/final commit --- openssl-1.1.0h/crypto/bn/asm/c64xplus-gf2m.pl | 160 ++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 openssl-1.1.0h/crypto/bn/asm/c64xplus-gf2m.pl (limited to 'openssl-1.1.0h/crypto/bn/asm/c64xplus-gf2m.pl') diff --git a/openssl-1.1.0h/crypto/bn/asm/c64xplus-gf2m.pl b/openssl-1.1.0h/crypto/bn/asm/c64xplus-gf2m.pl new file mode 100644 index 0000000..c0e5400 --- /dev/null +++ b/openssl-1.1.0h/crypto/bn/asm/c64xplus-gf2m.pl @@ -0,0 +1,160 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# February 2012 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication +# used in bn_gf2m.c. It's kind of low-hanging mechanical port from +# C for the time being... The subroutine runs in 37 cycles, which is +# 4.5x faster than compiler-generated code. Though comparison is +# totally unfair, because this module utilizes Galois Field Multiply +# instruction. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8"); # argument vector + +($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20)); +($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20)); +($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7"); +($A,$B)=($Alo,$B_1); +$xFF="B1"; + +sub mul_1x1_upper { +my ($A,$B)=@_; +$code.=<<___; + EXTU $B,8,24,$B_2 ; smash $B to 4 bytes +|| AND $B,$xFF,$B_0 +|| SHRU $B,24,$B_3 + SHRU $A,16, $Ahi ; smash $A to two halfwords +|| EXTU $A,16,16,$Alo + + XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits muliplication +|| XORMPY $Ahi,$B_2,$Ahix2 +|| EXTU $B,16,24,$B_1 + XORMPY $Alo,$B_0,$Alox0 +|| XORMPY $Ahi,$B_0,$Ahix0 + XORMPY $Alo,$B_3,$Alox3 +|| XORMPY $Ahi,$B_3,$Ahix3 + XORMPY $Alo,$B_1,$Alox1 +|| XORMPY $Ahi,$B_1,$Ahix1 +___ +} +sub mul_1x1_merged { +my ($OUTlo,$OUThi,$A,$B)=@_; +$code.=<<___; + EXTU $B,8,24,$B_2 ; smash $B to 4 bytes +|| AND $B,$xFF,$B_0 +|| SHRU $B,24,$B_3 + SHRU $A,16, $Ahi ; smash $A to two halfwords +|| EXTU $A,16,16,$Alo + + XOR $Ahix0,$Alox2,$Ahix0 +|| MV $Ahix2,$OUThi +|| XORMPY $Alo,$B_2,$Alox2 + XORMPY $Ahi,$B_2,$Ahix2 +|| EXTU $B,16,24,$B_1 +|| XORMPY $Alo,$B_0,A1 ; $Alox0 + XOR $Ahix1,$Alox3,$Ahix1 +|| SHL $Ahix0,16,$OUTlo +|| SHRU $Ahix0,16,$Ahix0 + XOR $Alox0,$OUTlo,$OUTlo +|| XOR $Ahix0,$OUThi,$OUThi +|| XORMPY $Ahi,$B_0,$Ahix0 +|| XORMPY $Alo,$B_3,$Alox3 +|| SHL $Alox1,8,$Alox1 +|| SHL $Ahix3,8,$Ahix3 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix3,$OUThi,$OUThi +|| XORMPY $Ahi,$B_3,$Ahix3 +|| SHL $Ahix1,24,$Alox1 +|| SHRU $Ahix1,8, $Ahix1 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix1,$OUThi,$OUThi +|| XORMPY $Alo,$B_1,$Alox1 +|| XORMPY $Ahi,$B_1,$Ahix1 +|| MV A1,$Alox0 +___ +} +sub mul_1x1_lower { +my ($OUTlo,$OUThi)=@_; +$code.=<<___; + ;NOP + XOR $Ahix0,$Alox2,$Ahix0 +|| MV $Ahix2,$OUThi + NOP + XOR $Ahix1,$Alox3,$Ahix1 +|| SHL $Ahix0,16,$OUTlo +|| SHRU $Ahix0,16,$Ahix0 + XOR $Alox0,$OUTlo,$OUTlo +|| XOR $Ahix0,$OUThi,$OUThi +|| SHL $Alox1,8,$Alox1 +|| SHL $Ahix3,8,$Ahix3 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix3,$OUThi,$OUThi +|| SHL $Ahix1,24,$Alox1 +|| SHRU $Ahix1,8, $Ahix1 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix1,$OUThi,$OUThi +___ +} +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2 + .endif + + .global _bn_GF2m_mul_2x2 +_bn_GF2m_mul_2x2: + .asmfunc + MVK 0xFF,$xFF +___ + &mul_1x1_upper($a0,$b0); # a0·b0 +$code.=<<___; +|| MV $b1,$B + MV $a1,$A +___ + &mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1 +$code.=<<___; +|| XOR $b0,$b1,$B + XOR $a0,$a1,$A +___ + &mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1) +$code.=<<___; + XOR A28,A31,A29 +|| XOR B28,B31,B29 ; a0·b0+a1·b1 +___ + &mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1) +$code.=<<___; +|| BNOP B3 + XOR A29,A30,A30 +|| XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1 + XOR B28,A30,A30 +|| STW A28,*${rp}[0] + XOR B30,A31,A31 +|| STW A30,*${rp}[1] + STW A31,*${rp}[2] + STW B31,*${rp}[3] + .endasmfunc +___ + +print $code; +close STDOUT; -- cgit v1.2.3