aboutsummaryrefslogtreecommitdiff
path: root/gnu/packages/samba.scm
blob: da103ed839f652896ca25699afdb0e495318d979 (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2013, 2015, 2017, 2021 Ludovic Courtès <ludo@gnu.org>
;;; Copyright © 2015 Mark H Weaver <mhw@netris.org>
;;; Copyright © 2016, 2017, 2019, 2021 Efraim Flashner <efraim@flashner.co.il>
;;; Copyright © 2016 Adonay "adfeno" Felipe Nogueira <https://libreplanet.org/wiki/User:Adfeno> <adfeno@openmailbox.org>
;;; Copyright © 2017 Thomas Danckaert <post@thomasdanckaert.be>
;;; Copyright © 2017, 2018, 2020 Marius Bakke <mbakke@fastmail.com>
;;; Copyright © 2018–2021 Tobias Geerinckx-Rice <me@tobias.gr>
;;; Copyright © 2018 Ricardo Wurmus <rekado@elephly.net>
;;; Copyright © 2019 Rutger Helling <rhelling@mykolab.com>
;;; Copyright © 2020 Pierre Langlois <pierre.langlois@gmx.com>
;;; Copyright © 2020, 2022 Maxim Cournoyer <maxim.cournoyer@gmail.com>
;;; Copyright © 2022 Jean-Pierre De Jesus DIAZ <me@jeandudey.tech>
;;; Copyright © 2022 Guillaume Le Vaillant <glv@posteo.net>
;;; Copyright © 2022 Maxime Devos <maximedevos@telenet.be>
;;;
;;; This file is part of GNU Guix.
;;;
;;; GNU Guix is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 3 of the License, or (at
;;; your option) any later version.
;;;
;;; GNU Guix is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with GNU Guix.  If not, see <http://www.gnu.org/licenses/>.

(define-module (gnu packages samba)
  #:use-module (guix gexp)
  #:use-module (guix packages)
  #:use-module (guix download)
  #:use-module (guix git-download)
  #:use-module (guix build-system gnu)
  #:use-module (guix build-system copy)
  #:use-module ((guix licenses) #:prefix license:)
  #:use-module (guix utils)
  #:use-module (gnu packages)
  #:use-module (gnu packages acl)
  #:use-module (gnu packages admin)
  #:use-module (gnu packages autotools)
  #:use-module (gnu packages backup)
  #:use-module (gnu packages base)
  #:use-module (gnu packages check)
  #:use-module (gnu packages crypto)
  #:use-module (gnu packages cups)
  #:use-module (gnu packages databases)
  #:use-module (gnu packages docbook)
  #:use-module (gnu packages glib)
  #:use-module (gnu packages gnome)
  #:use-module (gnu packages gnupg)
  #:use-module (gnu packages kerberos)
  #:use-module (gnu packages linux)
  #:use-module (gnu packages onc-rpc)
  #:use-module (gnu packages openldap)
  #:use-module (gnu packages perl)
  #:use-module (gnu packages pkg-config)
  #:use-module (gnu packages popt)
  #:use-module (gnu packages python)
  #:use-module (gnu packages python-crypto)
  #:use-module (gnu packages python-xyz)
  #:use-module (gnu packages readline)
  #:use-module (gnu packages time)
  #:use-module (gnu packages tls)
  #:use-module (gnu packages web)
  #:use-module (gnu packages xml))

(define-public cifs-utils
  (package
    (name "cifs-utils")
    (version "7.0")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://download.samba.org/pub/linux-cifs/"
                           "cifs-utils/cifs-utils-" version ".tar.bz2"))
       (sha256 (base32
                "0qc1ph94yvg87m87xangw9dd0m5ds2q1zd2sqkzldsnkbfwamvqd"))))
    (build-system gnu-build-system)
    (native-inputs
     (list autoconf automake pkg-config
           ;; To generate the manpages.
           python-docutils)) ; rst2man
    (inputs
     (list keyutils libcap-ng linux-pam mit-krb5 samba talloc))
    (arguments
     (list #:configure-flags #~(list "--enable-man")
           #:phases
           #~(modify-phases %standard-phases
               (add-before 'bootstrap 'trigger-bootstrap
                 ;; The shipped configure script is buggy, e.g., it contains a
                 ;; unexpanded literal ‘LIBCAP_NG_PATH’ line).
                 (lambda _
                   (delete-file "configure")))
               (add-before 'configure 'set-root-sbin
                 ;; Don't try to install into "/sbin".
                 (lambda _
                   (setenv "ROOTSBINDIR" (string-append #$output "/sbin")))))))
    (synopsis "User-space utilities for Linux CIFS (Samba) mounts")
    (description "@code{cifs-utils} is a set of user-space utilities for
mounting and managing @acronym{CIFS, Common Internet File System} shares using
the Linux kernel CIFS client.")
    (home-page "https://wiki.samba.org/index.php/LinuxCIFS_utils")
    ;; cifs-utils is licensed as GPL3 or later, but 3 files contain LGPL code.
    (license license:gpl3+)))

(define-public iniparser
  (package
    (name "iniparser")
    (version "4.1")
    (source (origin
             (method git-fetch)
             (uri (git-reference
                    (url "https://github.com/ndevilla/iniparser")
                    (commit (string-append "v" version))))
             (file-name (git-file-name name version))
             (sha256
              (base32
               "0dhab6pad6wh816lr7r3jb6z273njlgw2vpw8kcfnmi7ijaqhnr5"))))
    (build-system gnu-build-system)
    (arguments
     `(#:make-flags
       (list ,(string-append "CC=" (cc-for-target)))
       #:phases
       (modify-phases %standard-phases
         (replace 'configure
           (lambda* (#:key outputs #:allow-other-keys)
             (substitute* '("Makefile" "test/Makefile")
               (("/usr/lib")
                (string-append (assoc-ref outputs "out") "/lib")))
             #t))
         (replace 'build
           (lambda* (#:key make-flags #:allow-other-keys)
             (apply invoke "make" "libiniparser.so.1"
                    make-flags)))
         (replace 'install
           (lambda* (#:key outputs #:allow-other-keys)
             (let* ((out  (assoc-ref outputs "out"))
                    (lib  (string-append out "/lib"))
                    (inc  (string-append out "/include"))
                    (doc  (string-append out "/share/doc/" ,name))
                    (html (string-append doc "/html")))
               (define (install dir)
                 (lambda (file)
                   (install-file file dir)))
               (for-each (install lib)
                         (find-files "." "^lib.*\\.so"))
               (with-directory-excursion lib
                 (symlink "libiniparser.so.1" "libiniparser.so"))
               (for-each (install inc)
                         (find-files "src" "\\.h$"))
               (for-each (install html)
                         (find-files "html" ".*"))
               (for-each (install doc)
                         '("AUTHORS" "INSTALL" "LICENSE" "README.md"))
               #t))))))
    (home-page "https://github.com/ndevilla/iniparser")
    (synopsis "Simple @file{.ini} configuration file parsing library")
    (description
     "The iniParser C library reads and writes Windows-style @file{.ini}
configuration files.  These are simple text files with a basic structure
composed of sections, properties, and values.  While not expressive, they
are easy to read, write, and modify.

The library is small, thread safe, and written in portable ANSI C with no
external dependencies.")
    (license license:x11)))

(define-public samba
  (package
    (name "samba")
    (version "4.16.8")
    (source
     ;; For updaters: the current PGP fingerprint is
     ;; 81F5E2832BD2545A1897B713AA99442FB680B620.
     (origin
       (method url-fetch)
       (uri (string-append "https://download.samba.org/pub/samba/stable/"
                           "samba-" version ".tar.gz"))
       (sha256
        (base32 "11a1vikbijaq7csg49h5ivn25gx84v6wx8z8kgsj1wmkhsf9bcmv"))))
    (build-system gnu-build-system)
    (arguments
     (list
      #:make-flags #~(list "TEST_OPTIONS=--quick") ;some tests are very long
      #:phases
      #~(modify-phases %standard-phases
          (add-before 'configure 'setup-docbook-stylesheets
            (lambda* (#:key inputs #:allow-other-keys)
              ;; Append Samba's own DTDs to XML_CATALOG_FILES
              ;; (c.f. docs-xml/build/README).
              (copy-file "docs-xml/build/catalog.xml.in"
                         "docs-xml/build/catalog.xml")
              (substitute* "docs-xml/build/catalog.xml"
                (("/@abs_top_srcdir@")
                 (string-append (getcwd) "/docs-xml")))
              ;; Honor XML_CATALOG_FILES.
              (substitute* "buildtools/wafsamba/wafsamba.py"
                (("XML_CATALOG_FILES=\"\\$\\{SAMBA_CATALOGS\\}" all)
                 (string-append all " $XML_CATALOG_FILES")))))
          (replace 'configure
            ;; Samba uses a custom configuration script that runs WAF.
            (lambda* (#:key inputs #:allow-other-keys)
              (let* ((libdir (string-append #$output "/lib")))
                (invoke "./configure"
                        "--enable-selftest"
                        "--enable-fhs"
                        (string-append "--prefix=" #$output)
                        "--sysconfdir=/etc"
                        "--localstatedir=/var"
                        ;; Install public and private libraries into
                        ;; a single directory to avoid RPATH issues.
                        (string-append "--libdir=" libdir)
                        (string-append "--with-privatelibdir=" libdir)
                        "--with-system-mitkrb5" ;#$(this-package-input "mit-krb5")
                        (string-append "--with-system-mitkdc="
                                       (search-input-file inputs "sbin/krb5kdc"))
                        "--with-experimental-mit-ad-dc"))))
          (add-before 'install 'disable-etc,var-samba-directories-setup
            (lambda _
              (substitute* "dynconfig/wscript"
                (("bld\\.INSTALL_DIR.*") "")))))
      ;; FIXME: The test suite seemingly hangs after failing to provision the
      ;; test environment.
      #:tests? #f))
    (inputs
     (list acl
           cmocka
           cups
           gamin
           dbus
           gpgme
           gnutls
           jansson
           libarchive
           libtirpc
           linux-pam
           lmdb
           mit-krb5
           openldap
           perl
           python
           popt
           readline
           tdb))
    (propagated-inputs
     ;; In Requires or Requires.private of pkg-config files.
     (list ldb talloc tevent))
    (native-inputs
     (list perl-parse-yapp
           pkg-config
           python-cryptography          ;for krb5 tests
           python-dnspython
           python-iso8601
           python-markdown
           rpcsvc-proto                 ;for 'rpcgen'
           python-pyasn1                ;for krb5 tests
           ;; For generating man pages.
           docbook-xml-4.2
           docbook-xsl-next             ;otherwise the man pages are corrupted
           libxslt
           libxml2))                    ;for XML_CATALOG_FILES
    (home-page "https://www.samba.org/")
    (synopsis
     "The standard Windows interoperability suite of programs for GNU and Unix")
    (description
     "Since 1992, Samba has provided secure, stable and fast file and print
services for all clients using the SMB/CIFS protocol, such as all versions of
DOS and Windows, OS/2, GNU/Linux and many others.

Samba is an important component to seamlessly integrate Linux/Unix Servers and
Desktops into Active Directory environments using the winbind daemon.")
    (license license:gpl3+)))

;;; FIXME: Invert inheritance relationship; the "pinned" package shouldn't be
;;; susceptible to changes in the free one.
(define-public samba/pinned
  ;; Version that rarely changes, depended on by libsoup.
  (hidden-package
   (package
     (inherit samba)
     (replacement samba/fixed)
     (version "4.15.3")
     (source
      (origin
        (method url-fetch)
        (uri (string-append "https://download.samba.org/pub/samba/stable/"
                            "samba-" version ".tar.gz"))
        (sha256
         (base32 "1nrp85aya0pbbqdqjaqcw82cnzzys16yls37hi2h6mci8d09k4si"))))
     (native-inputs
      (list perl-parse-yapp
            pkg-config
            python-cryptography         ;for krb5 tests
            python-dnspython
            python-iso8601
            python-markdown
            rpcsvc-proto                ;for 'rpcgen'
            python-pyasn1               ;for krb5 tests
            ;; For generating man pages.
            docbook-xml-4.2
            docbook-xsl
            libxslt
            libxml2)))))

(define-public samba/fixed
  (package
    (inherit samba/pinned)
    ;; This is 4.15.13, but we need to trim the store file name to have
    ;; the same length as the one we are grafting above.
    (version "4.15.A")
    (source
     (origin
       (method url-fetch)
       (uri (string-append "https://download.samba.org/pub/samba/stable/"
                           "samba-4.15.13.tar.gz"))
       (sha256
        (base32 "0s29vzn5f42vjhx6h25c7v67n14ymqxn8glqa97d0rajd99y64n4"))))))

(define-public talloc
  (package
    (name "talloc")
    (version "2.3.3")
    (source (origin
              (method url-fetch)
              (uri (string-append "https://www.samba.org/ftp/talloc/talloc-"
                                  version ".tar.gz"))
              (sha256
               (base32
                "1ala3l6v8qk2pwq97z1zdkj1isnfnrp1923srp2g22mxd0impsbb"))))
    (build-system gnu-build-system)
    (arguments
     '(#:phases
       (modify-phases %standard-phases
         (replace 'configure
           (lambda* (#:key outputs #:allow-other-keys)
             ;; talloc uses a custom configuration script that runs a Python
             ;; script called 'waf', and doesn't tolerate unknown options.
             (setenv "CONFIG_SHELL" (which "sh"))
             (let ((out (assoc-ref outputs "out")))
               (invoke "./configure"
                       (string-append "--prefix=" out))))))))
    (native-inputs
     (list which))
    (inputs
     (list python))
    (home-page "https://talloc.samba.org")
    (synopsis "Hierarchical, reference counted memory pool system")
    (description
     "Talloc is a hierarchical, reference counted memory pool system with
destructors.  It is the core memory allocator used in Samba.")
    (license license:gpl3+))) ;; The bundled "replace" library uses LGPL3.

(define-public talloc/static
  (package
    (inherit talloc)
    (name "talloc-static")
    (synopsis
     "Hierarchical, reference counted memory pool system (static library)")
    (arguments
     (substitute-keyword-arguments (package-arguments talloc)
       ((#:phases phases)
        ;; Since Waf, the build system talloc uses, apparently does not
        ;; support building static libraries from a ./configure flag, roll our
        ;; own build process.  No need to be ashamed, we're not the only ones
        ;; doing that:
        ;; <https://github.com/proot-me/proot-static-build/blob/master/GNUmakefile>.
        ;; :-)
        `(modify-phases ,phases
           (replace 'build
             (lambda _
               (invoke "gcc" "-c" "-Ibin/default" "-I" "lib/replace"
                       "-I." "-Wall" "-g" "-D__STDC_WANT_LIB_EXT1__=1"
                       "talloc.c")
               (invoke "ar" "rc" "libtalloc.a" "talloc.o")))
           (replace 'install
             (lambda* (#:key outputs #:allow-other-keys)
               (let* ((out     (assoc-ref outputs "out"))
                      (lib     (string-append out "/lib"))
                      (include (string-append out "/include")))
                 (mkdir-p lib)
                 (install-file "libtalloc.a" lib)
                 (install-file "talloc.h" include)
                 #t)))
           (delete 'check)))))))            ;XXX: tests rely on Python modules

(define-public tevent
  (package
    (name "tevent")
    (version "0.11.0")
    (source (origin
              (method url-fetch)
              (uri (string-append "https://www.samba.org/ftp/tevent/tevent-"
                                  version ".tar.gz"))
              (sha256
               (base32
                "1fl2pj4p8p5fa2laykwf1sfjdw7pkw9slklj3vzc5ah8x348d6pf"))))
    (build-system gnu-build-system)
    (arguments
     '(#:phases
       (modify-phases %standard-phases
         (replace 'configure
           ;; tevent uses a custom configuration script that runs waf.
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((out (assoc-ref outputs "out")))
               (invoke "./configure"
                       (string-append "--prefix=" out)
                       "--bundled-libraries=NONE")))))))
    (native-inputs
     (list cmocka pkg-config python which))
    (propagated-inputs
     (list talloc)) ; required by tevent.pc
    (synopsis "Event system library")
    (home-page "https://tevent.samba.org/")
    (description
     "Tevent is an event system based on the talloc memory management library.
It is the core event system used in Samba.  The low level tevent has support for
many event types, including timers, signals, and the classic file descriptor events.")
    (license license:lgpl3+)))

(define-public ldb
  (package
    (name "ldb")
    (version "2.4.1")
    (source (origin
              (method url-fetch)
              (uri (string-append "https://www.samba.org/ftp/ldb/ldb-"
                                  version ".tar.gz"))
              (sha256
               (base32
                "13yd7lavbx8bxwnmzl0j7xnl2gl4wmnn0q9g7n3y7bvbnhm07hb9"))
              (modules '((guix build utils)))
              (snippet
               '(begin
                  (for-each (lambda (file)
                              ;; Delete everything except the build tools.
                              (unless (or (string-prefix? "third_party/waf" file)
                                          (string-suffix? "wscript" file))
                                (delete-file file)))
                            (find-files "third_party"))))))
    (build-system gnu-build-system)
    (arguments
     '(;; LMDB is only supported on 64-bit systems, yet the test suite
       ;; requires it.
       #:tests? (assoc-ref %build-inputs "lmdb")
       #:phases
       (modify-phases %standard-phases
         (replace 'configure
           ;; ldb use a custom configuration script that runs waf.
           (lambda* (#:key outputs #:allow-other-keys)
             (let ((out (assoc-ref outputs "out")))
               (invoke "./configure"
                       (string-append "--prefix=" out)
                       (string-append "--with-modulesdir=" out
                                      "/lib/ldb/modules")
                       "--bundled-libraries=NONE")))))))
    (native-inputs
     (list cmocka pkg-config python which))
    (propagated-inputs
     ;; ldb.pc refers to all these.
     (list talloc tdb))
    (inputs
     `(,@(if (target-64bit?)
             `(("lmdb" ,lmdb))
             '())
       ("popt" ,popt)
       ("tevent" ,tevent)))
    (synopsis "LDAP-like embedded database")
    (home-page "https://ldb.samba.org/")
    (description
     "Ldb is a LDAP-like embedded database built on top of TDB.  What ldb does
is provide a fast database with an LDAP-like API designed to be used within an
application.  In some ways it can be seen as a intermediate solution between
key-value pair databases and a real LDAP database.")
    (license license:lgpl3+)))

(define-public ppp
  (package
    (name "ppp")
    (version "2.4.9")
    (source (origin
              (method git-fetch)
              (uri (git-reference
                    (url "https://github.com/ppp-project/ppp")
                    (commit version)))
              (file-name (git-file-name name version))
              (sha256
               (base32
                "1bhhksdclsnkw54a517ndrw55q5zljjbh9pcqz1z4a2z2flxpsgk"))))
    (build-system gnu-build-system)
    (arguments
      (list #:tests? #f                    ;; No "check" target
            #:make-flags #~(list (string-append "CC=" #$(cc-for-target)))
            #:phases
            #~(modify-phases %standard-phases
                (add-before 'configure 'patch-Makefile
                  (lambda* (#:key inputs #:allow-other-keys)
                    (let ((openssl (assoc-ref inputs "openssl"))
                          (libpcap (assoc-ref inputs "libpcap")))
                      (substitute* "pppd/Makefile.linux"
                        (("/usr/include/openssl")
                         (string-append openssl "/include"))
                        (("-DPPP_FILTER")
                         (string-append "-DPPP_FILTER -I" libpcap "/include")))
                      (substitute* "pppd/pppcrypt.h"
                        (("des\\.h") "openssl/des.h")))
                    #t)))))
    (inputs
     (list libpcap openssl))
    (synopsis "Implementation of the Point-to-Point Protocol")
    (home-page "https://ppp.samba.org/")
    (description
     "The Point-to-Point Protocol (PPP) provides a standard way to establish
a network connection over a serial link.  At present, this package supports IP
and IPV6 and the protocols layered above them, such as TCP and UDP.")
    ;; pppd, pppstats and pppdump are under BSD-style notices.
    ;; some of the pppd plugins are GPL'd.
    ;; chat is public domain.
    (license (list license:bsd-3
                   license:bsd-4
                   license:gpl2+
                   license:public-domain))))

(define-public wsdd
  (package
    (name "wsdd")
    (version "0.7.0")
    (source
     (origin
       (method git-fetch)
       (uri (git-reference (url "https://github.com/christgau/wsdd")
                           (commit (string-append "v" version))))
       (file-name (git-file-name name version))
       (sha256
        (base32 "04an2w6hamnai668ag4vq8x0i09fsg2jrayb4a7ar0x6bn837k7m"))))
    (build-system copy-build-system)
    (inputs
     `(("python" ,python)))
    (arguments
     '(#:install-plan
       '(("src/wsdd.py" "bin/wsdd")
         ("man/wsdd.1" "share/man/man1/"))))
    (home-page "https://github.com/christgau/wsdd")
    (synopsis "Web Service Discovery host daemon")
    (description "This daemon allows (Samba) hosts to be found by Web
Service Dicovery Clients.  It also implements the client side of the
discovery protocol which searches for devices implementing
WSD.")
    (license license:expat)))
ref='/smtps-and-pop3s-console-program/diff/openssl-1.1.0h/crypto/asn1/tasn_dec.c'>openssl-1.1.0h/crypto/asn1/tasn_dec.c1159
-rw-r--r--openssl-1.1.0h/crypto/asn1/tasn_enc.c605
-rw-r--r--openssl-1.1.0h/crypto/asn1/tasn_fre.c208
-rw-r--r--openssl-1.1.0h/crypto/asn1/tasn_new.c348
-rw-r--r--openssl-1.1.0h/crypto/asn1/tasn_prn.c538
-rw-r--r--openssl-1.1.0h/crypto/asn1/tasn_scn.c65
-rw-r--r--openssl-1.1.0h/crypto/asn1/tasn_typ.c84
-rw-r--r--openssl-1.1.0h/crypto/asn1/tasn_utl.c240
-rw-r--r--openssl-1.1.0h/crypto/asn1/x_algor.c93
-rw-r--r--openssl-1.1.0h/crypto/asn1/x_bignum.c146
-rw-r--r--openssl-1.1.0h/crypto/asn1/x_info.c39
-rw-r--r--openssl-1.1.0h/crypto/asn1/x_int64.c265
-rw-r--r--openssl-1.1.0h/crypto/asn1/x_long.c196
-rw-r--r--openssl-1.1.0h/crypto/asn1/x_pkey.c47
-rw-r--r--openssl-1.1.0h/crypto/asn1/x_sig.c39
-rw-r--r--openssl-1.1.0h/crypto/asn1/x_spki.c33
-rw-r--r--openssl-1.1.0h/crypto/asn1/x_val.c20
-rw-r--r--openssl-1.1.0h/crypto/async/arch/async_null.c23
-rw-r--r--openssl-1.1.0h/crypto/async/arch/async_null.h30
-rw-r--r--openssl-1.1.0h/crypto/async/arch/async_posix.c58
-rw-r--r--openssl-1.1.0h/crypto/async/arch/async_posix.h58
-rw-r--r--openssl-1.1.0h/crypto/async/arch/async_win.c55
-rw-r--r--openssl-1.1.0h/crypto/async/arch/async_win.h36
-rw-r--r--openssl-1.1.0h/crypto/async/async.c433
-rw-r--r--openssl-1.1.0h/crypto/async/async_err.c51
-rw-r--r--openssl-1.1.0h/crypto/async/async_locl.h77
-rw-r--r--openssl-1.1.0h/crypto/async/async_wait.c211
-rw-r--r--openssl-1.1.0h/crypto/async/build.info4
-rw-r--r--openssl-1.1.0h/crypto/bf/asm/bf-586.pl149
-rw-r--r--openssl-1.1.0h/crypto/bf/bf_cbc.c86
-rw-r--r--openssl-1.1.0h/crypto/bf/bf_cfb64.c74
-rw-r--r--openssl-1.1.0h/crypto/bf/bf_ecb.c43
-rw-r--r--openssl-1.1.0h/crypto/bf/bf_enc.c179
-rw-r--r--openssl-1.1.0h/crypto/bf/bf_locl.h84
-rw-r--r--openssl-1.1.0h/crypto/bf/bf_ofb64.c61
-rw-r--r--openssl-1.1.0h/crypto/bf/bf_pi.h530
-rw-r--r--openssl-1.1.0h/crypto/bf/bf_skey.c67
-rw-r--r--openssl-1.1.0h/crypto/bf/build.info6
-rw-r--r--openssl-1.1.0h/crypto/bio/b_addr.c883
-rw-r--r--openssl-1.1.0h/crypto/bio/b_dump.c143
-rw-r--r--openssl-1.1.0h/crypto/bio/b_print.c926
-rw-r--r--openssl-1.1.0h/crypto/bio/b_sock.c378
-rw-r--r--openssl-1.1.0h/crypto/bio/b_sock2.c277
-rw-r--r--openssl-1.1.0h/crypto/bio/bf_buff.c455
-rw-r--r--openssl-1.1.0h/crypto/bio/bf_lbuf.c320
-rw-r--r--openssl-1.1.0h/crypto/bio/bf_nbio.c194
-rw-r--r--openssl-1.1.0h/crypto/bio/bf_null.c118
-rw-r--r--openssl-1.1.0h/crypto/bio/bio_cb.c99
-rw-r--r--openssl-1.1.0h/crypto/bio/bio_err.c126
-rw-r--r--openssl-1.1.0h/crypto/bio/bio_lcl.h188
-rw-r--r--openssl-1.1.0h/crypto/bio/bio_lib.c600
-rw-r--r--openssl-1.1.0h/crypto/bio/bio_meth.c150
-rw-r--r--openssl-1.1.0h/crypto/bio/bss_acpt.c562
-rw-r--r--openssl-1.1.0h/crypto/bio/bss_bio.c805
-rw-r--r--openssl-1.1.0h/crypto/bio/bss_conn.c543
-rw-r--r--openssl-1.1.0h/crypto/bio/bss_dgram.c1923
-rw-r--r--openssl-1.1.0h/crypto/bio/bss_fd.c275
-rw-r--r--openssl-1.1.0h/crypto/bio/bss_file.c424
-rw-r--r--openssl-1.1.0h/crypto/bio/bss_log.c407
-rw-r--r--openssl-1.1.0h/crypto/bio/bss_mem.c347
-rw-r--r--openssl-1.1.0h/crypto/bio/bss_null.c83
-rw-r--r--openssl-1.1.0h/crypto/bio/bss_sock.c231
-rw-r--r--openssl-1.1.0h/crypto/bio/build.info8
-rw-r--r--openssl-1.1.0h/crypto/blake2/blake2_impl.h130
-rw-r--r--openssl-1.1.0h/crypto/blake2/blake2_locl.h91
-rw-r--r--openssl-1.1.0h/crypto/blake2/blake2b.c270
-rw-r--r--openssl-1.1.0h/crypto/blake2/blake2s.c264
-rw-r--r--openssl-1.1.0h/crypto/blake2/build.info3
-rw-r--r--openssl-1.1.0h/crypto/blake2/m_blake2b.c59
-rw-r--r--openssl-1.1.0h/crypto/blake2/m_blake2s.c59
-rw-r--r--openssl-1.1.0h/crypto/bn/README.pod247
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/alpha-mont.pl331
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/armv4-gf2m.pl332
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/armv4-mont.pl756
-rwxr-xr-xopenssl-1.1.0h/crypto/bn/asm/armv8-mont.pl1510
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/bn-586.pl785
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/bn-c64xplus.asm382
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/c64xplus-gf2m.pl160
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/co-586.pl298
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/ia64-mont.pl860
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/ia64.S1562
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/mips-mont.pl433
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/mips.pl2241
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/pa-risc2.s1624
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/pa-risc2W.s1612
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/parisc-mont.pl1002
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/ppc-mont.pl342
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/ppc.pl2014
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/ppc64-mont.pl1635
-rwxr-xr-xopenssl-1.1.0h/crypto/bn/asm/rsaz-avx2.pl1967
-rwxr-xr-xopenssl-1.1.0h/crypto/bn/asm/rsaz-x86_64.pl2358
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/s390x-gf2m.pl228
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/s390x-mont.pl284
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/s390x.S713
-rwxr-xr-xopenssl-1.1.0h/crypto/bn/asm/sparct4-mont.pl1232
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/sparcv8.S1458
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/sparcv8plus.S1562
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/sparcv9-gf2m.pl200
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/sparcv9-mont.pl619
-rwxr-xr-xopenssl-1.1.0h/crypto/bn/asm/sparcv9a-mont.pl887
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/via-mont.pl254
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/vis3-mont.pl384
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/x86-gf2m.pl325
-rwxr-xr-xopenssl-1.1.0h/crypto/bn/asm/x86-mont.pl629
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/x86_64-gcc.c649
-rw-r--r--openssl-1.1.0h/crypto/bn/asm/x86_64-gf2m.pl397
-rwxr-xr-xopenssl-1.1.0h/crypto/bn/asm/x86_64-mont.pl1521
-rwxr-xr-xopenssl-1.1.0h/crypto/bn/asm/x86_64-mont5.pl3835
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_add.c209
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_asm.c1039
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_blind.c289
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_const.c553
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_ctx.c353
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_depr.c68
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_dh.c220
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_div.c423
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_err.c107
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_exp.c1376
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_exp2.c201
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_gcd.c616
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_gf2m.c1224
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_intern.c210
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_kron.c140
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_lcl.h689
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_lib.c1016
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_mod.c201
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_mont.c422
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_mpi.c86
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_mul.c1011
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_nist.c1239
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_prime.c596
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_prime.h274
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_prime.pl46
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_print.c343
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_rand.c258
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_recp.c199
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_shift.c175
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_sqr.c235
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_sqrt.c358
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_srp.c545
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_word.c201
-rw-r--r--openssl-1.1.0h/crypto/bn/bn_x931p.c242
-rw-r--r--openssl-1.1.0h/crypto/bn/build.info84
-rw-r--r--openssl-1.1.0h/crypto/bn/rsaz_exp.c352
-rw-r--r--openssl-1.1.0h/crypto/bn/rsaz_exp.h77
-rw-r--r--openssl-1.1.0h/crypto/buffer/buf_err.c44
-rw-r--r--openssl-1.1.0h/crypto/buffer/buffer.c166
-rw-r--r--openssl-1.1.0h/crypto/buffer/build.info2
-rw-r--r--openssl-1.1.0h/crypto/build.info37
-rw-r--r--openssl-1.1.0h/crypto/c64xpluscpuid.pl287
-rw-r--r--openssl-1.1.0h/crypto/camellia/asm/cmll-x86.pl1150
-rw-r--r--openssl-1.1.0h/crypto/camellia/asm/cmll-x86_64.pl1088
-rw-r--r--openssl-1.1.0h/crypto/camellia/asm/cmllt4-sparcv9.pl939
-rw-r--r--openssl-1.1.0h/crypto/camellia/build.info11
-rw-r--r--openssl-1.1.0h/crypto/camellia/camellia.c541
-rw-r--r--openssl-1.1.0h/crypto/camellia/cmll_cbc.c24
-rw-r--r--openssl-1.1.0h/crypto/camellia/cmll_cfb.c43
-rw-r--r--openssl-1.1.0h/crypto/camellia/cmll_ctr.c22
-rw-r--r--openssl-1.1.0h/crypto/camellia/cmll_ecb.c20
-rw-r--r--openssl-1.1.0h/crypto/camellia/cmll_locl.h43
-rw-r--r--openssl-1.1.0h/crypto/camellia/cmll_misc.c35
-rw-r--r--openssl-1.1.0h/crypto/camellia/cmll_ofb.c24
-rw-r--r--openssl-1.1.0h/crypto/cast/asm/cast-586.pl192
-rw-r--r--openssl-1.1.0h/crypto/cast/build.info6
-rw-r--r--openssl-1.1.0h/crypto/cast/c_cfb64.c74
-rw-r--r--openssl-1.1.0h/crypto/cast/c_ecb.c32
-rw-r--r--openssl-1.1.0h/crypto/cast/c_enc.c151
-rw-r--r--openssl-1.1.0h/crypto/cast/c_ofb64.c61
-rw-r--r--openssl-1.1.0h/crypto/cast/c_skey.c118
-rw-r--r--openssl-1.1.0h/crypto/cast/cast_lcl.h190
-rw-r--r--openssl-1.1.0h/crypto/cast/cast_s.h544
-rwxr-xr-xopenssl-1.1.0h/crypto/chacha/asm/chacha-armv4.pl1158
-rwxr-xr-xopenssl-1.1.0h/crypto/chacha/asm/chacha-armv8.pl1135
-rwxr-xr-xopenssl-1.1.0h/crypto/chacha/asm/chacha-c64xplus.pl926
-rwxr-xr-xopenssl-1.1.0h/crypto/chacha/asm/chacha-ppc.pl953
-rwxr-xr-xopenssl-1.1.0h/crypto/chacha/asm/chacha-s390x.pl326
-rwxr-xr-xopenssl-1.1.0h/crypto/chacha/asm/chacha-x86.pl1154
-rwxr-xr-xopenssl-1.1.0h/crypto/chacha/asm/chacha-x86_64.pl2245
-rw-r--r--openssl-1.1.0h/crypto/chacha/build.info17
-rw-r--r--openssl-1.1.0h/crypto/chacha/chacha_enc.c121
-rw-r--r--openssl-1.1.0h/crypto/cmac/build.info2
-rw-r--r--openssl-1.1.0h/crypto/cmac/cm_ameth.c51
-rw-r--r--openssl-1.1.0h/crypto/cmac/cm_pmeth.c161
-rw-r--r--openssl-1.1.0h/crypto/cmac/cmac.c224
-rw-r--r--openssl-1.1.0h/crypto/cms/build.info5
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_asn1.c403
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_att.c152
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_cd.c82
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_dd.c99
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_enc.c212
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_env.c902
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_err.c258
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_ess.c337
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_io.c88
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_kari.c411
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_lcl.h444
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_lib.c587
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_pwri.c392
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_sd.c925
-rw-r--r--openssl-1.1.0h/crypto/cms/cms_smime.c842
-rw-r--r--openssl-1.1.0h/crypto/comp/build.info4
-rw-r--r--openssl-1.1.0h/crypto/comp/c_zlib.c615
-rw-r--r--openssl-1.1.0h/crypto/comp/comp_err.c48
-rw-r--r--openssl-1.1.0h/crypto/comp/comp_lcl.h30
-rw-r--r--openssl-1.1.0h/crypto/comp/comp_lib.c91
-rw-r--r--openssl-1.1.0h/crypto/conf/build.info4
-rw-r--r--openssl-1.1.0h/crypto/conf/conf_api.c214
-rw-r--r--openssl-1.1.0h/crypto/conf/conf_def.c642
-rw-r--r--openssl-1.1.0h/crypto/conf/conf_def.h129
-rw-r--r--openssl-1.1.0h/crypto/conf/conf_err.c81
-rw-r--r--openssl-1.1.0h/crypto/conf/conf_lib.c365
-rw-r--r--openssl-1.1.0h/crypto/conf/conf_mall.c29
-rw-r--r--openssl-1.1.0h/crypto/conf/conf_mod.c549
-rw-r--r--openssl-1.1.0h/crypto/conf/conf_sap.c60
-rw-r--r--openssl-1.1.0h/crypto/conf/keysets.pl141
-rw-r--r--openssl-1.1.0h/crypto/cpt_err.c55
-rw-r--r--openssl-1.1.0h/crypto/cryptlib.c356
-rw-r--r--openssl-1.1.0h/crypto/ct/build.info3
-rw-r--r--openssl-1.1.0h/crypto/ct/ct_b64.c164
-rw-r--r--openssl-1.1.0h/crypto/ct/ct_err.c87
-rw-r--r--openssl-1.1.0h/crypto/ct/ct_locl.h216
-rw-r--r--openssl-1.1.0h/crypto/ct/ct_log.c306
-rw-r--r--openssl-1.1.0h/crypto/ct/ct_oct.c407
-rw-r--r--openssl-1.1.0h/crypto/ct/ct_policy.c98
-rw-r--r--openssl-1.1.0h/crypto/ct/ct_prn.c127
-rw-r--r--openssl-1.1.0h/crypto/ct/ct_sct.c393
-rw-r--r--openssl-1.1.0h/crypto/ct/ct_sct_ctx.c263
-rw-r--r--openssl-1.1.0h/crypto/ct/ct_vfy.c140
-rw-r--r--openssl-1.1.0h/crypto/ct/ct_x509v3.c104
-rw-r--r--openssl-1.1.0h/crypto/cversion.c65
-rw-r--r--openssl-1.1.0h/crypto/des/asm/crypt586.pl217
-rw-r--r--openssl-1.1.0h/crypto/des/asm/des-586.pl465
-rw-r--r--openssl-1.1.0h/crypto/des/asm/des_enc.m41972
-rw-r--r--openssl-1.1.0h/crypto/des/asm/desboth.pl86
-rw-r--r--openssl-1.1.0h/crypto/des/asm/dest4-sparcv9.pl627
-rw-r--r--openssl-1.1.0h/crypto/des/build.info17
-rw-r--r--openssl-1.1.0h/crypto/des/cbc_cksm.c54
-rw-r--r--openssl-1.1.0h/crypto/des/cbc_enc.c12
-rw-r--r--openssl-1.1.0h/crypto/des/cfb64ede.c190
-rw-r--r--openssl-1.1.0h/crypto/des/cfb64enc.c73
-rw-r--r--openssl-1.1.0h/crypto/des/cfb_enc.c150
-rw-r--r--openssl-1.1.0h/crypto/des/des_enc.c301
-rw-r--r--openssl-1.1.0h/crypto/des/des_locl.h231
-rw-r--r--openssl-1.1.0h/crypto/des/ecb3_enc.c33
-rw-r--r--openssl-1.1.0h/crypto/des/ecb_enc.c51
-rw-r--r--openssl-1.1.0h/crypto/des/fcrypt.c149
-rw-r--r--openssl-1.1.0h/crypto/des/fcrypt_b.c72
-rw-r--r--openssl-1.1.0h/crypto/des/ncbc_enc.c106
-rw-r--r--openssl-1.1.0h/crypto/des/ofb64ede.c62
-rw-r--r--openssl-1.1.0h/crypto/des/ofb64enc.c60
-rw-r--r--openssl-1.1.0h/crypto/des/ofb_enc.c82
-rw-r--r--openssl-1.1.0h/crypto/des/pcbc_enc.c66
-rw-r--r--openssl-1.1.0h/crypto/des/qud_cksm.c77
-rw-r--r--openssl-1.1.0h/crypto/des/rand_key.c21
-rw-r--r--openssl-1.1.0h/crypto/des/rpc_des.h76
-rw-r--r--openssl-1.1.0h/crypto/des/rpc_enc.c30
-rw-r--r--openssl-1.1.0h/crypto/des/set_key.c389
-rw-r--r--openssl-1.1.0h/crypto/des/spr.h163
-rw-r--r--openssl-1.1.0h/crypto/des/str2key.c97
-rw-r--r--openssl-1.1.0h/crypto/des/xcbc_enc.c103
-rw-r--r--openssl-1.1.0h/crypto/dh/build.info4
-rw-r--r--openssl-1.1.0h/crypto/dh/dh1024.pem5
-rw-r--r--openssl-1.1.0h/crypto/dh/dh192.pem3
-rw-r--r--openssl-1.1.0h/crypto/dh/dh2048.pem16
-rw-r--r--openssl-1.1.0h/crypto/dh/dh4096.pem14
-rw-r--r--openssl-1.1.0h/crypto/dh/dh512.pem4
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_ameth.c870
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_asn1.c138
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_check.c183
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_depr.c46
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_err.c73
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_gen.c130
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_kdf.c150
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_key.c227
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_lib.c263
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_locl.h56
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_meth.c173
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_pmeth.c501
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_prn.c30
-rw-r--r--openssl-1.1.0h/crypto/dh/dh_rfc5114.c41
-rw-r--r--openssl-1.1.0h/crypto/dllmain.c60
-rw-r--r--openssl-1.1.0h/crypto/dsa/build.info5
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_ameth.c572
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_asn1.c155
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_depr.c62
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_err.c76
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_gen.c603
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_key.c77
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_lib.c332
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_locl.h76
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_meth.c224
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_ossl.c365
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_pmeth.c273
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_prn.c69
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_sign.c24
-rw-r--r--openssl-1.1.0h/crypto/dsa/dsa_vrf.c19
-rw-r--r--openssl-1.1.0h/crypto/dso/build.info4
-rw-r--r--openssl-1.1.0h/crypto/dso/dso_dl.c279
-rw-r--r--openssl-1.1.0h/crypto/dso/dso_dlfcn.c354
-rw-r--r--openssl-1.1.0h/crypto/dso/dso_err.c93
-rw-r--r--openssl-1.1.0h/crypto/dso/dso_lib.c349
-rw-r--r--openssl-1.1.0h/crypto/dso/dso_locl.h106
-rw-r--r--openssl-1.1.0h/crypto/dso/dso_openssl.c22
-rw-r--r--openssl-1.1.0h/crypto/dso/dso_vms.c468
-rw-r--r--openssl-1.1.0h/crypto/dso/dso_win32.c573
-rw-r--r--openssl-1.1.0h/crypto/ebcdic.c366
-rwxr-xr-xopenssl-1.1.0h/crypto/ec/asm/ecp_nistz256-armv4.pl1865
-rw-r--r--openssl-1.1.0h/crypto/ec/asm/ecp_nistz256-armv8.pl1558
-rwxr-xr-xopenssl-1.1.0h/crypto/ec/asm/ecp_nistz256-avx2.pl2100
-rwxr-xr-xopenssl-1.1.0h/crypto/ec/asm/ecp_nistz256-sparcv9.pl3061
-rwxr-xr-xopenssl-1.1.0h/crypto/ec/asm/ecp_nistz256-x86.pl1866
-rwxr-xr-xopenssl-1.1.0h/crypto/ec/asm/ecp_nistz256-x86_64.pl3083
-rw-r--r--openssl-1.1.0h/crypto/ec/build.info28
-rw-r--r--openssl-1.1.0h/crypto/ec/curve25519.c3400
-rw-r--r--openssl-1.1.0h/crypto/ec/ec2_mult.c418
-rw-r--r--openssl-1.1.0h/crypto/ec/ec2_oct.c352
-rw-r--r--openssl-1.1.0h/crypto/ec/ec2_smpl.c758
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_ameth.c885
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_asn1.c1237
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_check.c72
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_curve.c3169
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_cvt.c95
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_err.c290
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_key.c637
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_kmeth.c317
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_lcl.h613
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_lib.c1004
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_mult.c684
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_oct.c165
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_pmeth.c463
-rw-r--r--openssl-1.1.0h/crypto/ec/ec_print.c119
-rw-r--r--openssl-1.1.0h/crypto/ec/ecdh_kdf.c68
-rw-r--r--openssl-1.1.0h/crypto/ec/ecdh_ossl.c147
-rw-r--r--openssl-1.1.0h/crypto/ec/ecdsa_ossl.c476
-rw-r--r--openssl-1.1.0h/crypto/ec/ecdsa_sign.c52
-rw-r--r--openssl-1.1.0h/crypto/ec/ecdsa_vrf.c43
-rw-r--r--openssl-1.1.0h/crypto/ec/eck_prn.c273
-rw-r--r--openssl-1.1.0h/crypto/ec/ecp_mont.c242
-rw-r--r--openssl-1.1.0h/crypto/ec/ecp_nist.c167
-rw-r--r--openssl-1.1.0h/crypto/ec/ecp_nistp224.c1715
-rw-r--r--openssl-1.1.0h/crypto/ec/ecp_nistp256.c2350
-rw-r--r--openssl-1.1.0h/crypto/ec/ecp_nistp521.c2138
-rw-r--r--openssl-1.1.0h/crypto/ec/ecp_nistputil.c223
-rw-r--r--openssl-1.1.0h/crypto/ec/ecp_nistz256.c1559
-rw-r--r--openssl-1.1.0h/crypto/ec/ecp_nistz256_table.c9542
-rw-r--r--openssl-1.1.0h/crypto/ec/ecp_oct.c372
-rw-r--r--openssl-1.1.0h/crypto/ec/ecp_smpl.c1369
-rw-r--r--openssl-1.1.0h/crypto/ec/ecx_meth.c373
-rw-r--r--openssl-1.1.0h/crypto/engine/README211
-rw-r--r--openssl-1.1.0h/crypto/engine/build.info8
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_all.c31
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_cnf.c192
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_cryptodev.c1757
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_ctrl.c338
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_dyn.c510
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_err.c123
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_fat.c128
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_init.c108
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_int.h183
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_lib.c290
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_list.c354
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_openssl.c652
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_pkey.c140
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_rdrand.c110
-rw-r--r--openssl-1.1.0h/crypto/engine/eng_table.c308
-rw-r--r--openssl-1.1.0h/crypto/engine/tb_asnmth.c207
-rw-r--r--openssl-1.1.0h/crypto/engine/tb_cipher.c91
-rw-r--r--openssl-1.1.0h/crypto/engine/tb_dh.c72
-rw-r--r--openssl-1.1.0h/crypto/engine/tb_digest.c91
-rw-r--r--openssl-1.1.0h/crypto/engine/tb_dsa.c72
-rw-r--r--openssl-1.1.0h/crypto/engine/tb_eckey.c72
-rw-r--r--openssl-1.1.0h/crypto/engine/tb_pkmeth.c114
-rw-r--r--openssl-1.1.0h/crypto/engine/tb_rand.c72
-rw-r--r--openssl-1.1.0h/crypto/engine/tb_rsa.c72
-rw-r--r--openssl-1.1.0h/crypto/err/README44
-rw-r--r--openssl-1.1.0h/crypto/err/build.info3
-rw-r--r--openssl-1.1.0h/crypto/err/err.c793
-rw-r--r--openssl-1.1.0h/crypto/err/err_all.c109
-rw-r--r--openssl-1.1.0h/crypto/err/err_prn.c66
-rw-r--r--openssl-1.1.0h/crypto/err/openssl.ec99
-rw-r--r--openssl-1.1.0h/crypto/evp/bio_b64.c551
-rw-r--r--openssl-1.1.0h/crypto/evp/bio_enc.c450
-rw-r--r--openssl-1.1.0h/crypto/evp/bio_md.c232
-rw-r--r--openssl-1.1.0h/crypto/evp/bio_ok.c605
-rw-r--r--openssl-1.1.0h/crypto/evp/build.info22
-rw-r--r--openssl-1.1.0h/crypto/evp/c_allc.c220
-rw-r--r--openssl-1.1.0h/crypto/evp/c_alld.c49
-rw-r--r--openssl-1.1.0h/crypto/evp/cmeth_lib.c151
-rw-r--r--openssl-1.1.0h/crypto/evp/digest.c269
-rw-r--r--openssl-1.1.0h/crypto/evp/e_aes.c2705
-rw-r--r--openssl-1.1.0h/crypto/evp/e_aes_cbc_hmac_sha1.c964
-rw-r--r--openssl-1.1.0h/crypto/evp/e_aes_cbc_hmac_sha256.c950
-rw-r--r--openssl-1.1.0h/crypto/evp/e_bf.c38
-rw-r--r--openssl-1.1.0h/crypto/evp/e_camellia.c366
-rw-r--r--openssl-1.1.0h/crypto/evp/e_cast.c40
-rw-r--r--openssl-1.1.0h/crypto/evp/e_chacha20_poly1305.c454
-rw-r--r--openssl-1.1.0h/crypto/evp/e_des.c242
-rw-r--r--openssl-1.1.0h/crypto/evp/e_des3.c424
-rw-r--r--openssl-1.1.0h/crypto/evp/e_idea.c70
-rw-r--r--openssl-1.1.0h/crypto/evp/e_null.c50
-rw-r--r--openssl-1.1.0h/crypto/evp/e_old.c113
-rw-r--r--openssl-1.1.0h/crypto/evp/e_rc2.c189
-rw-r--r--openssl-1.1.0h/crypto/evp/e_rc4.c82
-rw-r--r--openssl-1.1.0h/crypto/evp/e_rc4_hmac_md5.c262
-rw-r--r--openssl-1.1.0h/crypto/evp/e_rc5.c74
-rw-r--r--openssl-1.1.0h/crypto/evp/e_seed.c39
-rw-r--r--openssl-1.1.0h/crypto/evp/e_xcbc_d.c83
-rw-r--r--openssl-1.1.0h/crypto/evp/encode.c404
-rw-r--r--openssl-1.1.0h/crypto/evp/evp_cnf.c65
-rw-r--r--openssl-1.1.0h/crypto/evp/evp_enc.c642
-rw-r--r--openssl-1.1.0h/crypto/evp/evp_err.c186
-rw-r--r--openssl-1.1.0h/crypto/evp/evp_key.c152
-rw-r--r--openssl-1.1.0h/crypto/evp/evp_lib.c497
-rw-r--r--openssl-1.1.0h/crypto/evp/evp_locl.h68
-rw-r--r--openssl-1.1.0h/crypto/evp/evp_pbe.c259
-rw-r--r--openssl-1.1.0h/crypto/evp/evp_pkey.c150
-rw-r--r--openssl-1.1.0h/crypto/evp/m_md2.c56
-rw-r--r--openssl-1.1.0h/crypto/evp/m_md4.c55
-rw-r--r--openssl-1.1.0h/crypto/evp/m_md5.c55
-rw-r--r--openssl-1.1.0h/crypto/evp/m_md5_sha1.c142
-rw-r--r--openssl-1.1.0h/crypto/evp/m_mdc2.c55
-rw-r--r--openssl-1.1.0h/crypto/evp/m_null.c49
-rw-r--r--openssl-1.1.0h/crypto/evp/m_ripemd.c55
-rw-r--r--openssl-1.1.0h/crypto/evp/m_sha1.c248
-rw-r--r--openssl-1.1.0h/crypto/evp/m_sigver.c177
-rw-r--r--openssl-1.1.0h/crypto/evp/m_wp.c54
-rw-r--r--openssl-1.1.0h/crypto/evp/names.c178
-rw-r--r--openssl-1.1.0h/crypto/evp/p5_crpt.c103
-rw-r--r--openssl-1.1.0h/crypto/evp/p5_crpt2.c280
-rw-r--r--openssl-1.1.0h/crypto/evp/p_dec.c36
-rw-r--r--openssl-1.1.0h/crypto/evp/p_enc.c35
-rw-r--r--openssl-1.1.0h/crypto/evp/p_lib.c506
-rw-r--r--openssl-1.1.0h/crypto/evp/p_open.c73
-rw-r--r--openssl-1.1.0h/crypto/evp/p_seal.c70
-rw-r--r--openssl-1.1.0h/crypto/evp/p_sign.c61
-rw-r--r--openssl-1.1.0h/crypto/evp/p_verify.c55
-rw-r--r--openssl-1.1.0h/crypto/evp/pmeth_fn.c297
-rw-r--r--openssl-1.1.0h/crypto/evp/pmeth_gn.c169
-rw-r--r--openssl-1.1.0h/crypto/evp/pmeth_lib.c725
-rw-r--r--openssl-1.1.0h/crypto/evp/scrypt.c248
-rw-r--r--openssl-1.1.0h/crypto/ex_data.c399
-rw-r--r--openssl-1.1.0h/crypto/hmac/build.info3
-rw-r--r--openssl-1.1.0h/crypto/hmac/hm_ameth.c125
-rw-r--r--openssl-1.1.0h/crypto/hmac/hm_pmeth.c210
-rw-r--r--openssl-1.1.0h/crypto/hmac/hmac.c240
-rw-r--r--openssl-1.1.0h/crypto/hmac/hmac_lcl.h33
-rw-r--r--openssl-1.1.0h/crypto/ia64cpuid.S297
-rw-r--r--openssl-1.1.0h/crypto/idea/build.info3
-rw-r--r--openssl-1.1.0h/crypto/idea/i_cbc.c122
-rw-r--r--openssl-1.1.0h/crypto/idea/i_cfb64.c74
-rw-r--r--openssl-1.1.0h/crypto/idea/i_ecb.c34
-rw-r--r--openssl-1.1.0h/crypto/idea/i_ofb64.c61
-rw-r--r--openssl-1.1.0h/crypto/idea/i_skey.c112
-rw-r--r--openssl-1.1.0h/crypto/idea/idea_lcl.h117
-rw-r--r--openssl-1.1.0h/crypto/include/internal/asn1_int.h94
-rw-r--r--openssl-1.1.0h/crypto/include/internal/async.h14
-rw-r--r--openssl-1.1.0h/crypto/include/internal/bn_conf.h.in27
-rw-r--r--openssl-1.1.0h/crypto/include/internal/bn_dh.h17
-rw-r--r--openssl-1.1.0h/crypto/include/internal/bn_int.h82
-rw-r--r--openssl-1.1.0h/crypto/include/internal/bn_srp.h32
-rw-r--r--openssl-1.1.0h/crypto/include/internal/chacha.h49
-rw-r--r--openssl-1.1.0h/crypto/include/internal/cryptlib.h81
-rw-r--r--openssl-1.1.0h/crypto/include/internal/cryptlib_int.h31
-rw-r--r--openssl-1.1.0h/crypto/include/internal/dso_conf.h.in15
-rw-r--r--openssl-1.1.0h/crypto/include/internal/engine.h20
-rw-r--r--openssl-1.1.0h/crypto/include/internal/err_int.h17
-rw-r--r--openssl-1.1.0h/crypto/include/internal/evp_int.h390
-rw-r--r--openssl-1.1.0h/crypto/include/internal/md32_common.h383
-rw-r--r--openssl-1.1.0h/crypto/include/internal/objects.h12
-rw-r--r--openssl-1.1.0h/crypto/include/internal/poly1305.h19
-rw-r--r--openssl-1.1.0h/crypto/include/internal/rand.h20
-rw-r--r--openssl-1.1.0h/crypto/include/internal/x509_int.h267
-rw-r--r--openssl-1.1.0h/crypto/init.c676
-rw-r--r--openssl-1.1.0h/crypto/kdf/build.info3
-rw-r--r--openssl-1.1.0h/crypto/kdf/hkdf.c293
-rw-r--r--openssl-1.1.0h/crypto/kdf/kdf_err.c46
-rw-r--r--openssl-1.1.0h/crypto/kdf/tls1_prf.c265
-rw-r--r--openssl-1.1.0h/crypto/lhash/build.info3
-rw-r--r--openssl-1.1.0h/crypto/lhash/lh_stats.c130
-rw-r--r--openssl-1.1.0h/crypto/lhash/lhash.c372
-rw-r--r--openssl-1.1.0h/crypto/lhash/lhash_lcl.h49
-rw-r--r--openssl-1.1.0h/crypto/lhash/num.pl23
-rw-r--r--openssl-1.1.0h/crypto/md2/build.info3
-rw-r--r--openssl-1.1.0h/crypto/md2/md2_dgst.c173
-rw-r--r--openssl-1.1.0h/crypto/md2/md2_one.c47
-rw-r--r--openssl-1.1.0h/crypto/md4/build.info3
-rw-r--r--openssl-1.1.0h/crypto/md4/md4_dgst.c147
-rw-r--r--openssl-1.1.0h/crypto/md4/md4_locl.h60
-rw-r--r--openssl-1.1.0h/crypto/md4/md4_one.c47
-rw-r--r--openssl-1.1.0h/crypto/md5/asm/md5-586.pl318
-rw-r--r--openssl-1.1.0h/crypto/md5/asm/md5-ia64.S1002
-rw-r--r--openssl-1.1.0h/crypto/md5/asm/md5-sparcv9.pl437
-rwxr-xr-xopenssl-1.1.0h/crypto/md5/asm/md5-x86_64.pl380
-rw-r--r--openssl-1.1.0h/crypto/md5/build.info22
-rw-r--r--openssl-1.1.0h/crypto/md5/md5_dgst.c164
-rw-r--r--openssl-1.1.0h/crypto/md5/md5_locl.h80
-rw-r--r--openssl-1.1.0h/crypto/md5/md5_one.c47
-rw-r--r--openssl-1.1.0h/crypto/mdc2/build.info3
-rw-r--r--openssl-1.1.0h/crypto/mdc2/mdc2_one.c27
-rw-r--r--openssl-1.1.0h/crypto/mdc2/mdc2dgst.c147
-rw-r--r--openssl-1.1.0h/crypto/mem.c196
-rw-r--r--openssl-1.1.0h/crypto/mem_clr.c25
-rw-r--r--openssl-1.1.0h/crypto/mem_dbg.c629
-rw-r--r--openssl-1.1.0h/crypto/mem_sec.c630
-rw-r--r--openssl-1.1.0h/crypto/modes/asm/aesni-gcm-x86_64.pl1106
-rw-r--r--openssl-1.1.0h/crypto/modes/asm/ghash-alpha.pl467
-rw-r--r--openssl-1.1.0h/crypto/modes/asm/ghash-armv4.pl554
-rw-r--r--openssl-1.1.0h/crypto/modes/asm/ghash-c64xplus.pl247
-rwxr-xr-xopenssl-1.1.0h/crypto/modes/asm/ghash-ia64.pl470
-rw-r--r--openssl-1.1.0h/crypto/modes/asm/ghash-parisc.pl738
-rw-r--r--openssl-1.1.0h/crypto/modes/asm/ghash-s390x.pl258
-rw-r--r--openssl-1.1.0h/crypto/modes/asm/ghash-sparcv9.pl581
-rw-r--r--openssl-1.1.0h/crypto/modes/asm/ghash-x86.pl1405
-rw-r--r--openssl-1.1.0h/crypto/modes/asm/ghash-x86_64.pl1762
-rwxr-xr-xopenssl-1.1.0h/crypto/modes/asm/ghashp8-ppc.pl670
-rw-r--r--openssl-1.1.0h/crypto/modes/asm/ghashv8-armx.pl430
-rw-r--r--openssl-1.1.0h/crypto/modes/build.info27
-rw-r--r--openssl-1.1.0h/crypto/modes/cbc128.c161
-rw-r--r--openssl-1.1.0h/crypto/modes/ccm128.c432
-rw-r--r--openssl-1.1.0h/crypto/modes/cfb128.c198
-rw-r--r--openssl-1.1.0h/crypto/modes/ctr128.c209
-rw-r--r--openssl-1.1.0h/crypto/modes/cts128.c523
-rw-r--r--openssl-1.1.0h/crypto/modes/gcm128.c2301
-rw-r--r--openssl-1.1.0h/crypto/modes/modes_lcl.h185
-rw-r--r--openssl-1.1.0h/crypto/modes/ocb128.c568
-rw-r--r--openssl-1.1.0h/crypto/modes/ofb128.c74
-rw-r--r--openssl-1.1.0h/crypto/modes/wrap128.c329
-rw-r--r--openssl-1.1.0h/crypto/modes/xts128.c157
-rw-r--r--openssl-1.1.0h/crypto/o_dir.c37
-rw-r--r--openssl-1.1.0h/crypto/o_fips.c34
-rw-r--r--openssl-1.1.0h/crypto/o_fopen.c103
-rw-r--r--openssl-1.1.0h/crypto/o_init.c34
-rw-r--r--openssl-1.1.0h/crypto/o_str.c251
-rw-r--r--openssl-1.1.0h/crypto/o_time.c196
-rw-r--r--openssl-1.1.0h/crypto/objects/README44
-rw-r--r--openssl-1.1.0h/crypto/objects/build.info3
-rw-r--r--openssl-1.1.0h/crypto/objects/o_names.c407
-rw-r--r--openssl-1.1.0h/crypto/objects/obj_dat.c738
-rw-r--r--openssl-1.1.0h/crypto/objects/obj_dat.h5101
-rw-r--r--openssl-1.1.0h/crypto/objects/obj_dat.pl227
-rw-r--r--openssl-1.1.0h/crypto/objects/obj_err.c50
-rw-r--r--openssl-1.1.0h/crypto/objects/obj_lcl.h14
-rw-r--r--openssl-1.1.0h/crypto/objects/obj_lib.c66
-rw-r--r--openssl-1.1.0h/crypto/objects/obj_mac.num1060
-rw-r--r--openssl-1.1.0h/crypto/objects/obj_xref.c165
-rw-r--r--openssl-1.1.0h/crypto/objects/obj_xref.h118
-rw-r--r--openssl-1.1.0h/crypto/objects/obj_xref.txt60
-rw-r--r--openssl-1.1.0h/crypto/objects/objects.pl193
-rw-r--r--openssl-1.1.0h/crypto/objects/objects.txt1485
-rw-r--r--openssl-1.1.0h/crypto/objects/objxref.pl135
-rw-r--r--openssl-1.1.0h/crypto/ocsp/build.info4
-rw-r--r--openssl-1.1.0h/crypto/ocsp/ocsp_asn.c135
-rw-r--r--openssl-1.1.0h/crypto/ocsp/ocsp_cl.c365
-rw-r--r--openssl-1.1.0h/crypto/ocsp/ocsp_err.c91
-rw-r--r--openssl-1.1.0h/crypto/ocsp/ocsp_ext.c472
-rw-r--r--openssl-1.1.0h/crypto/ocsp/ocsp_ht.c502
-rw-r--r--openssl-1.1.0h/crypto/ocsp/ocsp_lcl.h232
-rw-r--r--openssl-1.1.0h/crypto/ocsp/ocsp_lib.c222
-rw-r--r--openssl-1.1.0h/crypto/ocsp/ocsp_prn.c246
-rw-r--r--openssl-1.1.0h/crypto/ocsp/ocsp_srv.c277
-rw-r--r--openssl-1.1.0h/crypto/ocsp/ocsp_vfy.c435
-rw-r--r--openssl-1.1.0h/crypto/ocsp/v3_ocsp.c264
-rw-r--r--openssl-1.1.0h/crypto/pariscid.pl263
-rw-r--r--openssl-1.1.0h/crypto/pem/build.info4
-rw-r--r--openssl-1.1.0h/crypto/pem/pem_all.c181
-rw-r--r--openssl-1.1.0h/crypto/pem/pem_err.c115
-rw-r--r--openssl-1.1.0h/crypto/pem/pem_info.c334
-rw-r--r--openssl-1.1.0h/crypto/pem/pem_lib.c857
-rw-r--r--openssl-1.1.0h/crypto/pem/pem_oth.c36
-rw-r--r--openssl-1.1.0h/crypto/pem/pem_pk8.c214
-rw-r--r--openssl-1.1.0h/crypto/pem/pem_pkey.c244
-rw-r--r--openssl-1.1.0h/crypto/pem/pem_sign.c50
-rw-r--r--openssl-1.1.0h/crypto/pem/pem_x509.c18
-rw-r--r--openssl-1.1.0h/crypto/pem/pem_xaux.c18
-rw-r--r--openssl-1.1.0h/crypto/pem/pvkfmt.c881
-rw-r--r--openssl-1.1.0h/crypto/perlasm/README124
-rwxr-xr-xopenssl-1.1.0h/crypto/perlasm/arm-xlate.pl177
-rw-r--r--openssl-1.1.0h/crypto/perlasm/cbc.pl356
-rwxr-xr-xopenssl-1.1.0h/crypto/perlasm/ppc-xlate.pl265
-rw-r--r--openssl-1.1.0h/crypto/perlasm/sparcv9_modes.pl1702
-rwxr-xr-xopenssl-1.1.0h/crypto/perlasm/x86_64-xlate.pl1186
-rw-r--r--openssl-1.1.0h/crypto/perlasm/x86asm.pl310
-rw-r--r--openssl-1.1.0h/crypto/perlasm/x86gas.pl265
-rw-r--r--openssl-1.1.0h/crypto/perlasm/x86masm.pl207
-rw-r--r--openssl-1.1.0h/crypto/perlasm/x86nasm.pl186
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/build.info5
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_add.c164
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_asn.c76
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_attr.c103
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_crpt.c70
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_crt.c291
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_decr.c155
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_init.c43
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_key.c205
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_kiss.c250
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_lcl.h43
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_mutl.c242
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_npas.c184
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_p8d.c23
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_p8e.c69
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_sbag.c170
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/p12_utl.c237
-rw-r--r--openssl-1.1.0h/crypto/pkcs12/pk12err.c95
-rw-r--r--openssl-1.1.0h/crypto/pkcs7/bio_pk7.c24
-rw-r--r--openssl-1.1.0h/crypto/pkcs7/build.info4
-rw-r--r--openssl-1.1.0h/crypto/pkcs7/pk7_asn1.c202
-rw-r--r--openssl-1.1.0h/crypto/pkcs7/pk7_attr.c121
-rw-r--r--openssl-1.1.0h/crypto/pkcs7/pk7_dgst.c15
-rw-r--r--openssl-1.1.0h/crypto/pkcs7/pk7_doit.c1180
-rw-r--r--openssl-1.1.0h/crypto/pkcs7/pk7_enc.c25
-rw-r--r--openssl-1.1.0h/crypto/pkcs7/pk7_lib.c589
-rw-r--r--openssl-1.1.0h/crypto/pkcs7/pk7_mime.c49
-rw-r--r--openssl-1.1.0h/crypto/pkcs7/pk7_smime.c549
-rw-r--r--openssl-1.1.0h/crypto/pkcs7/pkcs7err.c131
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-armv4.pl1252
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-armv8.pl943
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-c64xplus.pl331
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-mips.pl425
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-ppc.pl644
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-ppcfp.pl739
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-s390x.pl227
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-sparcv9.pl1120
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-x86.pl1814
-rwxr-xr-xopenssl-1.1.0h/crypto/poly1305/asm/poly1305-x86_64.pl2268
-rw-r--r--openssl-1.1.0h/crypto/poly1305/build.info20
-rw-r--r--openssl-1.1.0h/crypto/poly1305/poly1305.c1037
-rw-r--r--openssl-1.1.0h/crypto/poly1305/poly1305_ieee754.c472
-rw-r--r--openssl-1.1.0h/crypto/ppc_arch.h26
-rw-r--r--openssl-1.1.0h/crypto/ppccap.c343
-rwxr-xr-xopenssl-1.1.0h/crypto/ppccpuid.pl301
-rw-r--r--openssl-1.1.0h/crypto/rand/build.info4
-rw-r--r--openssl-1.1.0h/crypto/rand/md_rand.c667
-rw-r--r--openssl-1.1.0h/crypto/rand/rand_egd.c249
-rw-r--r--openssl-1.1.0h/crypto/rand/rand_err.c43
-rw-r--r--openssl-1.1.0h/crypto/rand/rand_lcl.h46
-rw-r--r--openssl-1.1.0h/crypto/rand/rand_lib.c164
-rw-r--r--openssl-1.1.0h/crypto/rand/rand_unix.c324
-rw-r--r--openssl-1.1.0h/crypto/rand/rand_vms.c133
-rw-r--r--openssl-1.1.0h/crypto/rand/rand_win.c135
-rw-r--r--openssl-1.1.0h/crypto/rand/randfile.c364
-rw-r--r--openssl-1.1.0h/crypto/rc2/build.info3
-rw-r--r--openssl-1.1.0h/crypto/rc2/rc2_cbc.c179
-rw-r--r--openssl-1.1.0h/crypto/rc2/rc2_ecb.c41
-rw-r--r--openssl-1.1.0h/crypto/rc2/rc2_locl.h134
-rw-r--r--openssl-1.1.0h/crypto/rc2/rc2_skey.c98
-rw-r--r--openssl-1.1.0h/crypto/rc2/rc2cfb64.c74
-rw-r--r--openssl-1.1.0h/crypto/rc2/rc2ofb64.c61
-rw-r--r--openssl-1.1.0h/crypto/rc2/tab.c93
-rw-r--r--openssl-1.1.0h/crypto/rc4/asm/rc4-586.pl428
-rw-r--r--openssl-1.1.0h/crypto/rc4/asm/rc4-c64xplus.pl192
-rw-r--r--openssl-1.1.0h/crypto/rc4/asm/rc4-ia64.pl767
-rw-r--r--openssl-1.1.0h/crypto/rc4/asm/rc4-md5-x86_64.pl645
-rw-r--r--openssl-1.1.0h/crypto/rc4/asm/rc4-parisc.pl321
-rw-r--r--openssl-1.1.0h/crypto/rc4/asm/rc4-s390x.pl241
-rwxr-xr-xopenssl-1.1.0h/crypto/rc4/asm/rc4-x86_64.pl687
-rw-r--r--openssl-1.1.0h/crypto/rc4/build.info33
-rw-r--r--openssl-1.1.0h/crypto/rc4/rc4_enc.c86
-rw-r--r--openssl-1.1.0h/crypto/rc4/rc4_locl.h16
-rw-r--r--openssl-1.1.0h/crypto/rc4/rc4_skey.c58
-rw-r--r--openssl-1.1.0h/crypto/rc5/asm/rc5-586.pl122
-rw-r--r--openssl-1.1.0h/crypto/rc5/build.info6
-rw-r--r--openssl-1.1.0h/crypto/rc5/rc5_ecb.c32
-rw-r--r--openssl-1.1.0h/crypto/rc5/rc5_enc.c160
-rw-r--r--openssl-1.1.0h/crypto/rc5/rc5_locl.h186
-rw-r--r--openssl-1.1.0h/crypto/rc5/rc5_skey.c61
-rw-r--r--openssl-1.1.0h/crypto/rc5/rc5cfb64.c74
-rw-r--r--openssl-1.1.0h/crypto/rc5/rc5ofb64.c61
-rw-r--r--openssl-1.1.0h/crypto/ripemd/asm/rmd-586.pl603
-rw-r--r--openssl-1.1.0h/crypto/ripemd/build.info6
-rw-r--r--openssl-1.1.0h/crypto/ripemd/rmd_dgst.c282
-rw-r--r--openssl-1.1.0h/crypto/ripemd/rmd_locl.h88
-rw-r--r--openssl-1.1.0h/crypto/ripemd/rmd_one.c28
-rw-r--r--openssl-1.1.0h/crypto/ripemd/rmdconst.h350
-rw-r--r--openssl-1.1.0h/crypto/rsa/build.info6
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_ameth.c867
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_asn1.c81
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_chk.c156
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_crpt.c178
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_depr.c61
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_err.c185
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_gen.c218
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_lib.c288
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_locl.h96
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_meth.c273
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_none.c43
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_null.c93
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_oaep.c286
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_ossl.c800
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_pk1.c243
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_pmeth.c668
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_prn.c42
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_pss.c253
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_saos.c94
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_sign.c248
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_ssl.c100
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_x931.c116
-rw-r--r--openssl-1.1.0h/crypto/rsa/rsa_x931g.c197
-rw-r--r--openssl-1.1.0h/crypto/s390xcap.c50
-rw-r--r--openssl-1.1.0h/crypto/s390xcpuid.S178
-rw-r--r--openssl-1.1.0h/crypto/seed/build.info2
-rw-r--r--openssl-1.1.0h/crypto/seed/seed.c590
-rw-r--r--openssl-1.1.0h/crypto/seed/seed_cbc.c23
-rw-r--r--openssl-1.1.0h/crypto/seed/seed_cfb.c20
-rw-r--r--openssl-1.1.0h/crypto/seed/seed_ecb.c19
-rw-r--r--openssl-1.1.0h/crypto/seed/seed_locl.h120
-rw-r--r--openssl-1.1.0h/crypto/seed/seed_ofb.c19
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-586.pl1488
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-alpha.pl329
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-armv4-large.pl742
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-armv8.pl363
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-c64xplus.pl337
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-ia64.pl314
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-mb-x86_64.pl1582
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-mips.pl457
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-parisc.pl267
-rwxr-xr-xopenssl-1.1.0h/crypto/sha/asm/sha1-ppc.pl351
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-s390x.pl247
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9.pl434
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-sparcv9a.pl608
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha1-thumb.pl266
-rwxr-xr-xopenssl-1.1.0h/crypto/sha/asm/sha1-x86_64.pl2077
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha256-586.pl1293
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha256-armv4.pl732
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha256-c64xplus.pl320
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha256-mb-x86_64.pl1568
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha512-586.pl924
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha512-armv4.pl668
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha512-armv8.pl446
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha512-c64xplus.pl438
-rwxr-xr-xopenssl-1.1.0h/crypto/sha/asm/sha512-ia64.pl692
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha512-mips.pl519
-rwxr-xr-xopenssl-1.1.0h/crypto/sha/asm/sha512-parisc.pl800
-rwxr-xr-xopenssl-1.1.0h/crypto/sha/asm/sha512-ppc.pl799
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha512-s390x.pl322
-rw-r--r--openssl-1.1.0h/crypto/sha/asm/sha512-sparcv9.pl857
-rwxr-xr-xopenssl-1.1.0h/crypto/sha/asm/sha512-x86_64.pl2407
-rwxr-xr-xopenssl-1.1.0h/crypto/sha/asm/sha512p8-ppc.pl431
-rw-r--r--openssl-1.1.0h/crypto/sha/build.info69
-rw-r--r--openssl-1.1.0h/crypto/sha/sha1_one.c28
-rw-r--r--openssl-1.1.0h/crypto/sha/sha1dgst.c17
-rw-r--r--openssl-1.1.0h/crypto/sha/sha256.c386
-rw-r--r--openssl-1.1.0h/crypto/sha/sha512.c692
-rw-r--r--openssl-1.1.0h/crypto/sha/sha_locl.h424
-rw-r--r--openssl-1.1.0h/crypto/sparc_arch.h118
-rw-r--r--openssl-1.1.0h/crypto/sparccpuid.S582
-rw-r--r--openssl-1.1.0h/crypto/sparcv9cap.c294
-rw-r--r--openssl-1.1.0h/crypto/srp/build.info2
-rw-r--r--openssl-1.1.0h/crypto/srp/srp_lib.c280
-rw-r--r--openssl-1.1.0h/crypto/srp/srp_vfy.c680
-rw-r--r--openssl-1.1.0h/crypto/stack/build.info2
-rw-r--r--openssl-1.1.0h/crypto/stack/stack.c312
-rw-r--r--openssl-1.1.0h/crypto/threads_none.c124
-rw-r--r--openssl-1.1.0h/crypto/threads_pthread.c171
-rw-r--r--openssl-1.1.0h/crypto/threads_win.c136
-rw-r--r--openssl-1.1.0h/crypto/ts/build.info5
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_asn1.c259
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_conf.c468
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_err.c144
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_lcl.h183
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_lib.c93
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_req_print.c51
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_req_utils.c183
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_rsp_print.c195
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_rsp_sign.c904
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_rsp_utils.c365
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_rsp_verify.c635
-rw-r--r--openssl-1.1.0h/crypto/ts/ts_verify_ctx.c146
-rw-r--r--openssl-1.1.0h/crypto/txt_db/build.info2
-rw-r--r--openssl-1.1.0h/crypto/txt_db/txt_db.c318
-rw-r--r--openssl-1.1.0h/crypto/ui/build.info3
-rw-r--r--openssl-1.1.0h/crypto/ui/ui_err.c72
-rw-r--r--openssl-1.1.0h/crypto/ui/ui_lib.c812
-rw-r--r--openssl-1.1.0h/crypto/ui/ui_locl.h97
-rw-r--r--openssl-1.1.0h/crypto/ui/ui_openssl.c712
-rw-r--r--openssl-1.1.0h/crypto/ui/ui_util.c51
-rw-r--r--openssl-1.1.0h/crypto/uid.c42
-rw-r--r--openssl-1.1.0h/crypto/vms_rms.h58
-rw-r--r--openssl-1.1.0h/crypto/whrlpool/asm/wp-mmx.pl507
-rw-r--r--openssl-1.1.0h/crypto/whrlpool/asm/wp-x86_64.pl600
-rw-r--r--openssl-1.1.0h/crypto/whrlpool/build.info7
-rw-r--r--openssl-1.1.0h/crypto/whrlpool/wp_block.c792
-rw-r--r--openssl-1.1.0h/crypto/whrlpool/wp_dgst.c266
-rw-r--r--openssl-1.1.0h/crypto/whrlpool/wp_locl.h12
-rw-r--r--openssl-1.1.0h/crypto/x509/build.info10
-rw-r--r--openssl-1.1.0h/crypto/x509/by_dir.c386
-rw-r--r--openssl-1.1.0h/crypto/x509/by_file.c221
-rw-r--r--openssl-1.1.0h/crypto/x509/t_crl.c89
-rw-r--r--openssl-1.1.0h/crypto/x509/t_req.c198
-rw-r--r--openssl-1.1.0h/crypto/x509/t_x509.c376
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_att.c329
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_cmp.c459
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_d2.c57
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_def.c43
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_err.c142
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_ext.c160
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_lcl.h142
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_lu.c846
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_obj.c182
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_r2x.c67
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_req.c298
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_set.c159
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_trs.c298
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_txt.c177
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_v3.c235
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_vfy.c3277
-rw-r--r--openssl-1.1.0h/crypto/x509/x509_vpm.c617
-rw-r--r--openssl-1.1.0h/crypto/x509/x509cset.c182
-rw-r--r--openssl-1.1.0h/crypto/x509/x509name.c358
-rw-r--r--openssl-1.1.0h/crypto/x509/x509rset.c40
-rw-r--r--openssl-1.1.0h/crypto/x509/x509spki.c75
-rw-r--r--openssl-1.1.0h/crypto/x509/x509type.c77
-rw-r--r--openssl-1.1.0h/crypto/x509/x_all.c526
-rw-r--r--openssl-1.1.0h/crypto/x509/x_attrib.c55
-rw-r--r--openssl-1.1.0h/crypto/x509/x_crl.c459
-rw-r--r--openssl-1.1.0h/crypto/x509/x_exten.c28
-rw-r--r--openssl-1.1.0h/crypto/x509/x_name.c549
-rw-r--r--openssl-1.1.0h/crypto/x509/x_pubkey.c374
-rw-r--r--openssl-1.1.0h/crypto/x509/x_req.c68
-rw-r--r--openssl-1.1.0h/crypto/x509/x_x509.c224
-rw-r--r--openssl-1.1.0h/crypto/x509/x_x509a.c169
-rw-r--r--openssl-1.1.0h/crypto/x509v3/build.info8
-rw-r--r--openssl-1.1.0h/crypto/x509v3/ext_dat.h24
-rw-r--r--openssl-1.1.0h/crypto/x509v3/pcy_cache.c216
-rw-r--r--openssl-1.1.0h/crypto/x509v3/pcy_data.c77
-rw-r--r--openssl-1.1.0h/crypto/x509v3/pcy_int.h167
-rw-r--r--openssl-1.1.0h/crypto/x509v3/pcy_lib.c108
-rw-r--r--openssl-1.1.0h/crypto/x509v3/pcy_map.c81
-rw-r--r--openssl-1.1.0h/crypto/x509v3/pcy_node.c139
-rw-r--r--openssl-1.1.0h/crypto/x509v3/pcy_tree.c700
-rw-r--r--openssl-1.1.0h/crypto/x509v3/tabtest.c42
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_addr.c1307
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_akey.c160
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_akeya.c23
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_alt.c576
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_asid.c852
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_bcons.c84
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_bitst.c93
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_conf.c511
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_cpols.c447
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_crld.c509
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_enum.c53
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_extku.c100
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_genn.c201
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_ia5.c65
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_info.c164
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_int.c43
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_lib.c371
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_ncons.c582
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_pci.c321
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_pcia.c60
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_pcons.c91
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_pku.c65
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_pmaps.c110
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_prn.c210
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_purp.c873
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_skey.c106
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_sxnet.c226
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_tlsf.c137
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3_utl.c1238
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3conf.c79
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3err.c188
-rw-r--r--openssl-1.1.0h/crypto/x509v3/v3prin.c50
-rw-r--r--openssl-1.1.0h/crypto/x86_64cpuid.pl475
-rw-r--r--openssl-1.1.0h/crypto/x86cpuid.pl559
961 files changed, 386215 insertions, 0 deletions
diff --git a/openssl-1.1.0h/crypto/LPdir_nyi.c b/openssl-1.1.0h/crypto/LPdir_nyi.c
new file mode 100644
index 0000000..049044c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/LPdir_nyi.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Copyright (c) 2004, Richard Levitte <richard@levitte.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef LPDIR_H
+# include "LPdir.h"
+#endif
+
+struct LP_dir_context_st {
+ void *dummy;
+};
+const char *LP_find_file(LP_DIR_CTX **ctx, const char *directory)
+{
+ errno = EINVAL;
+ return 0;
+}
+
+int LP_find_file_end(LP_DIR_CTX **ctx)
+{
+ errno = EINVAL;
+ return 0;
+}
diff --git a/openssl-1.1.0h/crypto/LPdir_unix.c b/openssl-1.1.0h/crypto/LPdir_unix.c
new file mode 100644
index 0000000..1bb2940
--- /dev/null
+++ b/openssl-1.1.0h/crypto/LPdir_unix.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Copyright (c) 2004, Richard Levitte <richard@levitte.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <errno.h>
+#ifndef LPDIR_H
+# include "LPdir.h"
+#endif
+
+/*
+ * The POSIXly macro for the maximum number of characters in a file path is
+ * NAME_MAX. However, some operating systems use PATH_MAX instead.
+ * Therefore, it seems natural to first check for PATH_MAX and use that, and
+ * if it doesn't exist, use NAME_MAX.
+ */
+#if defined(PATH_MAX)
+# define LP_ENTRY_SIZE PATH_MAX
+#elif defined(NAME_MAX)
+# define LP_ENTRY_SIZE NAME_MAX
+#endif
+
+/*
+ * Of course, there's the possibility that neither PATH_MAX nor NAME_MAX
+ * exist. It's also possible that NAME_MAX exists but is define to a very
+ * small value (HP-UX offers 14), so we need to check if we got a result, and
+ * if it meets a minimum standard, and create or change it if not.
+ */
+#if !defined(LP_ENTRY_SIZE) || LP_ENTRY_SIZE<255
+# undef LP_ENTRY_SIZE
+# define LP_ENTRY_SIZE 255
+#endif
+
+struct LP_dir_context_st {
+ DIR *dir;
+ char entry_name[LP_ENTRY_SIZE + 1];
+};
+
+const char *LP_find_file(LP_DIR_CTX **ctx, const char *directory)
+{
+ struct dirent *direntry = NULL;
+
+ if (ctx == NULL || directory == NULL) {
+ errno = EINVAL;
+ return 0;
+ }
+
+ errno = 0;
+ if (*ctx == NULL) {
+ *ctx = malloc(sizeof(**ctx));
+ if (*ctx == NULL) {
+ errno = ENOMEM;
+ return 0;
+ }
+ memset(*ctx, 0, sizeof(**ctx));
+
+ (*ctx)->dir = opendir(directory);
+ if ((*ctx)->dir == NULL) {
+ int save_errno = errno; /* Probably not needed, but I'm paranoid */
+ free(*ctx);
+ *ctx = NULL;
+ errno = save_errno;
+ return 0;
+ }
+ }
+
+ direntry = readdir((*ctx)->dir);
+ if (direntry == NULL) {
+ return 0;
+ }
+
+ strncpy((*ctx)->entry_name, direntry->d_name,
+ sizeof((*ctx)->entry_name) - 1);
+ (*ctx)->entry_name[sizeof((*ctx)->entry_name) - 1] = '\0';
+ return (*ctx)->entry_name;
+}
+
+int LP_find_file_end(LP_DIR_CTX **ctx)
+{
+ if (ctx != NULL && *ctx != NULL) {
+ int ret = closedir((*ctx)->dir);
+
+ free(*ctx);
+ switch (ret) {
+ case 0:
+ return 1;
+ case -1:
+ return 0;
+ default:
+ break;
+ }
+ }
+ errno = EINVAL;
+ return 0;
+}
diff --git a/openssl-1.1.0h/crypto/LPdir_vms.c b/openssl-1.1.0h/crypto/LPdir_vms.c
new file mode 100644
index 0000000..1a5b60f
--- /dev/null
+++ b/openssl-1.1.0h/crypto/LPdir_vms.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Copyright (c) 2004, Richard Levitte <richard@levitte.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <descrip.h>
+#include <namdef.h>
+#include <rmsdef.h>
+#include <libfildef.h>
+#include <lib$routines.h>
+#include <strdef.h>
+#include <str$routines.h>
+#include <stsdef.h>
+#ifndef LPDIR_H
+# include "LPdir.h"
+#endif
+#include "vms_rms.h"
+
+/* Some compiler options hide EVMSERR. */
+#ifndef EVMSERR
+# define EVMSERR 65535 /* error for non-translatable VMS errors */
+#endif
+
+struct LP_dir_context_st {
+ unsigned long VMS_context;
+ char filespec[NAMX_MAXRSS + 1];
+ char result[NAMX_MAXRSS + 1];
+ struct dsc$descriptor_d filespec_dsc;
+ struct dsc$descriptor_d result_dsc;
+};
+
+const char *LP_find_file(LP_DIR_CTX **ctx, const char *directory)
+{
+ int status;
+ char *p, *r;
+ size_t l;
+ unsigned long flags = 0;
+
+/* Arrange 32-bit pointer to (copied) string storage, if needed. */
+#if __INITIAL_POINTER_SIZE == 64
+# pragma pointer_size save
+# pragma pointer_size 32
+ char *ctx_filespec_32p;
+# pragma pointer_size restore
+ char ctx_filespec_32[NAMX_MAXRSS + 1];
+#endif /* __INITIAL_POINTER_SIZE == 64 */
+
+#ifdef NAML$C_MAXRSS
+ flags |= LIB$M_FIL_LONG_NAMES;
+#endif
+
+ if (ctx == NULL || directory == NULL) {
+ errno = EINVAL;
+ return 0;
+ }
+
+ errno = 0;
+ if (*ctx == NULL) {
+ size_t filespeclen = strlen(directory);
+ char *filespec = NULL;
+
+ if (filespeclen == 0) {
+ errno = ENOENT;
+ return 0;
+ }
+
+ /* MUST be a VMS directory specification! Let's estimate if it is. */
+ if (directory[filespeclen - 1] != ']'
+ && directory[filespeclen - 1] != '>'
+ && directory[filespeclen - 1] != ':') {
+ errno = EINVAL;
+ return 0;
+ }
+
+ filespeclen += 4; /* "*.*;" */
+
+ if (filespeclen > NAMX_MAXRSS) {
+ errno = ENAMETOOLONG;
+ return 0;
+ }
+
+ *ctx = malloc(sizeof(**ctx));
+ if (*ctx == NULL) {
+ errno = ENOMEM;
+ return 0;
+ }
+ memset(*ctx, 0, sizeof(**ctx));
+
+ strcpy((*ctx)->filespec, directory);
+ strcat((*ctx)->filespec, "*.*;");
+
+/* Arrange 32-bit pointer to (copied) string storage, if needed. */
+#if __INITIAL_POINTER_SIZE == 64
+# define CTX_FILESPEC ctx_filespec_32p
+ /* Copy the file name to storage with a 32-bit pointer. */
+ ctx_filespec_32p = ctx_filespec_32;
+ strcpy(ctx_filespec_32p, (*ctx)->filespec);
+#else /* __INITIAL_POINTER_SIZE == 64 */
+# define CTX_FILESPEC (*ctx)->filespec
+#endif /* __INITIAL_POINTER_SIZE == 64 [else] */
+
+ (*ctx)->filespec_dsc.dsc$w_length = filespeclen;
+ (*ctx)->filespec_dsc.dsc$b_dtype = DSC$K_DTYPE_T;
+ (*ctx)->filespec_dsc.dsc$b_class = DSC$K_CLASS_S;
+ (*ctx)->filespec_dsc.dsc$a_pointer = CTX_FILESPEC;
+ }
+
+ (*ctx)->result_dsc.dsc$w_length = 0;
+ (*ctx)->result_dsc.dsc$b_dtype = DSC$K_DTYPE_T;
+ (*ctx)->result_dsc.dsc$b_class = DSC$K_CLASS_D;
+ (*ctx)->result_dsc.dsc$a_pointer = 0;
+
+ status = lib$find_file(&(*ctx)->filespec_dsc, &(*ctx)->result_dsc,
+ &(*ctx)->VMS_context, 0, 0, 0, &flags);
+
+ if (status == RMS$_NMF) {
+ errno = 0;
+ vaxc$errno = status;
+ return NULL;
+ }
+
+ if (!$VMS_STATUS_SUCCESS(status)) {
+ errno = EVMSERR;
+ vaxc$errno = status;
+ return NULL;
+ }
+
+ /*
+ * Quick, cheap and dirty way to discard any device and directory, since
+ * we only want file names
+ */
+ l = (*ctx)->result_dsc.dsc$w_length;
+ p = (*ctx)->result_dsc.dsc$a_pointer;
+ r = p;
+ for (; *p; p++) {
+ if (*p == '^' && p[1] != '\0') { /* Take care of ODS-5 escapes */
+ p++;
+ } else if (*p == ':' || *p == '>' || *p == ']') {
+ l -= p + 1 - r;
+ r = p + 1;
+ } else if (*p == ';') {
+ l = p - r;
+ break;
+ }
+ }
+
+ strncpy((*ctx)->result, r, l);
+ (*ctx)->result[l] = '\0';
+ str$free1_dx(&(*ctx)->result_dsc);
+
+ return (*ctx)->result;
+}
+
+int LP_find_file_end(LP_DIR_CTX **ctx)
+{
+ if (ctx != NULL && *ctx != NULL) {
+ int status = lib$find_file_end(&(*ctx)->VMS_context);
+
+ free(*ctx);
+
+ if (!$VMS_STATUS_SUCCESS(status)) {
+ errno = EVMSERR;
+ vaxc$errno = status;
+ return 0;
+ }
+ return 1;
+ }
+ errno = EINVAL;
+ return 0;
+}
diff --git a/openssl-1.1.0h/crypto/LPdir_win.c b/openssl-1.1.0h/crypto/LPdir_win.c
new file mode 100644
index 0000000..8f674d3
--- /dev/null
+++ b/openssl-1.1.0h/crypto/LPdir_win.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Copyright (c) 2004, Richard Levitte <richard@levitte.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <windows.h>
+#include <tchar.h>
+#include "internal/numbers.h"
+#ifndef LPDIR_H
+# include "LPdir.h"
+#endif
+
+/*
+ * We're most likely overcautious here, but let's reserve for broken WinCE
+ * headers and explicitly opt for UNICODE call. Keep in mind that our WinCE
+ * builds are compiled with -DUNICODE [as well as -D_UNICODE].
+ */
+#if defined(LP_SYS_WINCE) && !defined(FindFirstFile)
+# define FindFirstFile FindFirstFileW
+#endif
+#if defined(LP_SYS_WINCE) && !defined(FindNextFile)
+# define FindNextFile FindNextFileW
+#endif
+
+#ifndef NAME_MAX
+# define NAME_MAX 255
+#endif
+
+#ifdef CP_UTF8
+# define CP_DEFAULT CP_UTF8
+#else
+# define CP_DEFAULT CP_ACP
+#endif
+
+struct LP_dir_context_st {
+ WIN32_FIND_DATA ctx;
+ HANDLE handle;
+ char entry_name[NAME_MAX + 1];
+};
+
+const char *LP_find_file(LP_DIR_CTX **ctx, const char *directory)
+{
+ if (ctx == NULL || directory == NULL) {
+ errno = EINVAL;
+ return 0;
+ }
+
+ errno = 0;
+ if (*ctx == NULL) {
+ size_t dirlen = strlen(directory);
+
+ if (dirlen == 0 || dirlen > INT_MAX - 3) {
+ errno = ENOENT;
+ return 0;
+ }
+
+ *ctx = malloc(sizeof(**ctx));
+ if (*ctx == NULL) {
+ errno = ENOMEM;
+ return 0;
+ }
+ memset(*ctx, 0, sizeof(**ctx));
+
+ if (sizeof(TCHAR) != sizeof(char)) {
+ TCHAR *wdir = NULL;
+ /* len_0 denotes string length *with* trailing 0 */
+ size_t index = 0, len_0 = dirlen + 1;
+#ifdef LP_MULTIBYTE_AVAILABLE
+ int sz = 0;
+ UINT cp;
+
+ do {
+# ifdef CP_UTF8
+ if ((sz = MultiByteToWideChar((cp = CP_UTF8), 0,
+ directory, len_0,
+ NULL, 0)) > 0 ||
+ GetLastError() != ERROR_NO_UNICODE_TRANSLATION)
+ break;
+# endif
+ sz = MultiByteToWideChar((cp = CP_ACP), 0,
+ directory, len_0,
+ NULL, 0);
+ } while (0);
+
+ if (sz > 0) {
+ /*
+ * allocate two additional characters in case we need to
+ * concatenate asterisk, |sz| covers trailing '\0'!
+ */
+ wdir = _alloca((sz + 2) * sizeof(TCHAR));
+ if (!MultiByteToWideChar(cp, 0, directory, len_0,
+ (WCHAR *)wdir, sz)) {
+ free(*ctx);
+ *ctx = NULL;
+ errno = EINVAL;
+ return 0;
+ }
+ } else
+#endif
+ {
+ sz = len_0;
+ /*
+ * allocate two additional characters in case we need to
+ * concatenate asterisk, |sz| covers trailing '\0'!
+ */
+ wdir = _alloca((sz + 2) * sizeof(TCHAR));
+ for (index = 0; index < len_0; index++)
+ wdir[index] = (TCHAR)directory[index];
+ }
+
+ sz--; /* wdir[sz] is trailing '\0' now */
+ if (wdir[sz - 1] != TEXT('*')) {
+ if (wdir[sz - 1] != TEXT('/') && wdir[sz - 1] != TEXT('\\'))
+ _tcscpy(wdir + sz, TEXT("/*"));
+ else
+ _tcscpy(wdir + sz, TEXT("*"));
+ }
+
+ (*ctx)->handle = FindFirstFile(wdir, &(*ctx)->ctx);
+ } else {
+ if (directory[dirlen - 1] != '*') {
+ char *buf = _alloca(dirlen + 3);
+
+ strcpy(buf, directory);
+ if (buf[dirlen - 1] != '/' && buf[dirlen - 1] != '\\')
+ strcpy(buf + dirlen, "/*");
+ else
+ strcpy(buf + dirlen, "*");
+
+ directory = buf;
+ }
+
+ (*ctx)->handle = FindFirstFile((TCHAR *)directory, &(*ctx)->ctx);
+ }
+
+ if ((*ctx)->handle == INVALID_HANDLE_VALUE) {
+ free(*ctx);
+ *ctx = NULL;
+ errno = EINVAL;
+ return 0;
+ }
+ } else {
+ if (FindNextFile((*ctx)->handle, &(*ctx)->ctx) == FALSE) {
+ return 0;
+ }
+ }
+ if (sizeof(TCHAR) != sizeof(char)) {
+ TCHAR *wdir = (*ctx)->ctx.cFileName;
+ size_t index, len_0 = 0;
+
+ while (wdir[len_0] && len_0 < (sizeof((*ctx)->entry_name) - 1))
+ len_0++;
+ len_0++;
+
+#ifdef LP_MULTIBYTE_AVAILABLE
+ if (!WideCharToMultiByte(CP_DEFAULT, 0, (WCHAR *)wdir, len_0,
+ (*ctx)->entry_name,
+ sizeof((*ctx)->entry_name), NULL, 0))
+#endif
+ for (index = 0; index < len_0; index++)
+ (*ctx)->entry_name[index] = (char)wdir[index];
+ } else
+ strncpy((*ctx)->entry_name, (const char *)(*ctx)->ctx.cFileName,
+ sizeof((*ctx)->entry_name) - 1);
+
+ (*ctx)->entry_name[sizeof((*ctx)->entry_name) - 1] = '\0';
+
+ return (*ctx)->entry_name;
+}
+
+int LP_find_file_end(LP_DIR_CTX **ctx)
+{
+ if (ctx != NULL && *ctx != NULL) {
+ FindClose((*ctx)->handle);
+ free(*ctx);
+ *ctx = NULL;
+ return 1;
+ }
+ errno = EINVAL;
+ return 0;
+}
diff --git a/openssl-1.1.0h/crypto/LPdir_win32.c b/openssl-1.1.0h/crypto/LPdir_win32.c
new file mode 100644
index 0000000..59ed485
--- /dev/null
+++ b/openssl-1.1.0h/crypto/LPdir_win32.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Copyright (c) 2004, Richard Levitte <richard@levitte.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define LP_SYS_WIN32
+#define LP_MULTIBYTE_AVAILABLE
+#include "LPdir_win.c"
diff --git a/openssl-1.1.0h/crypto/LPdir_wince.c b/openssl-1.1.0h/crypto/LPdir_wince.c
new file mode 100644
index 0000000..dbc1052
--- /dev/null
+++ b/openssl-1.1.0h/crypto/LPdir_wince.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Copyright (c) 2004, Richard Levitte <richard@levitte.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define LP_SYS_WINCE
+/*
+ * We might want to define LP_MULTIBYTE_AVAILABLE here. It's currently under
+ * investigation what the exact conditions would be
+ */
+#include "LPdir_win.c"
diff --git a/openssl-1.1.0h/crypto/aes/aes_cbc.c b/openssl-1.1.0h/crypto/aes/aes_cbc.c
new file mode 100644
index 0000000..342841f
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/aes_cbc.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/aes.h>
+#include <openssl/modes.h>
+
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec, const int enc)
+{
+
+ if (enc)
+ CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
+ (block128_f) AES_encrypt);
+ else
+ CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
+ (block128_f) AES_decrypt);
+}
diff --git a/openssl-1.1.0h/crypto/aes/aes_cfb.c b/openssl-1.1.0h/crypto/aes/aes_cfb.c
new file mode 100644
index 0000000..f010e3c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/aes_cfb.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/aes.h>
+#include <openssl/modes.h>
+
+/*
+ * The input and output encrypted as though 128bit cfb mode is being used.
+ * The extra state information to record how much of the 128bit block we have
+ * used is contained in *num;
+ */
+
+void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char *ivec, int *num, const int enc)
+{
+
+ CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc,
+ (block128_f) AES_encrypt);
+}
+
+/* N.B. This expects the input to be packed, MS bit first */
+void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char *ivec, int *num, const int enc)
+{
+ CRYPTO_cfb128_1_encrypt(in, out, length, key, ivec, num, enc,
+ (block128_f) AES_encrypt);
+}
+
+void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char *ivec, int *num, const int enc)
+{
+ CRYPTO_cfb128_8_encrypt(in, out, length, key, ivec, num, enc,
+ (block128_f) AES_encrypt);
+}
diff --git a/openssl-1.1.0h/crypto/aes/aes_core.c b/openssl-1.1.0h/crypto/aes/aes_core.c
new file mode 100644
index 0000000..bd5c779
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/aes_core.c
@@ -0,0 +1,1367 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/**
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Note: rewritten a little bit to provide error control and an OpenSSL-
+ compatible API */
+
+#include <assert.h>
+
+#include <stdlib.h>
+#include <openssl/crypto.h>
+#include <openssl/aes.h>
+#include "aes_locl.h"
+
+#ifndef AES_ASM
+/*-
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01];
+*/
+
+static const u32 Te0[256] = {
+ 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
+ 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
+ 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
+ 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
+ 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
+ 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
+ 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
+ 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
+ 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
+ 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
+ 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
+ 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
+ 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
+ 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
+ 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
+ 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
+ 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
+ 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
+ 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
+ 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
+ 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
+ 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
+ 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
+ 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
+ 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
+ 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
+ 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
+ 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
+ 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
+ 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
+ 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
+ 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
+ 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
+ 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
+ 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
+ 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
+ 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
+ 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
+ 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
+ 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
+ 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
+ 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
+ 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
+ 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
+ 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
+ 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
+ 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
+ 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
+ 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
+ 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
+ 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
+ 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
+ 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
+ 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
+ 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
+ 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
+ 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
+ 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
+ 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
+ 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
+ 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
+ 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
+ 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
+ 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
+};
+static const u32 Te1[256] = {
+ 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
+ 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
+ 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
+ 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
+ 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
+ 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
+ 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
+ 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
+ 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
+ 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
+ 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
+ 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
+ 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
+ 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
+ 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
+ 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
+ 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
+ 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
+ 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
+ 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
+ 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
+ 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
+ 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
+ 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
+ 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
+ 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
+ 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
+ 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
+ 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
+ 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
+ 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
+ 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
+ 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
+ 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
+ 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
+ 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
+ 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
+ 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
+ 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
+ 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
+ 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
+ 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
+ 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
+ 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
+ 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
+ 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
+ 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
+ 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
+ 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
+ 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
+ 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
+ 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
+ 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
+ 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
+ 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
+ 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
+ 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
+ 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
+ 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
+ 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
+ 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
+ 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
+ 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
+ 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
+};
+static const u32 Te2[256] = {
+ 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
+ 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
+ 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
+ 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
+ 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
+ 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
+ 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
+ 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
+ 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
+ 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
+ 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
+ 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
+ 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
+ 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
+ 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
+ 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
+ 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
+ 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
+ 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
+ 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
+ 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
+ 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
+ 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
+ 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
+ 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
+ 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
+ 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
+ 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
+ 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
+ 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
+ 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
+ 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
+ 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
+ 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
+ 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
+ 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
+ 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
+ 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
+ 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
+ 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
+ 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
+ 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
+ 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
+ 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
+ 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
+ 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
+ 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
+ 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
+ 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
+ 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
+ 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
+ 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
+ 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
+ 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
+ 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
+ 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
+ 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
+ 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
+ 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
+ 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
+ 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
+ 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
+ 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
+ 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
+};
+static const u32 Te3[256] = {
+ 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
+ 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
+ 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
+ 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
+ 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
+ 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
+ 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
+ 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
+ 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
+ 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
+ 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
+ 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
+ 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
+ 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
+ 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
+ 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
+ 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
+ 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
+ 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
+ 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
+ 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
+ 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
+ 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
+ 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
+ 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
+ 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
+ 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
+ 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
+ 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
+ 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
+ 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
+ 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
+ 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
+ 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
+ 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
+ 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
+ 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
+ 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
+ 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
+ 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
+ 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
+ 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
+ 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
+ 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
+ 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
+ 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
+ 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
+ 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
+ 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
+ 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
+ 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
+ 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
+ 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
+ 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
+ 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
+ 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
+ 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
+ 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
+ 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
+ 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
+ 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
+ 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
+ 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
+ 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
+};
+
+static const u32 Td0[256] = {
+ 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
+ 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
+ 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
+ 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
+ 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
+ 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
+ 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
+ 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
+ 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
+ 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
+ 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
+ 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
+ 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
+ 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
+ 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
+ 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
+ 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
+ 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
+ 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
+ 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
+ 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
+ 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
+ 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
+ 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
+ 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
+ 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
+ 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
+ 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
+ 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
+ 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
+ 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
+ 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
+ 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
+ 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
+ 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
+ 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
+ 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
+ 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
+ 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
+ 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
+ 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
+ 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
+ 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
+ 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
+ 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
+ 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
+ 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
+ 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
+ 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
+ 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
+ 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
+ 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
+ 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
+ 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
+ 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
+ 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
+ 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
+ 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
+ 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
+ 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
+ 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
+ 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
+ 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
+ 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
+};
+static const u32 Td1[256] = {
+ 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
+ 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
+ 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
+ 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
+ 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
+ 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
+ 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
+ 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
+ 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
+ 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
+ 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
+ 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
+ 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
+ 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
+ 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
+ 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
+ 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
+ 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
+ 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
+ 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
+ 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
+ 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
+ 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
+ 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
+ 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
+ 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
+ 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
+ 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
+ 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
+ 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
+ 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
+ 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
+ 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
+ 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
+ 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
+ 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
+ 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
+ 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
+ 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
+ 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
+ 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
+ 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
+ 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
+ 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
+ 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
+ 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
+ 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
+ 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
+ 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
+ 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
+ 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
+ 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
+ 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
+ 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
+ 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
+ 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
+ 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
+ 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
+ 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
+ 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
+ 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
+ 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
+ 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
+ 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
+};
+static const u32 Td2[256] = {
+ 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
+ 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
+ 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
+ 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
+ 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
+ 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
+ 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
+ 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
+ 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
+ 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
+ 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
+ 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
+ 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
+ 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
+ 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
+ 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
+ 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
+ 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
+ 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
+ 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
+ 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
+ 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
+ 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
+ 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
+ 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
+ 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
+ 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
+ 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
+ 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
+ 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
+ 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
+ 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
+ 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
+ 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
+ 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
+ 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
+ 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
+ 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
+ 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
+ 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
+ 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
+ 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
+ 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
+ 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
+ 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
+ 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
+ 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
+ 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
+ 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
+ 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
+ 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
+ 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
+ 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
+ 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
+ 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
+ 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
+ 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
+ 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
+ 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
+ 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
+ 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
+ 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
+ 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
+ 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
+};
+static const u32 Td3[256] = {
+ 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
+ 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
+ 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
+ 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
+ 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
+ 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
+ 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
+ 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
+ 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
+ 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
+ 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
+ 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
+ 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
+ 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
+ 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
+ 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
+ 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
+ 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
+ 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
+ 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
+ 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
+ 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
+ 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
+ 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
+ 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
+ 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
+ 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
+ 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
+ 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
+ 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
+ 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
+ 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
+ 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
+ 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
+ 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
+ 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
+ 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
+ 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
+ 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
+ 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
+ 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
+ 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
+ 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
+ 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
+ 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
+ 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
+ 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
+ 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
+ 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
+ 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
+ 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
+ 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
+ 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
+ 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
+ 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
+ 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
+ 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
+ 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
+ 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
+ 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
+ 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
+ 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
+ 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
+ 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
+};
+static const u8 Td4[256] = {
+ 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
+ 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
+ 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
+ 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
+ 0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
+ 0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
+ 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
+ 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
+ 0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
+ 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
+ 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
+ 0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
+ 0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
+ 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
+ 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
+ 0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
+ 0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
+ 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
+ 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
+ 0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
+ 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
+ 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
+ 0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
+ 0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
+ 0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
+ 0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
+ 0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
+ 0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
+ 0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
+ 0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
+ 0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
+ 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU,
+};
+static const u32 rcon[] = {
+ 0x01000000, 0x02000000, 0x04000000, 0x08000000,
+ 0x10000000, 0x20000000, 0x40000000, 0x80000000,
+ 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+{
+
+ u32 *rk;
+ int i = 0;
+ u32 temp;
+
+ if (!userKey || !key)
+ return -1;
+ if (bits != 128 && bits != 192 && bits != 256)
+ return -2;
+
+ rk = key->rd_key;
+
+ if (bits == 128)
+ key->rounds = 10;
+ else if (bits == 192)
+ key->rounds = 12;
+ else
+ key->rounds = 14;
+
+ rk[0] = GETU32(userKey );
+ rk[1] = GETU32(userKey + 4);
+ rk[2] = GETU32(userKey + 8);
+ rk[3] = GETU32(userKey + 12);
+ if (bits == 128) {
+ while (1) {
+ temp = rk[3];
+ rk[4] = rk[0] ^
+ (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te0[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te1[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[5] = rk[1] ^ rk[4];
+ rk[6] = rk[2] ^ rk[5];
+ rk[7] = rk[3] ^ rk[6];
+ if (++i == 10) {
+ return 0;
+ }
+ rk += 4;
+ }
+ }
+ rk[4] = GETU32(userKey + 16);
+ rk[5] = GETU32(userKey + 20);
+ if (bits == 192) {
+ while (1) {
+ temp = rk[ 5];
+ rk[ 6] = rk[ 0] ^
+ (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te0[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te1[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[ 7] = rk[ 1] ^ rk[ 6];
+ rk[ 8] = rk[ 2] ^ rk[ 7];
+ rk[ 9] = rk[ 3] ^ rk[ 8];
+ if (++i == 8) {
+ return 0;
+ }
+ rk[10] = rk[ 4] ^ rk[ 9];
+ rk[11] = rk[ 5] ^ rk[10];
+ rk += 6;
+ }
+ }
+ rk[6] = GETU32(userKey + 24);
+ rk[7] = GETU32(userKey + 28);
+ if (bits == 256) {
+ while (1) {
+ temp = rk[ 7];
+ rk[ 8] = rk[ 0] ^
+ (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te3[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te0[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te1[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[ 9] = rk[ 1] ^ rk[ 8];
+ rk[10] = rk[ 2] ^ rk[ 9];
+ rk[11] = rk[ 3] ^ rk[10];
+ if (++i == 7) {
+ return 0;
+ }
+ temp = rk[11];
+ rk[12] = rk[ 4] ^
+ (Te2[(temp >> 24) ] & 0xff000000) ^
+ (Te3[(temp >> 16) & 0xff] & 0x00ff0000) ^
+ (Te0[(temp >> 8) & 0xff] & 0x0000ff00) ^
+ (Te1[(temp ) & 0xff] & 0x000000ff);
+ rk[13] = rk[ 5] ^ rk[12];
+ rk[14] = rk[ 6] ^ rk[13];
+ rk[15] = rk[ 7] ^ rk[14];
+
+ rk += 8;
+ }
+ }
+ return 0;
+}
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+{
+
+ u32 *rk;
+ int i, j, status;
+ u32 temp;
+
+ /* first, start with an encryption schedule */
+ status = AES_set_encrypt_key(userKey, bits, key);
+ if (status < 0)
+ return status;
+
+ rk = key->rd_key;
+
+ /* invert the order of the round keys: */
+ for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
+ temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
+ temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+ temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+ temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+ }
+ /* apply the inverse MixColumn transform to all round keys but the first and the last: */
+ for (i = 1; i < (key->rounds); i++) {
+ rk += 4;
+ rk[0] =
+ Td0[Te1[(rk[0] >> 24) ] & 0xff] ^
+ Td1[Te1[(rk[0] >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(rk[0] >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(rk[0] ) & 0xff] & 0xff];
+ rk[1] =
+ Td0[Te1[(rk[1] >> 24) ] & 0xff] ^
+ Td1[Te1[(rk[1] >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(rk[1] >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(rk[1] ) & 0xff] & 0xff];
+ rk[2] =
+ Td0[Te1[(rk[2] >> 24) ] & 0xff] ^
+ Td1[Te1[(rk[2] >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(rk[2] >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(rk[2] ) & 0xff] & 0xff];
+ rk[3] =
+ Td0[Te1[(rk[3] >> 24) ] & 0xff] ^
+ Td1[Te1[(rk[3] >> 16) & 0xff] & 0xff] ^
+ Td2[Te1[(rk[3] >> 8) & 0xff] & 0xff] ^
+ Td3[Te1[(rk[3] ) & 0xff] & 0xff];
+ }
+ return 0;
+}
+
+/*
+ * Encrypt a single block
+ * in and out can overlap
+ */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key) {
+
+ const u32 *rk;
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+ int r;
+#endif /* ?FULL_UNROLL */
+
+ assert(in && out && key);
+ rk = key->rd_key;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(in ) ^ rk[0];
+ s1 = GETU32(in + 4) ^ rk[1];
+ s2 = GETU32(in + 8) ^ rk[2];
+ s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+ /* round 1: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
+ /* round 2: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
+ /* round 3: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
+ /* round 4: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
+ /* round 5: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
+ /* round 6: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
+ /* round 7: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
+ /* round 8: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
+ /* round 9: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
+ if (key->rounds > 10) {
+ /* round 10: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
+ /* round 11: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
+ if (key->rounds > 12) {
+ /* round 12: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
+ /* round 13: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
+ }
+ }
+ rk += key->rounds << 2;
+#else /* !FULL_UNROLL */
+ /*
+ * Nr - 1 full rounds:
+ */
+ r = key->rounds >> 1;
+ for (;;) {
+ t0 =
+ Te0[(s0 >> 24) ] ^
+ Te1[(s1 >> 16) & 0xff] ^
+ Te2[(s2 >> 8) & 0xff] ^
+ Te3[(s3 ) & 0xff] ^
+ rk[4];
+ t1 =
+ Te0[(s1 >> 24) ] ^
+ Te1[(s2 >> 16) & 0xff] ^
+ Te2[(s3 >> 8) & 0xff] ^
+ Te3[(s0 ) & 0xff] ^
+ rk[5];
+ t2 =
+ Te0[(s2 >> 24) ] ^
+ Te1[(s3 >> 16) & 0xff] ^
+ Te2[(s0 >> 8) & 0xff] ^
+ Te3[(s1 ) & 0xff] ^
+ rk[6];
+ t3 =
+ Te0[(s3 >> 24) ] ^
+ Te1[(s0 >> 16) & 0xff] ^
+ Te2[(s1 >> 8) & 0xff] ^
+ Te3[(s2 ) & 0xff] ^
+ rk[7];
+
+ rk += 8;
+ if (--r == 0) {
+ break;
+ }
+
+ s0 =
+ Te0[(t0 >> 24) ] ^
+ Te1[(t1 >> 16) & 0xff] ^
+ Te2[(t2 >> 8) & 0xff] ^
+ Te3[(t3 ) & 0xff] ^
+ rk[0];
+ s1 =
+ Te0[(t1 >> 24) ] ^
+ Te1[(t2 >> 16) & 0xff] ^
+ Te2[(t3 >> 8) & 0xff] ^
+ Te3[(t0 ) & 0xff] ^
+ rk[1];
+ s2 =
+ Te0[(t2 >> 24) ] ^
+ Te1[(t3 >> 16) & 0xff] ^
+ Te2[(t0 >> 8) & 0xff] ^
+ Te3[(t1 ) & 0xff] ^
+ rk[2];
+ s3 =
+ Te0[(t3 >> 24) ] ^
+ Te1[(t0 >> 16) & 0xff] ^
+ Te2[(t1 >> 8) & 0xff] ^
+ Te3[(t2 ) & 0xff] ^
+ rk[3];
+ }
+#endif /* ?FULL_UNROLL */
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ s0 =
+ (Te2[(t0 >> 24) ] & 0xff000000) ^
+ (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te0[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te1[(t3 ) & 0xff] & 0x000000ff) ^
+ rk[0];
+ PUTU32(out , s0);
+ s1 =
+ (Te2[(t1 >> 24) ] & 0xff000000) ^
+ (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te0[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te1[(t0 ) & 0xff] & 0x000000ff) ^
+ rk[1];
+ PUTU32(out + 4, s1);
+ s2 =
+ (Te2[(t2 >> 24) ] & 0xff000000) ^
+ (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te0[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te1[(t1 ) & 0xff] & 0x000000ff) ^
+ rk[2];
+ PUTU32(out + 8, s2);
+ s3 =
+ (Te2[(t3 >> 24) ] & 0xff000000) ^
+ (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te0[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te1[(t2 ) & 0xff] & 0x000000ff) ^
+ rk[3];
+ PUTU32(out + 12, s3);
+}
+
+/*
+ * Decrypt a single block
+ * in and out can overlap
+ */
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key)
+{
+
+ const u32 *rk;
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+ int r;
+#endif /* ?FULL_UNROLL */
+
+ assert(in && out && key);
+ rk = key->rd_key;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(in ) ^ rk[0];
+ s1 = GETU32(in + 4) ^ rk[1];
+ s2 = GETU32(in + 8) ^ rk[2];
+ s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+ /* round 1: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
+ /* round 2: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
+ /* round 3: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
+ /* round 4: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
+ /* round 5: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
+ /* round 6: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
+ /* round 7: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
+ /* round 8: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
+ /* round 9: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
+ if (key->rounds > 10) {
+ /* round 10: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
+ /* round 11: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
+ if (key->rounds > 12) {
+ /* round 12: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
+ /* round 13: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
+ }
+ }
+ rk += key->rounds << 2;
+#else /* !FULL_UNROLL */
+ /*
+ * Nr - 1 full rounds:
+ */
+ r = key->rounds >> 1;
+ for (;;) {
+ t0 =
+ Td0[(s0 >> 24) ] ^
+ Td1[(s3 >> 16) & 0xff] ^
+ Td2[(s2 >> 8) & 0xff] ^
+ Td3[(s1 ) & 0xff] ^
+ rk[4];
+ t1 =
+ Td0[(s1 >> 24) ] ^
+ Td1[(s0 >> 16) & 0xff] ^
+ Td2[(s3 >> 8) & 0xff] ^
+ Td3[(s2 ) & 0xff] ^
+ rk[5];
+ t2 =
+ Td0[(s2 >> 24) ] ^
+ Td1[(s1 >> 16) & 0xff] ^
+ Td2[(s0 >> 8) & 0xff] ^
+ Td3[(s3 ) & 0xff] ^
+ rk[6];
+ t3 =
+ Td0[(s3 >> 24) ] ^
+ Td1[(s2 >> 16) & 0xff] ^
+ Td2[(s1 >> 8) & 0xff] ^
+ Td3[(s0 ) & 0xff] ^
+ rk[7];
+
+ rk += 8;
+ if (--r == 0) {
+ break;
+ }
+
+ s0 =
+ Td0[(t0 >> 24) ] ^
+ Td1[(t3 >> 16) & 0xff] ^
+ Td2[(t2 >> 8) & 0xff] ^
+ Td3[(t1 ) & 0xff] ^
+ rk[0];
+ s1 =
+ Td0[(t1 >> 24) ] ^
+ Td1[(t0 >> 16) & 0xff] ^
+ Td2[(t3 >> 8) & 0xff] ^
+ Td3[(t2 ) & 0xff] ^
+ rk[1];
+ s2 =
+ Td0[(t2 >> 24) ] ^
+ Td1[(t1 >> 16) & 0xff] ^
+ Td2[(t0 >> 8) & 0xff] ^
+ Td3[(t3 ) & 0xff] ^
+ rk[2];
+ s3 =
+ Td0[(t3 >> 24) ] ^
+ Td1[(t2 >> 16) & 0xff] ^
+ Td2[(t1 >> 8) & 0xff] ^
+ Td3[(t0 ) & 0xff] ^
+ rk[3];
+ }
+#endif /* ?FULL_UNROLL */
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ s0 =
+ ((u32)Td4[(t0 >> 24) ] << 24) ^
+ ((u32)Td4[(t3 >> 16) & 0xff] << 16) ^
+ ((u32)Td4[(t2 >> 8) & 0xff] << 8) ^
+ ((u32)Td4[(t1 ) & 0xff]) ^
+ rk[0];
+ PUTU32(out , s0);
+ s1 =
+ ((u32)Td4[(t1 >> 24) ] << 24) ^
+ ((u32)Td4[(t0 >> 16) & 0xff] << 16) ^
+ ((u32)Td4[(t3 >> 8) & 0xff] << 8) ^
+ ((u32)Td4[(t2 ) & 0xff]) ^
+ rk[1];
+ PUTU32(out + 4, s1);
+ s2 =
+ ((u32)Td4[(t2 >> 24) ] << 24) ^
+ ((u32)Td4[(t1 >> 16) & 0xff] << 16) ^
+ ((u32)Td4[(t0 >> 8) & 0xff] << 8) ^
+ ((u32)Td4[(t3 ) & 0xff]) ^
+ rk[2];
+ PUTU32(out + 8, s2);
+ s3 =
+ ((u32)Td4[(t3 >> 24) ] << 24) ^
+ ((u32)Td4[(t2 >> 16) & 0xff] << 16) ^
+ ((u32)Td4[(t1 >> 8) & 0xff] << 8) ^
+ ((u32)Td4[(t0 ) & 0xff]) ^
+ rk[3];
+ PUTU32(out + 12, s3);
+}
+
+#else /* AES_ASM */
+
+static const u8 Te4[256] = {
+ 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
+ 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
+ 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
+ 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
+ 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
+ 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
+ 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
+ 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
+ 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
+ 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
+ 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
+ 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
+ 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
+ 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
+ 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
+ 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
+ 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
+ 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
+ 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
+ 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
+ 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
+ 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
+ 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
+ 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
+ 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
+ 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
+ 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
+ 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
+ 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
+ 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
+ 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
+ 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
+};
+static const u32 rcon[] = {
+ 0x01000000, 0x02000000, 0x04000000, 0x08000000,
+ 0x10000000, 0x20000000, 0x40000000, 0x80000000,
+ 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+{
+ u32 *rk;
+ int i = 0;
+ u32 temp;
+
+ if (!userKey || !key)
+ return -1;
+ if (bits != 128 && bits != 192 && bits != 256)
+ return -2;
+
+ rk = key->rd_key;
+
+ if (bits == 128)
+ key->rounds = 10;
+ else if (bits == 192)
+ key->rounds = 12;
+ else
+ key->rounds = 14;
+
+ rk[0] = GETU32(userKey );
+ rk[1] = GETU32(userKey + 4);
+ rk[2] = GETU32(userKey + 8);
+ rk[3] = GETU32(userKey + 12);
+ if (bits == 128) {
+ while (1) {
+ temp = rk[3];
+ rk[4] = rk[0] ^
+ ((u32)Te4[(temp >> 16) & 0xff] << 24) ^
+ ((u32)Te4[(temp >> 8) & 0xff] << 16) ^
+ ((u32)Te4[(temp ) & 0xff] << 8) ^
+ ((u32)Te4[(temp >> 24) ]) ^
+ rcon[i];
+ rk[5] = rk[1] ^ rk[4];
+ rk[6] = rk[2] ^ rk[5];
+ rk[7] = rk[3] ^ rk[6];
+ if (++i == 10) {
+ return 0;
+ }
+ rk += 4;
+ }
+ }
+ rk[4] = GETU32(userKey + 16);
+ rk[5] = GETU32(userKey + 20);
+ if (bits == 192) {
+ while (1) {
+ temp = rk[ 5];
+ rk[ 6] = rk[ 0] ^
+ ((u32)Te4[(temp >> 16) & 0xff] << 24) ^
+ ((u32)Te4[(temp >> 8) & 0xff] << 16) ^
+ ((u32)Te4[(temp ) & 0xff] << 8) ^
+ ((u32)Te4[(temp >> 24) ]) ^
+ rcon[i];
+ rk[ 7] = rk[ 1] ^ rk[ 6];
+ rk[ 8] = rk[ 2] ^ rk[ 7];
+ rk[ 9] = rk[ 3] ^ rk[ 8];
+ if (++i == 8) {
+ return 0;
+ }
+ rk[10] = rk[ 4] ^ rk[ 9];
+ rk[11] = rk[ 5] ^ rk[10];
+ rk += 6;
+ }
+ }
+ rk[6] = GETU32(userKey + 24);
+ rk[7] = GETU32(userKey + 28);
+ if (bits == 256) {
+ while (1) {
+ temp = rk[ 7];
+ rk[ 8] = rk[ 0] ^
+ ((u32)Te4[(temp >> 16) & 0xff] << 24) ^
+ ((u32)Te4[(temp >> 8) & 0xff] << 16) ^
+ ((u32)Te4[(temp ) & 0xff] << 8) ^
+ ((u32)Te4[(temp >> 24) ]) ^
+ rcon[i];
+ rk[ 9] = rk[ 1] ^ rk[ 8];
+ rk[10] = rk[ 2] ^ rk[ 9];
+ rk[11] = rk[ 3] ^ rk[10];
+ if (++i == 7) {
+ return 0;
+ }
+ temp = rk[11];
+ rk[12] = rk[ 4] ^
+ ((u32)Te4[(temp >> 24) ] << 24) ^
+ ((u32)Te4[(temp >> 16) & 0xff] << 16) ^
+ ((u32)Te4[(temp >> 8) & 0xff] << 8) ^
+ ((u32)Te4[(temp ) & 0xff]);
+ rk[13] = rk[ 5] ^ rk[12];
+ rk[14] = rk[ 6] ^ rk[13];
+ rk[15] = rk[ 7] ^ rk[14];
+
+ rk += 8;
+ }
+ }
+ return 0;
+}
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+{
+
+ u32 *rk;
+ int i, j, status;
+ u32 temp;
+
+ /* first, start with an encryption schedule */
+ status = AES_set_encrypt_key(userKey, bits, key);
+ if (status < 0)
+ return status;
+
+ rk = key->rd_key;
+
+ /* invert the order of the round keys: */
+ for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
+ temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
+ temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+ temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+ temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+ }
+ /* apply the inverse MixColumn transform to all round keys but the first and the last: */
+ for (i = 1; i < (key->rounds); i++) {
+ rk += 4;
+ for (j = 0; j < 4; j++) {
+ u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
+
+ tp1 = rk[j];
+ m = tp1 & 0x80808080;
+ tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ m = tp2 & 0x80808080;
+ tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ m = tp4 & 0x80808080;
+ tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ tp9 = tp8 ^ tp1;
+ tpb = tp9 ^ tp2;
+ tpd = tp9 ^ tp4;
+ tpe = tp8 ^ tp4 ^ tp2;
+#if defined(ROTATE)
+ rk[j] = tpe ^ ROTATE(tpd,16) ^
+ ROTATE(tp9,24) ^ ROTATE(tpb,8);
+#else
+ rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+ (tp9 >> 8) ^ (tp9 << 24) ^
+ (tpb >> 24) ^ (tpb << 8);
+#endif
+ }
+ }
+ return 0;
+}
+
+#endif /* AES_ASM */
diff --git a/openssl-1.1.0h/crypto/aes/aes_ecb.c b/openssl-1.1.0h/crypto/aes/aes_ecb.c
new file mode 100644
index 0000000..29bfc1a
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/aes_ecb.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <assert.h>
+
+#include <openssl/aes.h>
+#include "aes_locl.h"
+
+void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key, const int enc)
+{
+
+ assert(in && out && key);
+ assert((AES_ENCRYPT == enc) || (AES_DECRYPT == enc));
+
+ if (AES_ENCRYPT == enc)
+ AES_encrypt(in, out, key);
+ else
+ AES_decrypt(in, out, key);
+}
diff --git a/openssl-1.1.0h/crypto/aes/aes_ige.c b/openssl-1.1.0h/crypto/aes/aes_ige.c
new file mode 100644
index 0000000..75f796c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/aes_ige.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include "internal/cryptlib.h"
+
+#include <openssl/aes.h>
+#include "aes_locl.h"
+
+#define N_WORDS (AES_BLOCK_SIZE / sizeof(unsigned long))
+typedef struct {
+ unsigned long data[N_WORDS];
+} aes_block_t;
+
+/* XXX: probably some better way to do this */
+#if defined(__i386__) || defined(__x86_64__)
+# define UNALIGNED_MEMOPS_ARE_FAST 1
+#else
+# define UNALIGNED_MEMOPS_ARE_FAST 0
+#endif
+
+#if UNALIGNED_MEMOPS_ARE_FAST
+# define load_block(d, s) (d) = *(const aes_block_t *)(s)
+# define store_block(d, s) *(aes_block_t *)(d) = (s)
+#else
+# define load_block(d, s) memcpy((d).data, (s), AES_BLOCK_SIZE)
+# define store_block(d, s) memcpy((d), (s).data, AES_BLOCK_SIZE)
+#endif
+
+/* N.B. The IV for this mode is _twice_ the block size */
+
+void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char *ivec, const int enc)
+{
+ size_t n;
+ size_t len = length;
+
+ if (length == 0)
+ return;
+
+ OPENSSL_assert(in && out && key && ivec);
+ OPENSSL_assert((AES_ENCRYPT == enc) || (AES_DECRYPT == enc));
+ OPENSSL_assert((length % AES_BLOCK_SIZE) == 0);
+
+ len = length / AES_BLOCK_SIZE;
+
+ if (AES_ENCRYPT == enc) {
+ if (in != out &&
+ (UNALIGNED_MEMOPS_ARE_FAST
+ || ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(long) ==
+ 0)) {
+ aes_block_t *ivp = (aes_block_t *) ivec;
+ aes_block_t *iv2p = (aes_block_t *) (ivec + AES_BLOCK_SIZE);
+
+ while (len) {
+ aes_block_t *inp = (aes_block_t *) in;
+ aes_block_t *outp = (aes_block_t *) out;
+
+ for (n = 0; n < N_WORDS; ++n)
+ outp->data[n] = inp->data[n] ^ ivp->data[n];
+ AES_encrypt((unsigned char *)outp->data,
+ (unsigned char *)outp->data, key);
+ for (n = 0; n < N_WORDS; ++n)
+ outp->data[n] ^= iv2p->data[n];
+ ivp = outp;
+ iv2p = inp;
+ --len;
+ in += AES_BLOCK_SIZE;
+ out += AES_BLOCK_SIZE;
+ }
+ memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
+ memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
+ } else {
+ aes_block_t tmp, tmp2;
+ aes_block_t iv;
+ aes_block_t iv2;
+
+ load_block(iv, ivec);
+ load_block(iv2, ivec + AES_BLOCK_SIZE);
+
+ while (len) {
+ load_block(tmp, in);
+ for (n = 0; n < N_WORDS; ++n)
+ tmp2.data[n] = tmp.data[n] ^ iv.data[n];
+ AES_encrypt((unsigned char *)tmp2.data,
+ (unsigned char *)tmp2.data, key);
+ for (n = 0; n < N_WORDS; ++n)
+ tmp2.data[n] ^= iv2.data[n];
+ store_block(out, tmp2);
+ iv = tmp2;
+ iv2 = tmp;
+ --len;
+ in += AES_BLOCK_SIZE;
+ out += AES_BLOCK_SIZE;
+ }
+ memcpy(ivec, iv.data, AES_BLOCK_SIZE);
+ memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
+ }
+ } else {
+ if (in != out &&
+ (UNALIGNED_MEMOPS_ARE_FAST
+ || ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(long) ==
+ 0)) {
+ aes_block_t *ivp = (aes_block_t *) ivec;
+ aes_block_t *iv2p = (aes_block_t *) (ivec + AES_BLOCK_SIZE);
+
+ while (len) {
+ aes_block_t tmp;
+ aes_block_t *inp = (aes_block_t *) in;
+ aes_block_t *outp = (aes_block_t *) out;
+
+ for (n = 0; n < N_WORDS; ++n)
+ tmp.data[n] = inp->data[n] ^ iv2p->data[n];
+ AES_decrypt((unsigned char *)tmp.data,
+ (unsigned char *)outp->data, key);
+ for (n = 0; n < N_WORDS; ++n)
+ outp->data[n] ^= ivp->data[n];
+ ivp = inp;
+ iv2p = outp;
+ --len;
+ in += AES_BLOCK_SIZE;
+ out += AES_BLOCK_SIZE;
+ }
+ memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
+ memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
+ } else {
+ aes_block_t tmp, tmp2;
+ aes_block_t iv;
+ aes_block_t iv2;
+
+ load_block(iv, ivec);
+ load_block(iv2, ivec + AES_BLOCK_SIZE);
+
+ while (len) {
+ load_block(tmp, in);
+ tmp2 = tmp;
+ for (n = 0; n < N_WORDS; ++n)
+ tmp.data[n] ^= iv2.data[n];
+ AES_decrypt((unsigned char *)tmp.data,
+ (unsigned char *)tmp.data, key);
+ for (n = 0; n < N_WORDS; ++n)
+ tmp.data[n] ^= iv.data[n];
+ store_block(out, tmp);
+ iv = tmp2;
+ iv2 = tmp;
+ --len;
+ in += AES_BLOCK_SIZE;
+ out += AES_BLOCK_SIZE;
+ }
+ memcpy(ivec, iv.data, AES_BLOCK_SIZE);
+ memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
+ }
+ }
+}
+
+/*
+ * Note that its effectively impossible to do biIGE in anything other
+ * than a single pass, so no provision is made for chaining.
+ */
+
+/* N.B. The IV for this mode is _four times_ the block size */
+
+void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ const AES_KEY *key2, const unsigned char *ivec,
+ const int enc)
+{
+ size_t n;
+ size_t len = length;
+ unsigned char tmp[AES_BLOCK_SIZE];
+ unsigned char tmp2[AES_BLOCK_SIZE];
+ unsigned char tmp3[AES_BLOCK_SIZE];
+ unsigned char prev[AES_BLOCK_SIZE];
+ const unsigned char *iv;
+ const unsigned char *iv2;
+
+ OPENSSL_assert(in && out && key && ivec);
+ OPENSSL_assert((AES_ENCRYPT == enc) || (AES_DECRYPT == enc));
+ OPENSSL_assert((length % AES_BLOCK_SIZE) == 0);
+
+ if (AES_ENCRYPT == enc) {
+ /*
+ * XXX: Do a separate case for when in != out (strictly should check
+ * for overlap, too)
+ */
+
+ /* First the forward pass */
+ iv = ivec;
+ iv2 = ivec + AES_BLOCK_SIZE;
+ while (len >= AES_BLOCK_SIZE) {
+ for (n = 0; n < AES_BLOCK_SIZE; ++n)
+ out[n] = in[n] ^ iv[n];
+ AES_encrypt(out, out, key);
+ for (n = 0; n < AES_BLOCK_SIZE; ++n)
+ out[n] ^= iv2[n];
+ iv = out;
+ memcpy(prev, in, AES_BLOCK_SIZE);
+ iv2 = prev;
+ len -= AES_BLOCK_SIZE;
+ in += AES_BLOCK_SIZE;
+ out += AES_BLOCK_SIZE;
+ }
+
+ /* And now backwards */
+ iv = ivec + AES_BLOCK_SIZE * 2;
+ iv2 = ivec + AES_BLOCK_SIZE * 3;
+ len = length;
+ while (len >= AES_BLOCK_SIZE) {
+ out -= AES_BLOCK_SIZE;
+ /*
+ * XXX: reduce copies by alternating between buffers
+ */
+ memcpy(tmp, out, AES_BLOCK_SIZE);
+ for (n = 0; n < AES_BLOCK_SIZE; ++n)
+ out[n] ^= iv[n];
+ /*
+ * hexdump(stdout, "out ^ iv", out, AES_BLOCK_SIZE);
+ */
+ AES_encrypt(out, out, key);
+ /*
+ * hexdump(stdout,"enc", out, AES_BLOCK_SIZE);
+ */
+ /*
+ * hexdump(stdout,"iv2", iv2, AES_BLOCK_SIZE);
+ */
+ for (n = 0; n < AES_BLOCK_SIZE; ++n)
+ out[n] ^= iv2[n];
+ /*
+ * hexdump(stdout,"out", out, AES_BLOCK_SIZE);
+ */
+ iv = out;
+ memcpy(prev, tmp, AES_BLOCK_SIZE);
+ iv2 = prev;
+ len -= AES_BLOCK_SIZE;
+ }
+ } else {
+ /* First backwards */
+ iv = ivec + AES_BLOCK_SIZE * 2;
+ iv2 = ivec + AES_BLOCK_SIZE * 3;
+ in += length;
+ out += length;
+ while (len >= AES_BLOCK_SIZE) {
+ in -= AES_BLOCK_SIZE;
+ out -= AES_BLOCK_SIZE;
+ memcpy(tmp, in, AES_BLOCK_SIZE);
+ memcpy(tmp2, in, AES_BLOCK_SIZE);
+ for (n = 0; n < AES_BLOCK_SIZE; ++n)
+ tmp[n] ^= iv2[n];
+ AES_decrypt(tmp, out, key);
+ for (n = 0; n < AES_BLOCK_SIZE; ++n)
+ out[n] ^= iv[n];
+ memcpy(tmp3, tmp2, AES_BLOCK_SIZE);
+ iv = tmp3;
+ iv2 = out;
+ len -= AES_BLOCK_SIZE;
+ }
+
+ /* And now forwards */
+ iv = ivec;
+ iv2 = ivec + AES_BLOCK_SIZE;
+ len = length;
+ while (len >= AES_BLOCK_SIZE) {
+ memcpy(tmp, out, AES_BLOCK_SIZE);
+ memcpy(tmp2, out, AES_BLOCK_SIZE);
+ for (n = 0; n < AES_BLOCK_SIZE; ++n)
+ tmp[n] ^= iv2[n];
+ AES_decrypt(tmp, out, key);
+ for (n = 0; n < AES_BLOCK_SIZE; ++n)
+ out[n] ^= iv[n];
+ memcpy(tmp3, tmp2, AES_BLOCK_SIZE);
+ iv = tmp3;
+ iv2 = out;
+ len -= AES_BLOCK_SIZE;
+ in += AES_BLOCK_SIZE;
+ out += AES_BLOCK_SIZE;
+ }
+ }
+}
diff --git a/openssl-1.1.0h/crypto/aes/aes_locl.h b/openssl-1.1.0h/crypto/aes/aes_locl.h
new file mode 100644
index 0000000..adee29d
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/aes_locl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#ifndef HEADER_AES_LOCL_H
+# define HEADER_AES_LOCL_H
+
+# include <openssl/e_os2.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+# if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
+# define SWAP(x) (_lrotl(x, 8) & 0x00ff00ff | _lrotr(x, 8) & 0xff00ff00)
+# define GETU32(p) SWAP(*((u32 *)(p)))
+# define PUTU32(ct, st) { *((u32 *)(ct)) = SWAP((st)); }
+# else
+# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3]))
+# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }
+# endif
+
+# ifdef AES_LONG
+typedef unsigned long u32;
+# else
+typedef unsigned int u32;
+# endif
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+# define MAXKC (256/32)
+# define MAXKB (256/8)
+# define MAXNR 14
+
+/* This controls loop-unrolling in aes_core.c */
+# undef FULL_UNROLL
+
+#endif /* !HEADER_AES_LOCL_H */
diff --git a/openssl-1.1.0h/crypto/aes/aes_misc.c b/openssl-1.1.0h/crypto/aes/aes_misc.c
new file mode 100644
index 0000000..7403c84
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/aes_misc.c
@@ -0,0 +1,21 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/opensslv.h>
+#include <openssl/aes.h>
+#include "aes_locl.h"
+
+const char *AES_options(void)
+{
+#ifdef FULL_UNROLL
+ return "aes(full)";
+#else
+ return "aes(partial)";
+#endif
+}
diff --git a/openssl-1.1.0h/crypto/aes/aes_ofb.c b/openssl-1.1.0h/crypto/aes/aes_ofb.c
new file mode 100644
index 0000000..215b538
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/aes_ofb.c
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/aes.h>
+#include <openssl/modes.h>
+
+void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char *ivec, int *num)
+{
+ CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num,
+ (block128_f) AES_encrypt);
+}
diff --git a/openssl-1.1.0h/crypto/aes/aes_wrap.c b/openssl-1.1.0h/crypto/aes/aes_wrap.c
new file mode 100644
index 0000000..cae0b21
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/aes_wrap.c
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include "internal/cryptlib.h"
+#include <openssl/aes.h>
+#include <openssl/modes.h>
+
+int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, unsigned int inlen)
+{
+ return CRYPTO_128_wrap(key, iv, out, in, inlen, (block128_f) AES_encrypt);
+}
+
+int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
+ unsigned char *out,
+ const unsigned char *in, unsigned int inlen)
+{
+ return CRYPTO_128_unwrap(key, iv, out, in, inlen,
+ (block128_f) AES_decrypt);
+}
diff --git a/openssl-1.1.0h/crypto/aes/aes_x86core.c b/openssl-1.1.0h/crypto/aes/aes_x86core.c
new file mode 100644
index 0000000..95b49bb
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/aes_x86core.c
@@ -0,0 +1,1075 @@
+/*
+ * Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/**
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is experimental x86[_64] derivative. It assumes little-endian
+ * byte order and expects CPU to sustain unaligned memory references.
+ * It is used as playground for cache-time attack mitigations and
+ * serves as reference C implementation for x86[_64] assembler.
+ *
+ * <appro@fy.chalmers.se>
+ */
+
+
+#include <assert.h>
+
+#include <stdlib.h>
+#include <openssl/aes.h>
+#include "aes_locl.h"
+
+/*
+ * These two parameters control which table, 256-byte or 2KB, is
+ * referenced in outer and respectively inner rounds.
+ */
+#define AES_COMPACT_IN_OUTER_ROUNDS
+#ifdef AES_COMPACT_IN_OUTER_ROUNDS
+/* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
+ * adding AES_COMPACT_IN_INNER_ROUNDS reduces benchmark *further*
+ * by factor of ~2. */
+# undef AES_COMPACT_IN_INNER_ROUNDS
+#endif
+
+#if 1
+static void prefetch256(const void *table)
+{
+ volatile unsigned long *t=(void *)table,ret;
+ unsigned long sum;
+ int i;
+
+ /* 32 is common least cache-line size */
+ for (sum=0,i=0;i<256/sizeof(t[0]);i+=32/sizeof(t[0])) sum ^= t[i];
+
+ ret = sum;
+}
+#else
+# define prefetch256(t)
+#endif
+
+#undef GETU32
+#define GETU32(p) (*((u32*)(p)))
+
+#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
+typedef unsigned __int64 u64;
+#define U64(C) C##UI64
+#elif defined(__arch64__)
+typedef unsigned long u64;
+#define U64(C) C##UL
+#else
+typedef unsigned long long u64;
+#define U64(C) C##ULL
+#endif
+
+#undef ROTATE
+#if defined(_MSC_VER)
+# define ROTATE(a,n) _lrotl(a,n)
+#elif defined(__ICC)
+# define ROTATE(a,n) _rotl(a,n)
+#elif defined(__GNUC__) && __GNUC__>=2
+# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
+# define ROTATE(a,n) ({ register unsigned int ret; \
+ asm ( \
+ "roll %1,%0" \
+ : "=r"(ret) \
+ : "I"(n), "0"(a) \
+ : "cc"); \
+ ret; \
+ })
+# endif
+#endif
+/*-
+Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+*/
+#define Te0 (u32)((u64*)((u8*)Te+0))
+#define Te1 (u32)((u64*)((u8*)Te+3))
+#define Te2 (u32)((u64*)((u8*)Te+2))
+#define Te3 (u32)((u64*)((u8*)Te+1))
+/*-
+Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01];
+*/
+#define Td0 (u32)((u64*)((u8*)Td+0))
+#define Td1 (u32)((u64*)((u8*)Td+3))
+#define Td2 (u32)((u64*)((u8*)Td+2))
+#define Td3 (u32)((u64*)((u8*)Td+1))
+
+static const u64 Te[256] = {
+ U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
+ U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
+ U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
+ U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
+ U64(0x5030306050303060), U64(0x0301010203010102),
+ U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
+ U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
+ U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
+ U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
+ U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
+ U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
+ U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
+ U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
+ U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
+ U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
+ U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
+ U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
+ U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
+ U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
+ U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
+ U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
+ U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
+ U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
+ U64(0x5331316253313162), U64(0x3f15152a3f15152a),
+ U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
+ U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
+ U64(0x2818183028181830), U64(0xa1969637a1969637),
+ U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
+ U64(0x0907070e0907070e), U64(0x3612122436121224),
+ U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
+ U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
+ U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
+ U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
+ U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
+ U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
+ U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
+ U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
+ U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
+ U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
+ U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
+ U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
+ U64(0x0000000000000000), U64(0x2cededc12cededc1),
+ U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
+ U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
+ U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
+ U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
+ U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
+ U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
+ U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
+ U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
+ U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
+ U64(0x5533336655333366), U64(0x9485851194858511),
+ U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
+ U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
+ U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
+ U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
+ U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
+ U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
+ U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
+ U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
+ U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
+ U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
+ U64(0x3010102030101020), U64(0x1affffe51affffe5),
+ U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
+ U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
+ U64(0x3513132635131326), U64(0x2fececc32fececc3),
+ U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
+ U64(0xcc444488cc444488), U64(0x3917172e3917172e),
+ U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
+ U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
+ U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
+ U64(0x2b1919322b191932), U64(0x957373e6957373e6),
+ U64(0xa06060c0a06060c0), U64(0x9881811998818119),
+ U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
+ U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
+ U64(0xab90903bab90903b), U64(0x8388880b8388880b),
+ U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
+ U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
+ U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
+ U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
+ U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
+ U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
+ U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
+ U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
+ U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
+ U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
+ U64(0xa8919139a8919139), U64(0xa4959531a4959531),
+ U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
+ U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
+ U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
+ U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
+ U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
+ U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
+ U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
+ U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
+ U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
+ U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
+ U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
+ U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
+ U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
+ U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
+ U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
+ U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
+ U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
+ U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
+ U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
+ U64(0xd8484890d8484890), U64(0x0503030605030306),
+ U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
+ U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
+ U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
+ U64(0x9186861791868617), U64(0x58c1c19958c1c199),
+ U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
+ U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
+ U64(0xb398982bb398982b), U64(0x3311112233111122),
+ U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
+ U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
+ U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
+ U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
+ U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
+ U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
+ U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
+ U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
+ U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
+ U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
+ U64(0xc3414182c3414182), U64(0xb0999929b0999929),
+ U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
+ U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
+ U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
+};
+
+static const u8 Te4[256] = {
+ 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
+ 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
+ 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
+ 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
+ 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
+ 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
+ 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
+ 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
+ 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
+ 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
+ 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
+ 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
+ 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
+ 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
+ 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
+ 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
+ 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
+ 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
+ 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
+ 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
+ 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
+ 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
+ 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
+ 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
+ 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
+ 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
+ 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
+ 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
+ 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
+ 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
+ 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
+ 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
+};
+
+static const u64 Td[256] = {
+ U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
+ U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
+ U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
+ U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
+ U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
+ U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
+ U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
+ U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
+ U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
+ U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
+ U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
+ U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
+ U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
+ U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
+ U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
+ U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
+ U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
+ U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
+ U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
+ U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
+ U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
+ U64(0x6033519760335197), U64(0x457f5362457f5362),
+ U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
+ U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
+ U64(0x5868487058684870), U64(0x19fd458f19fd458f),
+ U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
+ U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
+ U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
+ U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
+ U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
+ U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
+ U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
+ U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
+ U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
+ U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
+ U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
+ U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
+ U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
+ U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
+ U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
+ U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
+ U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
+ U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
+ U64(0x6fd406046fd40604), U64(0xff155060ff155060),
+ U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
+ U64(0xcc434089cc434089), U64(0x779ed967779ed967),
+ U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
+ U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
+ U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
+ U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
+ U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
+ U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
+ U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
+ U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
+ U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
+ U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
+ U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
+ U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
+ U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
+ U64(0x694b775a694b775a), U64(0x161a121c161a121c),
+ U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
+ U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
+ U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
+ U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
+ U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
+ U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
+ U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
+ U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
+ U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
+ U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
+ U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
+ U64(0x4022971340229713), U64(0x2011c6842011c684),
+ U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
+ U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
+ U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
+ U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
+ U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
+ U64(0xfa489411fa489411), U64(0x2264e9472264e947),
+ U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
+ U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
+ U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
+ U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
+ U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
+ U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
+ U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
+ U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
+ U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
+ U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
+ U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
+ U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
+ U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
+ U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
+ U64(0x097826cd097826cd), U64(0xf418596ef418596e),
+ U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
+ U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
+ U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
+ U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
+ U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
+ U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
+ U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
+ U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
+ U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
+ U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
+ U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
+ U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
+ U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
+ U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
+ U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
+ U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
+ U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
+ U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
+ U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
+ U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
+ U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
+ U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
+ U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
+ U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
+ U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
+ U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
+ U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
+ U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
+ U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
+ U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
+ U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
+ U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
+ U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
+ U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
+ U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
+};
+static const u8 Td4[256] = {
+ 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
+ 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
+ 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
+ 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
+ 0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
+ 0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
+ 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
+ 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
+ 0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
+ 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
+ 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
+ 0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
+ 0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
+ 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
+ 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
+ 0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
+ 0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
+ 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
+ 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
+ 0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
+ 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
+ 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
+ 0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
+ 0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
+ 0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
+ 0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
+ 0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
+ 0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
+ 0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
+ 0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
+ 0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
+ 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
+};
+
+static const u32 rcon[] = {
+ 0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
+ 0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
+ 0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+{
+
+ u32 *rk;
+ int i = 0;
+ u32 temp;
+
+ if (!userKey || !key)
+ return -1;
+ if (bits != 128 && bits != 192 && bits != 256)
+ return -2;
+
+ rk = key->rd_key;
+
+ if (bits==128)
+ key->rounds = 10;
+ else if (bits==192)
+ key->rounds = 12;
+ else
+ key->rounds = 14;
+
+ rk[0] = GETU32(userKey );
+ rk[1] = GETU32(userKey + 4);
+ rk[2] = GETU32(userKey + 8);
+ rk[3] = GETU32(userKey + 12);
+ if (bits == 128) {
+ while (1) {
+ temp = rk[3];
+ rk[4] = rk[0] ^
+ ((u32)Te4[(temp >> 8) & 0xff] ) ^
+ ((u32)Te4[(temp >> 16) & 0xff] << 8) ^
+ ((u32)Te4[(temp >> 24) ] << 16) ^
+ ((u32)Te4[(temp ) & 0xff] << 24) ^
+ rcon[i];
+ rk[5] = rk[1] ^ rk[4];
+ rk[6] = rk[2] ^ rk[5];
+ rk[7] = rk[3] ^ rk[6];
+ if (++i == 10) {
+ return 0;
+ }
+ rk += 4;
+ }
+ }
+ rk[4] = GETU32(userKey + 16);
+ rk[5] = GETU32(userKey + 20);
+ if (bits == 192) {
+ while (1) {
+ temp = rk[ 5];
+ rk[ 6] = rk[ 0] ^
+ ((u32)Te4[(temp >> 8) & 0xff] ) ^
+ ((u32)Te4[(temp >> 16) & 0xff] << 8) ^
+ ((u32)Te4[(temp >> 24) ] << 16) ^
+ ((u32)Te4[(temp ) & 0xff] << 24) ^
+ rcon[i];
+ rk[ 7] = rk[ 1] ^ rk[ 6];
+ rk[ 8] = rk[ 2] ^ rk[ 7];
+ rk[ 9] = rk[ 3] ^ rk[ 8];
+ if (++i == 8) {
+ return 0;
+ }
+ rk[10] = rk[ 4] ^ rk[ 9];
+ rk[11] = rk[ 5] ^ rk[10];
+ rk += 6;
+ }
+ }
+ rk[6] = GETU32(userKey + 24);
+ rk[7] = GETU32(userKey + 28);
+ if (bits == 256) {
+ while (1) {
+ temp = rk[ 7];
+ rk[ 8] = rk[ 0] ^
+ ((u32)Te4[(temp >> 8) & 0xff] ) ^
+ ((u32)Te4[(temp >> 16) & 0xff] << 8) ^
+ ((u32)Te4[(temp >> 24) ] << 16) ^
+ ((u32)Te4[(temp ) & 0xff] << 24) ^
+ rcon[i];
+ rk[ 9] = rk[ 1] ^ rk[ 8];
+ rk[10] = rk[ 2] ^ rk[ 9];
+ rk[11] = rk[ 3] ^ rk[10];
+ if (++i == 7) {
+ return 0;
+ }
+ temp = rk[11];
+ rk[12] = rk[ 4] ^
+ ((u32)Te4[(temp ) & 0xff] ) ^
+ ((u32)Te4[(temp >> 8) & 0xff] << 8) ^
+ ((u32)Te4[(temp >> 16) & 0xff] << 16) ^
+ ((u32)Te4[(temp >> 24) ] << 24);
+ rk[13] = rk[ 5] ^ rk[12];
+ rk[14] = rk[ 6] ^ rk[13];
+ rk[15] = rk[ 7] ^ rk[14];
+
+ rk += 8;
+ }
+ }
+ return 0;
+}
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+{
+
+ u32 *rk;
+ int i, j, status;
+ u32 temp;
+
+ /* first, start with an encryption schedule */
+ status = AES_set_encrypt_key(userKey, bits, key);
+ if (status < 0)
+ return status;
+
+ rk = key->rd_key;
+
+ /* invert the order of the round keys: */
+ for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
+ temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
+ temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+ temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+ temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+ }
+ /* apply the inverse MixColumn transform to all round keys but the first and the last: */
+ for (i = 1; i < (key->rounds); i++) {
+ rk += 4;
+#if 1
+ for (j = 0; j < 4; j++) {
+ u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
+
+ tp1 = rk[j];
+ m = tp1 & 0x80808080;
+ tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ m = tp2 & 0x80808080;
+ tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ m = tp4 & 0x80808080;
+ tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ tp9 = tp8 ^ tp1;
+ tpb = tp9 ^ tp2;
+ tpd = tp9 ^ tp4;
+ tpe = tp8 ^ tp4 ^ tp2;
+#if defined(ROTATE)
+ rk[j] = tpe ^ ROTATE(tpd,16) ^
+ ROTATE(tp9,8) ^ ROTATE(tpb,24);
+#else
+ rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+ (tp9 >> 24) ^ (tp9 << 8) ^
+ (tpb >> 8) ^ (tpb << 24);
+#endif
+ }
+#else
+ rk[0] =
+ Td0[Te2[(rk[0] ) & 0xff] & 0xff] ^
+ Td1[Te2[(rk[0] >> 8) & 0xff] & 0xff] ^
+ Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
+ Td3[Te2[(rk[0] >> 24) ] & 0xff];
+ rk[1] =
+ Td0[Te2[(rk[1] ) & 0xff] & 0xff] ^
+ Td1[Te2[(rk[1] >> 8) & 0xff] & 0xff] ^
+ Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
+ Td3[Te2[(rk[1] >> 24) ] & 0xff];
+ rk[2] =
+ Td0[Te2[(rk[2] ) & 0xff] & 0xff] ^
+ Td1[Te2[(rk[2] >> 8) & 0xff] & 0xff] ^
+ Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
+ Td3[Te2[(rk[2] >> 24) ] & 0xff];
+ rk[3] =
+ Td0[Te2[(rk[3] ) & 0xff] & 0xff] ^
+ Td1[Te2[(rk[3] >> 8) & 0xff] & 0xff] ^
+ Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
+ Td3[Te2[(rk[3] >> 24) ] & 0xff];
+#endif
+ }
+ return 0;
+}
+
+/*
+ * Encrypt a single block
+ * in and out can overlap
+ */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key)
+{
+
+ const u32 *rk;
+ u32 s0, s1, s2, s3, t[4];
+ int r;
+
+ assert(in && out && key);
+ rk = key->rd_key;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(in ) ^ rk[0];
+ s1 = GETU32(in + 4) ^ rk[1];
+ s2 = GETU32(in + 8) ^ rk[2];
+ s3 = GETU32(in + 12) ^ rk[3];
+
+#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
+ prefetch256(Te4);
+
+ t[0] = (u32)Te4[(s0 ) & 0xff] ^
+ (u32)Te4[(s1 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s2 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s3 >> 24) ] << 24;
+ t[1] = (u32)Te4[(s1 ) & 0xff] ^
+ (u32)Te4[(s2 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s3 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s0 >> 24) ] << 24;
+ t[2] = (u32)Te4[(s2 ) & 0xff] ^
+ (u32)Te4[(s3 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s0 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s1 >> 24) ] << 24;
+ t[3] = (u32)Te4[(s3 ) & 0xff] ^
+ (u32)Te4[(s0 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s1 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s2 >> 24) ] << 24;
+
+ /* now do the linear transform using words */
+ { int i;
+ u32 r0, r1, r2;
+
+ for (i = 0; i < 4; i++) {
+ r0 = t[i];
+ r1 = r0 & 0x80808080;
+ r2 = ((r0 & 0x7f7f7f7f) << 1) ^
+ ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
+#if defined(ROTATE)
+ t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
+ ROTATE(r0,16) ^ ROTATE(r0,8);
+#else
+ t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
+ (r0 << 16) ^ (r0 >> 16) ^
+ (r0 << 8) ^ (r0 >> 24);
+#endif
+ t[i] ^= rk[4+i];
+ }
+ }
+#else
+ t[0] = Te0[(s0 ) & 0xff] ^
+ Te1[(s1 >> 8) & 0xff] ^
+ Te2[(s2 >> 16) & 0xff] ^
+ Te3[(s3 >> 24) ] ^
+ rk[4];
+ t[1] = Te0[(s1 ) & 0xff] ^
+ Te1[(s2 >> 8) & 0xff] ^
+ Te2[(s3 >> 16) & 0xff] ^
+ Te3[(s0 >> 24) ] ^
+ rk[5];
+ t[2] = Te0[(s2 ) & 0xff] ^
+ Te1[(s3 >> 8) & 0xff] ^
+ Te2[(s0 >> 16) & 0xff] ^
+ Te3[(s1 >> 24) ] ^
+ rk[6];
+ t[3] = Te0[(s3 ) & 0xff] ^
+ Te1[(s0 >> 8) & 0xff] ^
+ Te2[(s1 >> 16) & 0xff] ^
+ Te3[(s2 >> 24) ] ^
+ rk[7];
+#endif
+ s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
+
+ /*
+ * Nr - 2 full rounds:
+ */
+ for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
+#if defined(AES_COMPACT_IN_INNER_ROUNDS)
+ t[0] = (u32)Te4[(s0 ) & 0xff] ^
+ (u32)Te4[(s1 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s2 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s3 >> 24) ] << 24;
+ t[1] = (u32)Te4[(s1 ) & 0xff] ^
+ (u32)Te4[(s2 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s3 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s0 >> 24) ] << 24;
+ t[2] = (u32)Te4[(s2 ) & 0xff] ^
+ (u32)Te4[(s3 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s0 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s1 >> 24) ] << 24;
+ t[3] = (u32)Te4[(s3 ) & 0xff] ^
+ (u32)Te4[(s0 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s1 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s2 >> 24) ] << 24;
+
+ /* now do the linear transform using words */
+ {
+ int i;
+ u32 r0, r1, r2;
+
+ for (i = 0; i < 4; i++) {
+ r0 = t[i];
+ r1 = r0 & 0x80808080;
+ r2 = ((r0 & 0x7f7f7f7f) << 1) ^
+ ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
+#if defined(ROTATE)
+ t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
+ ROTATE(r0,16) ^ ROTATE(r0,8);
+#else
+ t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
+ (r0 << 16) ^ (r0 >> 16) ^
+ (r0 << 8) ^ (r0 >> 24);
+#endif
+ t[i] ^= rk[i];
+ }
+ }
+#else
+ t[0] = Te0[(s0 ) & 0xff] ^
+ Te1[(s1 >> 8) & 0xff] ^
+ Te2[(s2 >> 16) & 0xff] ^
+ Te3[(s3 >> 24) ] ^
+ rk[0];
+ t[1] = Te0[(s1 ) & 0xff] ^
+ Te1[(s2 >> 8) & 0xff] ^
+ Te2[(s3 >> 16) & 0xff] ^
+ Te3[(s0 >> 24) ] ^
+ rk[1];
+ t[2] = Te0[(s2 ) & 0xff] ^
+ Te1[(s3 >> 8) & 0xff] ^
+ Te2[(s0 >> 16) & 0xff] ^
+ Te3[(s1 >> 24) ] ^
+ rk[2];
+ t[3] = Te0[(s3 ) & 0xff] ^
+ Te1[(s0 >> 8) & 0xff] ^
+ Te2[(s1 >> 16) & 0xff] ^
+ Te3[(s2 >> 24) ] ^
+ rk[3];
+#endif
+ s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
+ }
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
+ prefetch256(Te4);
+
+ *(u32*)(out+0) =
+ (u32)Te4[(s0 ) & 0xff] ^
+ (u32)Te4[(s1 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s2 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s3 >> 24) ] << 24 ^
+ rk[0];
+ *(u32*)(out+4) =
+ (u32)Te4[(s1 ) & 0xff] ^
+ (u32)Te4[(s2 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s3 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s0 >> 24) ] << 24 ^
+ rk[1];
+ *(u32*)(out+8) =
+ (u32)Te4[(s2 ) & 0xff] ^
+ (u32)Te4[(s3 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s0 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s1 >> 24) ] << 24 ^
+ rk[2];
+ *(u32*)(out+12) =
+ (u32)Te4[(s3 ) & 0xff] ^
+ (u32)Te4[(s0 >> 8) & 0xff] << 8 ^
+ (u32)Te4[(s1 >> 16) & 0xff] << 16 ^
+ (u32)Te4[(s2 >> 24) ] << 24 ^
+ rk[3];
+#else
+ *(u32*)(out+0) =
+ (Te2[(s0 ) & 0xff] & 0x000000ffU) ^
+ (Te3[(s1 >> 8) & 0xff] & 0x0000ff00U) ^
+ (Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
+ (Te1[(s3 >> 24) ] & 0xff000000U) ^
+ rk[0];
+ *(u32*)(out+4) =
+ (Te2[(s1 ) & 0xff] & 0x000000ffU) ^
+ (Te3[(s2 >> 8) & 0xff] & 0x0000ff00U) ^
+ (Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
+ (Te1[(s0 >> 24) ] & 0xff000000U) ^
+ rk[1];
+ *(u32*)(out+8) =
+ (Te2[(s2 ) & 0xff] & 0x000000ffU) ^
+ (Te3[(s3 >> 8) & 0xff] & 0x0000ff00U) ^
+ (Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
+ (Te1[(s1 >> 24) ] & 0xff000000U) ^
+ rk[2];
+ *(u32*)(out+12) =
+ (Te2[(s3 ) & 0xff] & 0x000000ffU) ^
+ (Te3[(s0 >> 8) & 0xff] & 0x0000ff00U) ^
+ (Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
+ (Te1[(s2 >> 24) ] & 0xff000000U) ^
+ rk[3];
+#endif
+}
+
+/*
+ * Decrypt a single block
+ * in and out can overlap
+ */
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key)
+{
+
+ const u32 *rk;
+ u32 s0, s1, s2, s3, t[4];
+ int r;
+
+ assert(in && out && key);
+ rk = key->rd_key;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(in ) ^ rk[0];
+ s1 = GETU32(in + 4) ^ rk[1];
+ s2 = GETU32(in + 8) ^ rk[2];
+ s3 = GETU32(in + 12) ^ rk[3];
+
+#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
+ prefetch256(Td4);
+
+ t[0] = (u32)Td4[(s0 ) & 0xff] ^
+ (u32)Td4[(s3 >> 8) & 0xff] << 8 ^
+ (u32)Td4[(s2 >> 16) & 0xff] << 16 ^
+ (u32)Td4[(s1 >> 24) ] << 24;
+ t[1] = (u32)Td4[(s1 ) & 0xff] ^
+ (u32)Td4[(s0 >> 8) & 0xff] << 8 ^
+ (u32)Td4[(s3 >> 16) & 0xff] << 16 ^
+ (u32)Td4[(s2 >> 24) ] << 24;
+ t[2] = (u32)Td4[(s2 ) & 0xff] ^
+ (u32)Td4[(s1 >> 8) & 0xff] << 8 ^
+ (u32)Td4[(s0 >> 16) & 0xff] << 16 ^
+ (u32)Td4[(s3 >> 24) ] << 24;
+ t[3] = (u32)Td4[(s3 ) & 0xff] ^
+ (u32)Td4[(s2 >> 8) & 0xff] << 8 ^
+ (u32)Td4[(s1 >> 16) & 0xff] << 16 ^
+ (u32)Td4[(s0 >> 24) ] << 24;
+
+ /* now do the linear transform using words */
+ {
+ int i;
+ u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
+
+ for (i = 0; i < 4; i++) {
+ tp1 = t[i];
+ m = tp1 & 0x80808080;
+ tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ m = tp2 & 0x80808080;
+ tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ m = tp4 & 0x80808080;
+ tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ tp9 = tp8 ^ tp1;
+ tpb = tp9 ^ tp2;
+ tpd = tp9 ^ tp4;
+ tpe = tp8 ^ tp4 ^ tp2;
+#if defined(ROTATE)
+ t[i] = tpe ^ ROTATE(tpd,16) ^
+ ROTATE(tp9,8) ^ ROTATE(tpb,24);
+#else
+ t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+ (tp9 >> 24) ^ (tp9 << 8) ^
+ (tpb >> 8) ^ (tpb << 24);
+#endif
+ t[i] ^= rk[4+i];
+ }
+ }
+#else
+ t[0] = Td0[(s0 ) & 0xff] ^
+ Td1[(s3 >> 8) & 0xff] ^
+ Td2[(s2 >> 16) & 0xff] ^
+ Td3[(s1 >> 24) ] ^
+ rk[4];
+ t[1] = Td0[(s1 ) & 0xff] ^
+ Td1[(s0 >> 8) & 0xff] ^
+ Td2[(s3 >> 16) & 0xff] ^
+ Td3[(s2 >> 24) ] ^
+ rk[5];
+ t[2] = Td0[(s2 ) & 0xff] ^
+ Td1[(s1 >> 8) & 0xff] ^
+ Td2[(s0 >> 16) & 0xff] ^
+ Td3[(s3 >> 24) ] ^
+ rk[6];
+ t[3] = Td0[(s3 ) & 0xff] ^
+ Td1[(s2 >> 8) & 0xff] ^
+ Td2[(s1 >> 16) & 0xff] ^
+ Td3[(s0 >> 24) ] ^
+ rk[7];
+#endif
+ s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
+
+ /*
+ * Nr - 2 full rounds:
+ */
+ for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
+#if defined(AES_COMPACT_IN_INNER_ROUNDS)
+ t[0] = (u32)Td4[(s0 ) & 0xff] ^
+ (u32)Td4[(s3 >> 8) & 0xff] << 8 ^
+ (u32)Td4[(s2 >> 16) & 0xff] << 16 ^
+ (u32)Td4[(s1 >> 24) ] << 24;
+ t[1] = (u32)Td4[(s1 ) & 0xff] ^
+ (u32)Td4[(s0 >> 8) & 0xff] << 8 ^
+ (u32)Td4[(s3 >> 16) & 0xff] << 16 ^
+ (u32)Td4[(s2 >> 24) ] << 24;
+ t[2] = (u32)Td4[(s2 ) & 0xff] ^
+ (u32)Td4[(s1 >> 8) & 0xff] << 8 ^
+ (u32)Td4[(s0 >> 16) & 0xff] << 16 ^
+ (u32)Td4[(s3 >> 24) ] << 24;
+ t[3] = (u32)Td4[(s3 ) & 0xff] ^
+ (u32)Td4[(s2 >> 8) & 0xff] << 8 ^
+ (u32)Td4[(s1 >> 16) & 0xff] << 16 ^
+ (u32)Td4[(s0 >> 24) ] << 24;
+
+ /* now do the linear transform using words */
+ {
+ int i;
+ u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
+
+ for (i = 0; i < 4; i++) {
+ tp1 = t[i];
+ m = tp1 & 0x80808080;
+ tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ m = tp2 & 0x80808080;
+ tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ m = tp4 & 0x80808080;
+ tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
+ ((m - (m >> 7)) & 0x1b1b1b1b);
+ tp9 = tp8 ^ tp1;
+ tpb = tp9 ^ tp2;
+ tpd = tp9 ^ tp4;
+ tpe = tp8 ^ tp4 ^ tp2;
+#if defined(ROTATE)
+ t[i] = tpe ^ ROTATE(tpd,16) ^
+ ROTATE(tp9,8) ^ ROTATE(tpb,24);
+#else
+ t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
+ (tp9 >> 24) ^ (tp9 << 8) ^
+ (tpb >> 8) ^ (tpb << 24);
+#endif
+ t[i] ^= rk[i];
+ }
+ }
+#else
+ t[0] = Td0[(s0 ) & 0xff] ^
+ Td1[(s3 >> 8) & 0xff] ^
+ Td2[(s2 >> 16) & 0xff] ^
+ Td3[(s1 >> 24) ] ^
+ rk[0];
+ t[1] = Td0[(s1 ) & 0xff] ^
+ Td1[(s0 >> 8) & 0xff] ^
+ Td2[(s3 >> 16) & 0xff] ^
+ Td3[(s2 >> 24) ] ^
+ rk[1];
+ t[2] = Td0[(s2 ) & 0xff] ^
+ Td1[(s1 >> 8) & 0xff] ^
+ Td2[(s0 >> 16) & 0xff] ^
+ Td3[(s3 >> 24) ] ^
+ rk[2];
+ t[3] = Td0[(s3 ) & 0xff] ^
+ Td1[(s2 >> 8) & 0xff] ^
+ Td2[(s1 >> 16) & 0xff] ^
+ Td3[(s0 >> 24) ] ^
+ rk[3];
+#endif
+ s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
+ }
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ prefetch256(Td4);
+
+ *(u32*)(out+0) =
+ ((u32)Td4[(s0 ) & 0xff]) ^
+ ((u32)Td4[(s3 >> 8) & 0xff] << 8) ^
+ ((u32)Td4[(s2 >> 16) & 0xff] << 16) ^
+ ((u32)Td4[(s1 >> 24) ] << 24) ^
+ rk[0];
+ *(u32*)(out+4) =
+ ((u32)Td4[(s1 ) & 0xff]) ^
+ ((u32)Td4[(s0 >> 8) & 0xff] << 8) ^
+ ((u32)Td4[(s3 >> 16) & 0xff] << 16) ^
+ ((u32)Td4[(s2 >> 24) ] << 24) ^
+ rk[1];
+ *(u32*)(out+8) =
+ ((u32)Td4[(s2 ) & 0xff]) ^
+ ((u32)Td4[(s1 >> 8) & 0xff] << 8) ^
+ ((u32)Td4[(s0 >> 16) & 0xff] << 16) ^
+ ((u32)Td4[(s3 >> 24) ] << 24) ^
+ rk[2];
+ *(u32*)(out+12) =
+ ((u32)Td4[(s3 ) & 0xff]) ^
+ ((u32)Td4[(s2 >> 8) & 0xff] << 8) ^
+ ((u32)Td4[(s1 >> 16) & 0xff] << 16) ^
+ ((u32)Td4[(s0 >> 24) ] << 24) ^
+ rk[3];
+}
diff --git a/openssl-1.1.0h/crypto/aes/asm/aes-586.pl b/openssl-1.1.0h/crypto/aes/asm/aes-586.pl
new file mode 100755
index 0000000..1ba3565
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aes-586.pl
@@ -0,0 +1,3000 @@
+#! /usr/bin/env perl
+# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Version 4.3.
+#
+# You might fail to appreciate this module performance from the first
+# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
+# to be *the* best Intel C compiler without -KPIC, performance appears
+# to be virtually identical... But try to re-configure with shared
+# library support... Aha! Intel compiler "suddenly" lags behind by 30%
+# [on P4, more on others]:-) And if compared to position-independent
+# code generated by GNU C, this code performs *more* than *twice* as
+# fast! Yes, all this buzz about PIC means that unlike other hand-
+# coded implementations, this one was explicitly designed to be safe
+# to use even in shared library context... This also means that this
+# code isn't necessarily absolutely fastest "ever," because in order
+# to achieve position independence an extra register has to be
+# off-loaded to stack, which affects the benchmark result.
+#
+# Special note about instruction choice. Do you recall RC4_INT code
+# performing poorly on P4? It might be the time to figure out why.
+# RC4_INT code implies effective address calculations in base+offset*4
+# form. Trouble is that it seems that offset scaling turned to be
+# critical path... At least eliminating scaling resulted in 2.8x RC4
+# performance improvement [as you might recall]. As AES code is hungry
+# for scaling too, I [try to] avoid the latter by favoring off-by-2
+# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
+#
+# As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
+# void. Performance improvement with off-by-2 shifts was observed on
+# intermediate implementation, which was spilling yet another register
+# to stack... Final offset*4 code below runs just a tad faster on P4,
+# but exhibits up to 10% improvement on other cores.
+#
+# Second version is "monolithic" replacement for aes_core.c, which in
+# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
+# This made it possible to implement little-endian variant of the
+# algorithm without modifying the base C code. Motivating factor for
+# the undertaken effort was that it appeared that in tight IA-32
+# register window little-endian flavor could achieve slightly higher
+# Instruction Level Parallelism, and it indeed resulted in up to 15%
+# better performance on most recent µ-archs...
+#
+# Third version adds AES_cbc_encrypt implementation, which resulted in
+# up to 40% performance imrovement of CBC benchmark results. 40% was
+# observed on P4 core, where "overall" imrovement coefficient, i.e. if
+# compared to PIC generated by GCC and in CBC mode, was observed to be
+# as large as 4x:-) CBC performance is virtually identical to ECB now
+# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
+# Opteron, because certain function prologues and epilogues are
+# effectively taken out of the loop...
+#
+# Version 3.2 implements compressed tables and prefetch of these tables
+# in CBC[!] mode. Former means that 3/4 of table references are now
+# misaligned, which unfortunately has negative impact on elder IA-32
+# implementations, Pentium suffered 30% penalty, PIII - 10%.
+#
+# Version 3.3 avoids L1 cache aliasing between stack frame and
+# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
+# latter is achieved by copying the key schedule to controlled place in
+# stack. This unfortunately has rather strong impact on small block CBC
+# performance, ~2x deterioration on 16-byte block if compared to 3.3.
+#
+# Version 3.5 checks if there is L1 cache aliasing between user-supplied
+# key schedule and S-boxes and abstains from copying the former if
+# there is no. This allows end-user to consciously retain small block
+# performance by aligning key schedule in specific manner.
+#
+# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
+#
+# Current ECB performance numbers for 128-bit key in CPU cycles per
+# processed byte [measure commonly used by AES benchmarkers] are:
+#
+# small footprint fully unrolled
+# P4 24 22
+# AMD K8 20 19
+# PIII 25 23
+# Pentium 81 78
+#
+# Version 3.7 reimplements outer rounds as "compact." Meaning that
+# first and last rounds reference compact 256 bytes S-box. This means
+# that first round consumes a lot more CPU cycles and that encrypt
+# and decrypt performance becomes asymmetric. Encrypt performance
+# drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
+# aggressively pre-fetched.
+#
+# Version 4.0 effectively rolls back to 3.6 and instead implements
+# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
+# which use exclusively 256 byte S-box. These functions are to be
+# called in modes not concealing plain text, such as ECB, or when
+# we're asked to process smaller amount of data [or unconditionally
+# on hyper-threading CPU]. Currently it's called unconditionally from
+# AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
+# still needs to be modified to switch between slower and faster
+# mode when appropriate... But in either case benchmark landscape
+# changes dramatically and below numbers are CPU cycles per processed
+# byte for 128-bit key.
+#
+# ECB encrypt ECB decrypt CBC large chunk
+# P4 52[54] 83[95] 23
+# AMD K8 46[41] 66[70] 18
+# PIII 41[50] 60[77] 24
+# Core 2 31[36] 45[64] 18.5
+# Atom 76[100] 96[138] 60
+# Pentium 115 150 77
+#
+# Version 4.1 switches to compact S-box even in key schedule setup.
+#
+# Version 4.2 prefetches compact S-box in every SSE round or in other
+# words every cache-line is *guaranteed* to be accessed within ~50
+# cycles window. Why just SSE? Because it's needed on hyper-threading
+# CPU! Which is also why it's prefetched with 64 byte stride. Best
+# part is that it has no negative effect on performance:-)
+#
+# Version 4.3 implements switch between compact and non-compact block
+# functions in AES_cbc_encrypt depending on how much data was asked
+# to be processed in one stroke.
+#
+######################################################################
+# Timing attacks are classified in two classes: synchronous when
+# attacker consciously initiates cryptographic operation and collects
+# timing data of various character afterwards, and asynchronous when
+# malicious code is executed on same CPU simultaneously with AES,
+# instruments itself and performs statistical analysis of this data.
+#
+# As far as synchronous attacks go the root to the AES timing
+# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
+# are referred to in single 128-bit block operation. Well, in C
+# implementation with 4 distinct tables it's actually as little as 40
+# references per 256 elements table, but anyway... Secondly, even
+# though S-box elements are clustered into smaller amount of cache-
+# lines, smaller than 160 and even 40, it turned out that for certain
+# plain-text pattern[s] or simply put chosen plain-text and given key
+# few cache-lines remain unaccessed during block operation. Now, if
+# attacker can figure out this access pattern, he can deduct the key
+# [or at least part of it]. The natural way to mitigate this kind of
+# attacks is to minimize the amount of cache-lines in S-box and/or
+# prefetch them to ensure that every one is accessed for more uniform
+# timing. But note that *if* plain-text was concealed in such way that
+# input to block function is distributed *uniformly*, then attack
+# wouldn't apply. Now note that some encryption modes, most notably
+# CBC, do mask the plain-text in this exact way [secure cipher output
+# is distributed uniformly]. Yes, one still might find input that
+# would reveal the information about given key, but if amount of
+# candidate inputs to be tried is larger than amount of possible key
+# combinations then attack becomes infeasible. This is why revised
+# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
+# of data is to be processed in one stroke. The current size limit of
+# 512 bytes is chosen to provide same [diminishigly low] probability
+# for cache-line to remain untouched in large chunk operation with
+# large S-box as for single block operation with compact S-box and
+# surely needs more careful consideration...
+#
+# As for asynchronous attacks. There are two flavours: attacker code
+# being interleaved with AES on hyper-threading CPU at *instruction*
+# level, and two processes time sharing single core. As for latter.
+# Two vectors. 1. Given that attacker process has higher priority,
+# yield execution to process performing AES just before timer fires
+# off the scheduler, immediately regain control of CPU and analyze the
+# cache state. For this attack to be efficient attacker would have to
+# effectively slow down the operation by several *orders* of magnitute,
+# by ratio of time slice to duration of handful of AES rounds, which
+# unlikely to remain unnoticed. Not to mention that this also means
+# that he would spend correspondigly more time to collect enough
+# statistical data to mount the attack. It's probably appropriate to
+# say that if adeversary reckons that this attack is beneficial and
+# risks to be noticed, you probably have larger problems having him
+# mere opportunity. In other words suggested code design expects you
+# to preclude/mitigate this attack by overall system security design.
+# 2. Attacker manages to make his code interrupt driven. In order for
+# this kind of attack to be feasible, interrupt rate has to be high
+# enough, again comparable to duration of handful of AES rounds. But
+# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
+# generates interrupts at such raging rate...
+#
+# And now back to the former, hyper-threading CPU or more specifically
+# Intel P4. Recall that asynchronous attack implies that malicious
+# code instruments itself. And naturally instrumentation granularity
+# has be noticeably lower than duration of codepath accessing S-box.
+# Given that all cache-lines are accessed during that time that is.
+# Current implementation accesses *all* cache-lines within ~50 cycles
+# window, which is actually *less* than RDTSC latency on Intel P4!
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+$output = pop;
+open OUT,">$output";
+*STDOUT=*OUT;
+
+&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
+&static_label("AES_Te");
+&static_label("AES_Td");
+
+$s0="eax";
+$s1="ebx";
+$s2="ecx";
+$s3="edx";
+$key="edi";
+$acc="esi";
+$tbl="ebp";
+
+# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
+# by caller
+$__ra=&DWP(0,"esp"); # return address
+$__s0=&DWP(4,"esp"); # s0 backing store
+$__s1=&DWP(8,"esp"); # s1 backing store
+$__s2=&DWP(12,"esp"); # s2 backing store
+$__s3=&DWP(16,"esp"); # s3 backing store
+$__key=&DWP(20,"esp"); # pointer to key schedule
+$__end=&DWP(24,"esp"); # pointer to end of key schedule
+$__tbl=&DWP(28,"esp"); # %ebp backing store
+
+# stack frame layout in AES_[en|crypt] routines, which differs from
+# above by 4 and overlaps by %ebp backing store
+$_tbl=&DWP(24,"esp");
+$_esp=&DWP(28,"esp");
+
+sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
+
+$speed_limit=512; # chunks smaller than $speed_limit are
+ # processed with compact routine in CBC mode
+$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
+ # recent µ-archs], but ~5 times smaller!
+ # I favor compact code to minimize cache
+ # contention and in hope to "collect" 5% back
+ # in real-life applications...
+
+$vertical_spin=0; # shift "verticaly" defaults to 0, because of
+ # its proof-of-concept status...
+# Note that there is no decvert(), as well as last encryption round is
+# performed with "horizontal" shifts. This is because this "vertical"
+# implementation [one which groups shifts on a given $s[i] to form a
+# "column," unlike "horizontal" one, which groups shifts on different
+# $s[i] to form a "row"] is work in progress. It was observed to run
+# few percents faster on Intel cores, but not AMD. On AMD K8 core it's
+# whole 12% slower:-( So we face a trade-off... Shall it be resolved
+# some day? Till then the code is considered experimental and by
+# default remains dormant...
+
+sub encvert()
+{ my ($te,@s) = @_;
+ my ($v0,$v1) = ($acc,$key);
+
+ &mov ($v0,$s[3]); # copy s3
+ &mov (&DWP(4,"esp"),$s[2]); # save s2
+ &mov ($v1,$s[0]); # copy s0
+ &mov (&DWP(8,"esp"),$s[1]); # save s1
+
+ &movz ($s[2],&HB($s[0]));
+ &and ($s[0],0xFF);
+ &mov ($s[0],&DWP(0,$te,$s[0],8)); # s0>>0
+ &shr ($v1,16);
+ &mov ($s[3],&DWP(3,$te,$s[2],8)); # s0>>8
+ &movz ($s[1],&HB($v1));
+ &and ($v1,0xFF);
+ &mov ($s[2],&DWP(2,$te,$v1,8)); # s0>>16
+ &mov ($v1,$v0);
+ &mov ($s[1],&DWP(1,$te,$s[1],8)); # s0>>24
+
+ &and ($v0,0xFF);
+ &xor ($s[3],&DWP(0,$te,$v0,8)); # s3>>0
+ &movz ($v0,&HB($v1));
+ &shr ($v1,16);
+ &xor ($s[2],&DWP(3,$te,$v0,8)); # s3>>8
+ &movz ($v0,&HB($v1));
+ &and ($v1,0xFF);
+ &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16
+ &mov ($v1,&DWP(4,"esp")); # restore s2
+ &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24
+
+ &mov ($v0,$v1);
+ &and ($v1,0xFF);
+ &xor ($s[2],&DWP(0,$te,$v1,8)); # s2>>0
+ &movz ($v1,&HB($v0));
+ &shr ($v0,16);
+ &xor ($s[1],&DWP(3,$te,$v1,8)); # s2>>8
+ &movz ($v1,&HB($v0));
+ &and ($v0,0xFF);
+ &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16
+ &mov ($v0,&DWP(8,"esp")); # restore s1
+ &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24
+
+ &mov ($v1,$v0);
+ &and ($v0,0xFF);
+ &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0
+ &movz ($v0,&HB($v1));
+ &shr ($v1,16);
+ &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8
+ &movz ($v0,&HB($v1));
+ &and ($v1,0xFF);
+ &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
+ &mov ($key,$__key); # reincarnate v1 as key
+ &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
+}
+
+# Another experimental routine, which features "horizontal spin," but
+# eliminates one reference to stack. Strangely enough runs slower...
+sub enchoriz()
+{ my ($v0,$v1) = ($key,$acc);
+
+ &movz ($v0,&LB($s0)); # 3, 2, 1, 0*
+ &rotr ($s2,8); # 8,11,10, 9
+ &mov ($v1,&DWP(0,$te,$v0,8)); # 0
+ &movz ($v0,&HB($s1)); # 7, 6, 5*, 4
+ &rotr ($s3,16); # 13,12,15,14
+ &xor ($v1,&DWP(3,$te,$v0,8)); # 5
+ &movz ($v0,&HB($s2)); # 8,11,10*, 9
+ &rotr ($s0,16); # 1, 0, 3, 2
+ &xor ($v1,&DWP(2,$te,$v0,8)); # 10
+ &movz ($v0,&HB($s3)); # 13,12,15*,14
+ &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected
+ &mov ($__s0,$v1); # t[0] saved
+
+ &movz ($v0,&LB($s1)); # 7, 6, 5, 4*
+ &shr ($s1,16); # -, -, 7, 6
+ &mov ($v1,&DWP(0,$te,$v0,8)); # 4
+ &movz ($v0,&LB($s3)); # 13,12,15,14*
+ &xor ($v1,&DWP(2,$te,$v0,8)); # 14
+ &movz ($v0,&HB($s0)); # 1, 0, 3*, 2
+ &and ($s3,0xffff0000); # 13,12, -, -
+ &xor ($v1,&DWP(1,$te,$v0,8)); # 3
+ &movz ($v0,&LB($s2)); # 8,11,10, 9*
+ &or ($s3,$s1); # 13,12, 7, 6
+ &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected
+ &mov ($s1,$v1); # s[1]=t[1]
+
+ &movz ($v0,&LB($s0)); # 1, 0, 3, 2*
+ &shr ($s2,16); # -, -, 8,11
+ &mov ($v1,&DWP(2,$te,$v0,8)); # 2
+ &movz ($v0,&HB($s3)); # 13,12, 7*, 6
+ &xor ($v1,&DWP(1,$te,$v0,8)); # 7
+ &movz ($v0,&HB($s2)); # -, -, 8*,11
+ &xor ($v1,&DWP(0,$te,$v0,8)); # 8
+ &mov ($v0,$s3);
+ &shr ($v0,24); # 13
+ &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected
+
+ &movz ($v0,&LB($s2)); # -, -, 8,11*
+ &shr ($s0,24); # 1*
+ &mov ($s2,&DWP(1,$te,$v0,8)); # 11
+ &xor ($s2,&DWP(3,$te,$s0,8)); # 1
+ &mov ($s0,$__s0); # s[0]=t[0]
+ &movz ($v0,&LB($s3)); # 13,12, 7, 6*
+ &shr ($s3,16); # , ,13,12
+ &xor ($s2,&DWP(2,$te,$v0,8)); # 6
+ &mov ($key,$__key); # reincarnate v0 as key
+ &and ($s3,0xff); # , ,13,12*
+ &mov ($s3,&DWP(0,$te,$s3,8)); # 12
+ &xor ($s3,$s2); # s[2]=t[3] collected
+ &mov ($s2,$v1); # s[2]=t[2]
+}
+
+# More experimental code... SSE one... Even though this one eliminates
+# *all* references to stack, it's not faster...
+sub sse_encbody()
+{
+ &movz ($acc,&LB("eax")); # 0
+ &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
+ &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
+ &movz ("edx",&HB("eax")); # 1
+ &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1
+ &shr ("eax",16); # 5, 4
+
+ &movz ($acc,&LB("ebx")); # 10
+ &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10
+ &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
+ &movz ($acc,&HB("ebx")); # 11
+ &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11
+ &shr ("ebx",16); # 15,14
+
+ &movz ($acc,&HB("eax")); # 5
+ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5
+ &movq ("mm3",QWP(16,$key));
+ &movz ($acc,&HB("ebx")); # 15
+ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15
+ &movd ("mm0","ecx"); # t[0] collected
+
+ &movz ($acc,&LB("eax")); # 4
+ &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4
+ &movd ("eax","mm2"); # 7, 6, 3, 2
+ &movz ($acc,&LB("ebx")); # 14
+ &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14
+ &movd ("ebx","mm6"); # 13,12, 9, 8
+
+ &movz ($acc,&HB("eax")); # 3
+ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3
+ &movz ($acc,&HB("ebx")); # 9
+ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9
+ &movd ("mm1","ecx"); # t[1] collected
+
+ &movz ($acc,&LB("eax")); # 2
+ &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2
+ &shr ("eax",16); # 7, 6
+ &punpckldq ("mm0","mm1"); # t[0,1] collected
+ &movz ($acc,&LB("ebx")); # 8
+ &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8
+ &shr ("ebx",16); # 13,12
+
+ &movz ($acc,&HB("eax")); # 7
+ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7
+ &pxor ("mm0","mm3");
+ &movz ("eax",&LB("eax")); # 6
+ &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6
+ &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
+ &movz ($acc,&HB("ebx")); # 13
+ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13
+ &xor ("ecx",&DWP(24,$key)); # t[2]
+ &movd ("mm4","ecx"); # t[2] collected
+ &movz ("ebx",&LB("ebx")); # 12
+ &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12
+ &shr ("ecx",16);
+ &movd ("eax","mm1"); # 5, 4, 1, 0
+ &mov ("ebx",&DWP(28,$key)); # t[3]
+ &xor ("ebx","edx");
+ &movd ("mm5","ebx"); # t[3] collected
+ &and ("ebx",0xffff0000);
+ &or ("ebx","ecx");
+
+ &punpckldq ("mm4","mm5"); # t[2,3] collected
+}
+
+######################################################################
+# "Compact" block function
+######################################################################
+
+sub enccompact()
+{ my $Fn = \&mov;
+ while ($#_>5) { pop(@_); $Fn=sub{}; }
+ my ($i,$te,@s)=@_;
+ my $tmp = $key;
+ my $out = $i==3?$s[0]:$acc;
+
+ # $Fn is used in first compact round and its purpose is to
+ # void restoration of some values from stack, so that after
+ # 4xenccompact with extra argument $key value is left there...
+ if ($i==3) { &$Fn ($key,$__key); }##%edx
+ else { &mov ($out,$s[0]); }
+ &and ($out,0xFF);
+ if ($i==1) { &shr ($s[0],16); }#%ebx[1]
+ if ($i==2) { &shr ($s[0],24); }#%ecx[2]
+ &movz ($out,&BP(-128,$te,$out,1));
+
+ if ($i==3) { $tmp=$s[1]; }##%eax
+ &movz ($tmp,&HB($s[1]));
+ &movz ($tmp,&BP(-128,$te,$tmp,1));
+ &shl ($tmp,8);
+ &xor ($out,$tmp);
+
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
+ else { &mov ($tmp,$s[2]);
+ &shr ($tmp,16); }
+ if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
+ &and ($tmp,0xFF);
+ &movz ($tmp,&BP(-128,$te,$tmp,1));
+ &shl ($tmp,16);
+ &xor ($out,$tmp);
+
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
+ elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
+ else { &mov ($tmp,$s[3]);
+ &shr ($tmp,24); }
+ &movz ($tmp,&BP(-128,$te,$tmp,1));
+ &shl ($tmp,24);
+ &xor ($out,$tmp);
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
+ if ($i==3) { &mov ($s[3],$acc); }
+ &comment();
+}
+
+sub enctransform()
+{ my @s = ($s0,$s1,$s2,$s3);
+ my $i = shift;
+ my $tmp = $tbl;
+ my $r2 = $key ;
+
+ &and ($tmp,$s[$i]);
+ &lea ($r2,&DWP(0,$s[$i],$s[$i]));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
+ &and ($r2,0xfefefefe);
+ &sub ($acc,$tmp);
+ &mov ($tmp,$s[$i]);
+ &and ($acc,0x1b1b1b1b);
+ &rotr ($tmp,16);
+ &xor ($acc,$r2); # r2
+ &mov ($r2,$s[$i]);
+
+ &xor ($s[$i],$acc); # r0 ^ r2
+ &rotr ($r2,16+8);
+ &xor ($acc,$tmp);
+ &rotl ($s[$i],24);
+ &xor ($acc,$r2);
+ &mov ($tmp,0x80808080) if ($i!=1);
+ &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
+}
+
+&function_begin_B("_x86_AES_encrypt_compact");
+ # note that caller is expected to allocate stack frame for me!
+ &mov ($__key,$key); # save key
+
+ &xor ($s0,&DWP(0,$key)); # xor with key
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
+ &lea ($acc,&DWP(-2,$acc,$acc));
+ &lea ($acc,&DWP(0,$key,$acc,8));
+ &mov ($__end,$acc); # end of key schedule
+
+ # prefetch Te4
+ &mov ($key,&DWP(0-128,$tbl));
+ &mov ($acc,&DWP(32-128,$tbl));
+ &mov ($key,&DWP(64-128,$tbl));
+ &mov ($acc,&DWP(96-128,$tbl));
+ &mov ($key,&DWP(128-128,$tbl));
+ &mov ($acc,&DWP(160-128,$tbl));
+ &mov ($key,&DWP(192-128,$tbl));
+ &mov ($acc,&DWP(224-128,$tbl));
+
+ &set_label("loop",16);
+
+ &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
+ &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
+ &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
+ &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
+ &mov ($tbl,0x80808080);
+ &enctransform(2);
+ &enctransform(3);
+ &enctransform(0);
+ &enctransform(1);
+ &mov ($key,$__key);
+ &mov ($tbl,$__tbl);
+ &add ($key,16); # advance rd_key
+ &xor ($s0,&DWP(0,$key));
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
+ &jb (&label("loop"));
+
+ &enccompact(0,$tbl,$s0,$s1,$s2,$s3);
+ &enccompact(1,$tbl,$s1,$s2,$s3,$s0);
+ &enccompact(2,$tbl,$s2,$s3,$s0,$s1);
+ &enccompact(3,$tbl,$s3,$s0,$s1,$s2);
+
+ &xor ($s0,&DWP(16,$key));
+ &xor ($s1,&DWP(20,$key));
+ &xor ($s2,&DWP(24,$key));
+ &xor ($s3,&DWP(28,$key));
+
+ &ret ();
+&function_end_B("_x86_AES_encrypt_compact");
+
+######################################################################
+# "Compact" SSE block function.
+######################################################################
+#
+# Performance is not actually extraordinary in comparison to pure
+# x86 code. In particular encrypt performance is virtually the same.
+# Decrypt performance on the other hand is 15-20% better on newer
+# µ-archs [but we're thankful for *any* improvement here], and ~50%
+# better on PIII:-) And additionally on the pros side this code
+# eliminates redundant references to stack and thus relieves/
+# minimizes the pressure on the memory bus.
+#
+# MMX register layout lsb
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+# | mm4 | mm0 |
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+# | s3 | s2 | s1 | s0 |
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
+# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+#
+# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
+# In this terms encryption and decryption "compact" permutation
+# matrices can be depicted as following:
+#
+# encryption lsb # decryption lsb
+# +----++----+----+----+----+ # +----++----+----+----+----+
+# | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
+# +----++----+----+----+----+ # +----++----+----+----+----+
+# | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
+# +----++----+----+----+----+ # +----++----+----+----+----+
+# | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
+# +----++----+----+----+----+ # +----++----+----+----+----+
+# | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
+# +----++----+----+----+----+ # +----++----+----+----+----+
+#
+######################################################################
+# Why not xmm registers? Short answer. It was actually tested and
+# was not any faster, but *contrary*, most notably on Intel CPUs.
+# Longer answer. Main advantage of using mm registers is that movd
+# latency is lower, especially on Intel P4. While arithmetic
+# instructions are twice as many, they can be scheduled every cycle
+# and not every second one when they are operating on xmm register,
+# so that "arithmetic throughput" remains virtually the same. And
+# finally the code can be executed even on elder SSE-only CPUs:-)
+
+sub sse_enccompact()
+{
+ &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
+ &pshufw ("mm5","mm4",0x0d); # 15,14,11,10
+ &movd ("eax","mm1"); # 5, 4, 1, 0
+ &movd ("ebx","mm5"); # 15,14,11,10
+ &mov ($__key,$key);
+
+ &movz ($acc,&LB("eax")); # 0
+ &movz ("edx",&HB("eax")); # 1
+ &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
+ &movz ($key,&LB("ebx")); # 10
+ &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
+ &shr ("eax",16); # 5, 4
+ &shl ("edx",8); # 1
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
+ &movz ($key,&HB("ebx")); # 11
+ &shl ($acc,16); # 10
+ &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
+ &or ("ecx",$acc); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
+ &movz ($key,&HB("eax")); # 5
+ &shl ($acc,24); # 11
+ &shr ("ebx",16); # 15,14
+ &or ("edx",$acc); # 11
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
+ &movz ($key,&HB("ebx")); # 15
+ &shl ($acc,8); # 5
+ &or ("ecx",$acc); # 5
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 15
+ &movz ($key,&LB("eax")); # 4
+ &shl ($acc,24); # 15
+ &or ("ecx",$acc); # 15
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
+ &movz ($key,&LB("ebx")); # 14
+ &movd ("eax","mm2"); # 7, 6, 3, 2
+ &movd ("mm0","ecx"); # t[0] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14
+ &movz ($key,&HB("eax")); # 3
+ &shl ("ecx",16); # 14
+ &movd ("ebx","mm6"); # 13,12, 9, 8
+ &or ("ecx",$acc); # 14
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 3
+ &movz ($key,&HB("ebx")); # 9
+ &shl ($acc,24); # 3
+ &or ("ecx",$acc); # 3
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
+ &movz ($key,&LB("ebx")); # 8
+ &shl ($acc,8); # 9
+ &shr ("ebx",16); # 13,12
+ &or ("ecx",$acc); # 9
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 8
+ &movz ($key,&LB("eax")); # 2
+ &shr ("eax",16); # 7, 6
+ &movd ("mm1","ecx"); # t[1] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2
+ &movz ($key,&HB("eax")); # 7
+ &shl ("ecx",16); # 2
+ &and ("eax",0xff); # 6
+ &or ("ecx",$acc); # 2
+
+ &punpckldq ("mm0","mm1"); # t[0,1] collected
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
+ &movz ($key,&HB("ebx")); # 13
+ &shl ($acc,24); # 7
+ &and ("ebx",0xff); # 12
+ &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
+ &or ("ecx",$acc); # 7
+ &shl ("eax",16); # 6
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
+ &or ("edx","eax"); # 6
+ &shl ($acc,8); # 13
+ &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
+ &or ("ecx",$acc); # 13
+ &or ("edx","ebx"); # 12
+ &mov ($key,$__key);
+ &movd ("mm4","ecx"); # t[2] collected
+ &movd ("mm5","edx"); # t[3] collected
+
+ &punpckldq ("mm4","mm5"); # t[2,3] collected
+}
+
+ if (!$x86only) {
+&function_begin_B("_sse_AES_encrypt_compact");
+ &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
+ &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
+
+ # note that caller is expected to allocate stack frame for me!
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
+ &lea ($acc,&DWP(-2,$acc,$acc));
+ &lea ($acc,&DWP(0,$key,$acc,8));
+ &mov ($__end,$acc); # end of key schedule
+
+ &mov ($s0,0x1b1b1b1b); # magic constant
+ &mov (&DWP(8,"esp"),$s0);
+ &mov (&DWP(12,"esp"),$s0);
+
+ # prefetch Te4
+ &mov ($s0,&DWP(0-128,$tbl));
+ &mov ($s1,&DWP(32-128,$tbl));
+ &mov ($s2,&DWP(64-128,$tbl));
+ &mov ($s3,&DWP(96-128,$tbl));
+ &mov ($s0,&DWP(128-128,$tbl));
+ &mov ($s1,&DWP(160-128,$tbl));
+ &mov ($s2,&DWP(192-128,$tbl));
+ &mov ($s3,&DWP(224-128,$tbl));
+
+ &set_label("loop",16);
+ &sse_enccompact();
+ &add ($key,16);
+ &cmp ($key,$__end);
+ &ja (&label("out"));
+
+ &movq ("mm2",&QWP(8,"esp"));
+ &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
+ &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0
+ &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4");
+ &pand ("mm3","mm2"); &pand ("mm7","mm2");
+ &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
+ &paddb ("mm0","mm0"); &paddb ("mm4","mm4");
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2
+ &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0
+ &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)
+
+ &movq ("mm2","mm3"); &movq ("mm6","mm7");
+ &pslld ("mm3",8); &pslld ("mm7",8);
+ &psrld ("mm2",24); &psrld ("mm6",24);
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24
+
+ &movq ("mm3","mm1"); &movq ("mm7","mm5");
+ &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
+ &psrld ("mm1",8); &psrld ("mm5",8);
+ &mov ($s0,&DWP(0-128,$tbl));
+ &pslld ("mm3",24); &pslld ("mm7",24);
+ &mov ($s1,&DWP(64-128,$tbl));
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8
+ &mov ($s2,&DWP(128-128,$tbl));
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24
+ &mov ($s3,&DWP(192-128,$tbl));
+
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
+ &jmp (&label("loop"));
+
+ &set_label("out",16);
+ &pxor ("mm0",&QWP(0,$key));
+ &pxor ("mm4",&QWP(8,$key));
+
+ &ret ();
+&function_end_B("_sse_AES_encrypt_compact");
+ }
+
+######################################################################
+# Vanilla block function.
+######################################################################
+
+sub encstep()
+{ my ($i,$te,@s) = @_;
+ my $tmp = $key;
+ my $out = $i==3?$s[0]:$acc;
+
+ # lines marked with #%e?x[i] denote "reordered" instructions...
+ if ($i==3) { &mov ($key,$__key); }##%edx
+ else { &mov ($out,$s[0]);
+ &and ($out,0xFF); }
+ if ($i==1) { &shr ($s[0],16); }#%ebx[1]
+ if ($i==2) { &shr ($s[0],24); }#%ecx[2]
+ &mov ($out,&DWP(0,$te,$out,8));
+
+ if ($i==3) { $tmp=$s[1]; }##%eax
+ &movz ($tmp,&HB($s[1]));
+ &xor ($out,&DWP(3,$te,$tmp,8));
+
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
+ else { &mov ($tmp,$s[2]);
+ &shr ($tmp,16); }
+ if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
+ &and ($tmp,0xFF);
+ &xor ($out,&DWP(2,$te,$tmp,8));
+
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
+ elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
+ else { &mov ($tmp,$s[3]);
+ &shr ($tmp,24) }
+ &xor ($out,&DWP(1,$te,$tmp,8));
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
+ if ($i==3) { &mov ($s[3],$acc); }
+ &comment();
+}
+
+sub enclast()
+{ my ($i,$te,@s)=@_;
+ my $tmp = $key;
+ my $out = $i==3?$s[0]:$acc;
+
+ if ($i==3) { &mov ($key,$__key); }##%edx
+ else { &mov ($out,$s[0]); }
+ &and ($out,0xFF);
+ if ($i==1) { &shr ($s[0],16); }#%ebx[1]
+ if ($i==2) { &shr ($s[0],24); }#%ecx[2]
+ &mov ($out,&DWP(2,$te,$out,8));
+ &and ($out,0x000000ff);
+
+ if ($i==3) { $tmp=$s[1]; }##%eax
+ &movz ($tmp,&HB($s[1]));
+ &mov ($tmp,&DWP(0,$te,$tmp,8));
+ &and ($tmp,0x0000ff00);
+ &xor ($out,$tmp);
+
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
+ else { &mov ($tmp,$s[2]);
+ &shr ($tmp,16); }
+ if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
+ &and ($tmp,0xFF);
+ &mov ($tmp,&DWP(0,$te,$tmp,8));
+ &and ($tmp,0x00ff0000);
+ &xor ($out,$tmp);
+
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
+ elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
+ else { &mov ($tmp,$s[3]);
+ &shr ($tmp,24); }
+ &mov ($tmp,&DWP(2,$te,$tmp,8));
+ &and ($tmp,0xff000000);
+ &xor ($out,$tmp);
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
+ if ($i==3) { &mov ($s[3],$acc); }
+}
+
+&function_begin_B("_x86_AES_encrypt");
+ if ($vertical_spin) {
+ # I need high parts of volatile registers to be accessible...
+ &exch ($s1="edi",$key="ebx");
+ &mov ($s2="esi",$acc="ecx");
+ }
+
+ # note that caller is expected to allocate stack frame for me!
+ &mov ($__key,$key); # save key
+
+ &xor ($s0,&DWP(0,$key)); # xor with key
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
+
+ if ($small_footprint) {
+ &lea ($acc,&DWP(-2,$acc,$acc));
+ &lea ($acc,&DWP(0,$key,$acc,8));
+ &mov ($__end,$acc); # end of key schedule
+
+ &set_label("loop",16);
+ if ($vertical_spin) {
+ &encvert($tbl,$s0,$s1,$s2,$s3);
+ } else {
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
+ }
+ &add ($key,16); # advance rd_key
+ &xor ($s0,&DWP(0,$key));
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
+ &jb (&label("loop"));
+ }
+ else {
+ &cmp ($acc,10);
+ &jle (&label("10rounds"));
+ &cmp ($acc,12);
+ &jle (&label("12rounds"));
+
+ &set_label("14rounds",4);
+ for ($i=1;$i<3;$i++) {
+ if ($vertical_spin) {
+ &encvert($tbl,$s0,$s1,$s2,$s3);
+ } else {
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
+ }
+ &xor ($s0,&DWP(16*$i+0,$key));
+ &xor ($s1,&DWP(16*$i+4,$key));
+ &xor ($s2,&DWP(16*$i+8,$key));
+ &xor ($s3,&DWP(16*$i+12,$key));
+ }
+ &add ($key,32);
+ &mov ($__key,$key); # advance rd_key
+ &set_label("12rounds",4);
+ for ($i=1;$i<3;$i++) {
+ if ($vertical_spin) {
+ &encvert($tbl,$s0,$s1,$s2,$s3);
+ } else {
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
+ }
+ &xor ($s0,&DWP(16*$i+0,$key));
+ &xor ($s1,&DWP(16*$i+4,$key));
+ &xor ($s2,&DWP(16*$i+8,$key));
+ &xor ($s3,&DWP(16*$i+12,$key));
+ }
+ &add ($key,32);
+ &mov ($__key,$key); # advance rd_key
+ &set_label("10rounds",4);
+ for ($i=1;$i<10;$i++) {
+ if ($vertical_spin) {
+ &encvert($tbl,$s0,$s1,$s2,$s3);
+ } else {
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
+ }
+ &xor ($s0,&DWP(16*$i+0,$key));
+ &xor ($s1,&DWP(16*$i+4,$key));
+ &xor ($s2,&DWP(16*$i+8,$key));
+ &xor ($s3,&DWP(16*$i+12,$key));
+ }
+ }
+
+ if ($vertical_spin) {
+ # "reincarnate" some registers for "horizontal" spin...
+ &mov ($s1="ebx",$key="edi");
+ &mov ($s2="ecx",$acc="esi");
+ }
+ &enclast(0,$tbl,$s0,$s1,$s2,$s3);
+ &enclast(1,$tbl,$s1,$s2,$s3,$s0);
+ &enclast(2,$tbl,$s2,$s3,$s0,$s1);
+ &enclast(3,$tbl,$s3,$s0,$s1,$s2);
+
+ &add ($key,$small_footprint?16:160);
+ &xor ($s0,&DWP(0,$key));
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &ret ();
+
+&set_label("AES_Te",64); # Yes! I keep it in the code segment!
+ &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
+ &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
+ &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
+ &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
+ &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
+ &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
+ &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
+ &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
+ &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
+ &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
+ &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
+ &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
+ &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
+ &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
+ &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
+ &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
+ &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
+ &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
+ &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
+ &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
+ &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
+ &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
+ &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
+ &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
+ &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
+ &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
+ &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
+ &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
+ &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
+ &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
+ &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
+ &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
+ &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
+ &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
+ &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
+ &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
+ &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
+ &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
+ &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
+ &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
+ &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
+ &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
+ &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
+ &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
+ &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
+ &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
+ &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
+ &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
+ &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
+ &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
+ &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
+ &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
+ &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
+ &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
+ &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
+ &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
+ &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
+ &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
+ &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
+ &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
+ &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
+ &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
+ &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
+ &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
+
+#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
+
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
+
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
+
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
+#rcon:
+ &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
+ &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
+ &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
+ &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+&function_end_B("_x86_AES_encrypt");
+
+# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
+&function_begin("AES_encrypt");
+ &mov ($acc,&wparam(0)); # load inp
+ &mov ($key,&wparam(2)); # load key
+
+ &mov ($s0,"esp");
+ &sub ("esp",36);
+ &and ("esp",-64); # align to cache-line
+
+ # place stack frame just "above" the key schedule
+ &lea ($s1,&DWP(-64-63,$key));
+ &sub ($s1,"esp");
+ &neg ($s1);
+ &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
+ &sub ("esp",$s1);
+ &add ("esp",4); # 4 is reserved for caller's return address
+ &mov ($_esp,$s0); # save stack pointer
+
+ &call (&label("pic_point")); # make it PIC!
+ &set_label("pic_point");
+ &blindpop($tbl);
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
+ &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
+
+ # pick Te4 copy which can't "overlap" with stack frame or key schedule
+ &lea ($s1,&DWP(768-4,"esp"));
+ &sub ($s1,$tbl);
+ &and ($s1,0x300);
+ &lea ($tbl,&DWP(2048+128,$tbl,$s1));
+
+ if (!$x86only) {
+ &bt (&DWP(0,$s0),25); # check for SSE bit
+ &jnc (&label("x86"));
+
+ &movq ("mm0",&QWP(0,$acc));
+ &movq ("mm4",&QWP(8,$acc));
+ &call ("_sse_AES_encrypt_compact");
+ &mov ("esp",$_esp); # restore stack pointer
+ &mov ($acc,&wparam(1)); # load out
+ &movq (&QWP(0,$acc),"mm0"); # write output data
+ &movq (&QWP(8,$acc),"mm4");
+ &emms ();
+ &function_end_A();
+ }
+ &set_label("x86",16);
+ &mov ($_tbl,$tbl);
+ &mov ($s0,&DWP(0,$acc)); # load input data
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+ &call ("_x86_AES_encrypt_compact");
+ &mov ("esp",$_esp); # restore stack pointer
+ &mov ($acc,&wparam(1)); # load out
+ &mov (&DWP(0,$acc),$s0); # write output data
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+&function_end("AES_encrypt");
+
+#--------------------------------------------------------------------#
+
+######################################################################
+# "Compact" block function
+######################################################################
+
+sub deccompact()
+{ my $Fn = \&mov;
+ while ($#_>5) { pop(@_); $Fn=sub{}; }
+ my ($i,$td,@s)=@_;
+ my $tmp = $key;
+ my $out = $i==3?$s[0]:$acc;
+
+ # $Fn is used in first compact round and its purpose is to
+ # void restoration of some values from stack, so that after
+ # 4xdeccompact with extra argument $key, $s0 and $s1 values
+ # are left there...
+ if($i==3) { &$Fn ($key,$__key); }
+ else { &mov ($out,$s[0]); }
+ &and ($out,0xFF);
+ &movz ($out,&BP(-128,$td,$out,1));
+
+ if ($i==3) { $tmp=$s[1]; }
+ &movz ($tmp,&HB($s[1]));
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
+ &shl ($tmp,8);
+ &xor ($out,$tmp);
+
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
+ else { mov ($tmp,$s[2]); }
+ &shr ($tmp,16);
+ &and ($tmp,0xFF);
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
+ &shl ($tmp,16);
+ &xor ($out,$tmp);
+
+ if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
+ else { &mov ($tmp,$s[3]); }
+ &shr ($tmp,24);
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
+ &shl ($tmp,24);
+ &xor ($out,$tmp);
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
+ if ($i==3) { &$Fn ($s[3],$__s0); }
+}
+
+# must be called with 2,3,0,1 as argument sequence!!!
+sub dectransform()
+{ my @s = ($s0,$s1,$s2,$s3);
+ my $i = shift;
+ my $tmp = $key;
+ my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1);
+ my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
+ my $tp8 = $tbl;
+
+ &mov ($tmp,0x80808080);
+ &and ($tmp,$s[$i]);
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
+ &lea ($tp2,&DWP(0,$s[$i],$s[$i]));
+ &sub ($acc,$tmp);
+ &and ($tp2,0xfefefefe);
+ &and ($acc,0x1b1b1b1b);
+ &xor ($tp2,$acc);
+ &mov ($tmp,0x80808080);
+
+ &and ($tmp,$tp2);
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
+ &lea ($tp4,&DWP(0,$tp2,$tp2));
+ &sub ($acc,$tmp);
+ &and ($tp4,0xfefefefe);
+ &and ($acc,0x1b1b1b1b);
+ &xor ($tp2,$s[$i]); # tp2^tp1
+ &xor ($tp4,$acc);
+ &mov ($tmp,0x80808080);
+
+ &and ($tmp,$tp4);
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
+ &lea ($tp8,&DWP(0,$tp4,$tp4));
+ &sub ($acc,$tmp);
+ &and ($tp8,0xfefefefe);
+ &and ($acc,0x1b1b1b1b);
+ &xor ($tp4,$s[$i]); # tp4^tp1
+ &rotl ($s[$i],8); # = ROTATE(tp1,8)
+ &xor ($tp8,$acc);
+
+ &xor ($s[$i],$tp2);
+ &xor ($tp2,$tp8);
+ &xor ($s[$i],$tp4);
+ &xor ($tp4,$tp8);
+ &rotl ($tp2,24);
+ &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
+ &rotl ($tp4,16);
+ &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
+ &rotl ($tp8,8);
+ &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
+ &mov ($s[0],$__s0) if($i==2); #prefetch $s0
+ &mov ($s[1],$__s1) if($i==3); #prefetch $s1
+ &mov ($s[2],$__s2) if($i==1);
+ &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
+
+ &mov ($s[3],$__s3) if($i==1);
+ &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
+}
+
+&function_begin_B("_x86_AES_decrypt_compact");
+ # note that caller is expected to allocate stack frame for me!
+ &mov ($__key,$key); # save key
+
+ &xor ($s0,&DWP(0,$key)); # xor with key
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
+
+ &lea ($acc,&DWP(-2,$acc,$acc));
+ &lea ($acc,&DWP(0,$key,$acc,8));
+ &mov ($__end,$acc); # end of key schedule
+
+ # prefetch Td4
+ &mov ($key,&DWP(0-128,$tbl));
+ &mov ($acc,&DWP(32-128,$tbl));
+ &mov ($key,&DWP(64-128,$tbl));
+ &mov ($acc,&DWP(96-128,$tbl));
+ &mov ($key,&DWP(128-128,$tbl));
+ &mov ($acc,&DWP(160-128,$tbl));
+ &mov ($key,&DWP(192-128,$tbl));
+ &mov ($acc,&DWP(224-128,$tbl));
+
+ &set_label("loop",16);
+
+ &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
+ &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
+ &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
+ &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
+ &dectransform(2);
+ &dectransform(3);
+ &dectransform(0);
+ &dectransform(1);
+ &mov ($key,$__key);
+ &mov ($tbl,$__tbl);
+ &add ($key,16); # advance rd_key
+ &xor ($s0,&DWP(0,$key));
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
+ &jb (&label("loop"));
+
+ &deccompact(0,$tbl,$s0,$s3,$s2,$s1);
+ &deccompact(1,$tbl,$s1,$s0,$s3,$s2);
+ &deccompact(2,$tbl,$s2,$s1,$s0,$s3);
+ &deccompact(3,$tbl,$s3,$s2,$s1,$s0);
+
+ &xor ($s0,&DWP(16,$key));
+ &xor ($s1,&DWP(20,$key));
+ &xor ($s2,&DWP(24,$key));
+ &xor ($s3,&DWP(28,$key));
+
+ &ret ();
+&function_end_B("_x86_AES_decrypt_compact");
+
+######################################################################
+# "Compact" SSE block function.
+######################################################################
+
+sub sse_deccompact()
+{
+ &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
+ &pshufw ("mm5","mm4",0x09); # 13,12,11,10
+ &movd ("eax","mm1"); # 7, 6, 1, 0
+ &movd ("ebx","mm5"); # 13,12,11,10
+ &mov ($__key,$key);
+
+ &movz ($acc,&LB("eax")); # 0
+ &movz ("edx",&HB("eax")); # 1
+ &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
+ &movz ($key,&LB("ebx")); # 10
+ &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
+ &shr ("eax",16); # 7, 6
+ &shl ("edx",8); # 1
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
+ &movz ($key,&HB("ebx")); # 11
+ &shl ($acc,16); # 10
+ &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
+ &or ("ecx",$acc); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
+ &movz ($key,&HB("eax")); # 7
+ &shl ($acc,24); # 11
+ &shr ("ebx",16); # 13,12
+ &or ("edx",$acc); # 11
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
+ &movz ($key,&HB("ebx")); # 13
+ &shl ($acc,24); # 7
+ &or ("ecx",$acc); # 7
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
+ &movz ($key,&LB("eax")); # 6
+ &shl ($acc,8); # 13
+ &movd ("eax","mm2"); # 3, 2, 5, 4
+ &or ("ecx",$acc); # 13
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 6
+ &movz ($key,&LB("ebx")); # 12
+ &shl ($acc,16); # 6
+ &movd ("ebx","mm6"); # 9, 8,15,14
+ &movd ("mm0","ecx"); # t[0] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12
+ &movz ($key,&LB("eax")); # 4
+ &or ("ecx",$acc); # 12
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
+ &movz ($key,&LB("ebx")); # 14
+ &or ("edx",$acc); # 4
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 14
+ &movz ($key,&HB("eax")); # 5
+ &shl ($acc,16); # 14
+ &shr ("eax",16); # 3, 2
+ &or ("edx",$acc); # 14
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
+ &movz ($key,&HB("ebx")); # 15
+ &shr ("ebx",16); # 9, 8
+ &shl ($acc,8); # 5
+ &movd ("mm1","edx"); # t[1] collected
+ &movz ("edx",&BP(-128,$tbl,$key,1)); # 15
+ &movz ($key,&HB("ebx")); # 9
+ &shl ("edx",24); # 15
+ &and ("ebx",0xff); # 8
+ &or ("edx",$acc); # 15
+
+ &punpckldq ("mm0","mm1"); # t[0,1] collected
+
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
+ &movz ($key,&LB("eax")); # 2
+ &shl ($acc,8); # 9
+ &movz ("eax",&HB("eax")); # 3
+ &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
+ &or ("ecx",$acc); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 2
+ &or ("edx","ebx"); # 8
+ &shl ($acc,16); # 2
+ &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
+ &or ("edx",$acc); # 2
+ &shl ("eax",24); # 3
+ &or ("ecx","eax"); # 3
+ &mov ($key,$__key);
+ &movd ("mm4","edx"); # t[2] collected
+ &movd ("mm5","ecx"); # t[3] collected
+
+ &punpckldq ("mm4","mm5"); # t[2,3] collected
+}
+
+ if (!$x86only) {
+&function_begin_B("_sse_AES_decrypt_compact");
+ &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
+ &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
+
+ # note that caller is expected to allocate stack frame for me!
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
+ &lea ($acc,&DWP(-2,$acc,$acc));
+ &lea ($acc,&DWP(0,$key,$acc,8));
+ &mov ($__end,$acc); # end of key schedule
+
+ &mov ($s0,0x1b1b1b1b); # magic constant
+ &mov (&DWP(8,"esp"),$s0);
+ &mov (&DWP(12,"esp"),$s0);
+
+ # prefetch Td4
+ &mov ($s0,&DWP(0-128,$tbl));
+ &mov ($s1,&DWP(32-128,$tbl));
+ &mov ($s2,&DWP(64-128,$tbl));
+ &mov ($s3,&DWP(96-128,$tbl));
+ &mov ($s0,&DWP(128-128,$tbl));
+ &mov ($s1,&DWP(160-128,$tbl));
+ &mov ($s2,&DWP(192-128,$tbl));
+ &mov ($s3,&DWP(224-128,$tbl));
+
+ &set_label("loop",16);
+ &sse_deccompact();
+ &add ($key,16);
+ &cmp ($key,$__end);
+ &ja (&label("out"));
+
+ # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
+ &movq ("mm3","mm0"); &movq ("mm7","mm4");
+ &movq ("mm2","mm0",1); &movq ("mm6","mm4",1);
+ &movq ("mm1","mm0"); &movq ("mm5","mm4");
+ &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
+ &pslld ("mm2",8); &pslld ("mm6",8);
+ &psrld ("mm3",8); &psrld ("mm7",8);
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
+ &pslld ("mm2",16); &pslld ("mm6",16);
+ &psrld ("mm3",16); &psrld ("mm7",16);
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24
+
+ &movq ("mm3",&QWP(8,"esp"));
+ &pxor ("mm2","mm2"); &pxor ("mm6","mm6");
+ &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5");
+ &pand ("mm2","mm3"); &pand ("mm6","mm3");
+ &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
+ &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
+ &movq ("mm3","mm1"); &movq ("mm7","mm5");
+ &movq ("mm2","mm1"); &movq ("mm6","mm5");
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
+ &pslld ("mm3",24); &pslld ("mm7",24);
+ &psrld ("mm2",8); &psrld ("mm6",8);
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8
+
+ &movq ("mm2",&QWP(8,"esp"));
+ &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
+ &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
+ &pand ("mm3","mm2"); &pand ("mm7","mm2");
+ &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
+ &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
+ &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
+
+ &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
+ &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
+ &pand ("mm3","mm2"); &pand ("mm7","mm2");
+ &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
+ &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
+ &movq ("mm3","mm1"); &movq ("mm7","mm5");
+ &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
+ &pslld ("mm1",8); &pslld ("mm5",8);
+ &psrld ("mm3",8); &psrld ("mm7",8);
+ &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
+ &mov ($s0,&DWP(0-128,$tbl));
+ &pslld ("mm1",16); &pslld ("mm5",16);
+ &mov ($s1,&DWP(64-128,$tbl));
+ &psrld ("mm3",16); &psrld ("mm7",16);
+ &mov ($s2,&DWP(128-128,$tbl));
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
+ &mov ($s3,&DWP(192-128,$tbl));
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
+
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
+ &jmp (&label("loop"));
+
+ &set_label("out",16);
+ &pxor ("mm0",&QWP(0,$key));
+ &pxor ("mm4",&QWP(8,$key));
+
+ &ret ();
+&function_end_B("_sse_AES_decrypt_compact");
+ }
+
+######################################################################
+# Vanilla block function.
+######################################################################
+
+sub decstep()
+{ my ($i,$td,@s) = @_;
+ my $tmp = $key;
+ my $out = $i==3?$s[0]:$acc;
+
+ # no instructions are reordered, as performance appears
+ # optimal... or rather that all attempts to reorder didn't
+ # result in better performance [which by the way is not a
+ # bit lower than ecryption].
+ if($i==3) { &mov ($key,$__key); }
+ else { &mov ($out,$s[0]); }
+ &and ($out,0xFF);
+ &mov ($out,&DWP(0,$td,$out,8));
+
+ if ($i==3) { $tmp=$s[1]; }
+ &movz ($tmp,&HB($s[1]));
+ &xor ($out,&DWP(3,$td,$tmp,8));
+
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
+ else { &mov ($tmp,$s[2]); }
+ &shr ($tmp,16);
+ &and ($tmp,0xFF);
+ &xor ($out,&DWP(2,$td,$tmp,8));
+
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
+ else { &mov ($tmp,$s[3]); }
+ &shr ($tmp,24);
+ &xor ($out,&DWP(1,$td,$tmp,8));
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
+ if ($i==3) { &mov ($s[3],$__s0); }
+ &comment();
+}
+
+sub declast()
+{ my ($i,$td,@s)=@_;
+ my $tmp = $key;
+ my $out = $i==3?$s[0]:$acc;
+
+ if($i==0) { &lea ($td,&DWP(2048+128,$td));
+ &mov ($tmp,&DWP(0-128,$td));
+ &mov ($acc,&DWP(32-128,$td));
+ &mov ($tmp,&DWP(64-128,$td));
+ &mov ($acc,&DWP(96-128,$td));
+ &mov ($tmp,&DWP(128-128,$td));
+ &mov ($acc,&DWP(160-128,$td));
+ &mov ($tmp,&DWP(192-128,$td));
+ &mov ($acc,&DWP(224-128,$td));
+ &lea ($td,&DWP(-128,$td)); }
+ if($i==3) { &mov ($key,$__key); }
+ else { &mov ($out,$s[0]); }
+ &and ($out,0xFF);
+ &movz ($out,&BP(0,$td,$out,1));
+
+ if ($i==3) { $tmp=$s[1]; }
+ &movz ($tmp,&HB($s[1]));
+ &movz ($tmp,&BP(0,$td,$tmp,1));
+ &shl ($tmp,8);
+ &xor ($out,$tmp);
+
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
+ else { mov ($tmp,$s[2]); }
+ &shr ($tmp,16);
+ &and ($tmp,0xFF);
+ &movz ($tmp,&BP(0,$td,$tmp,1));
+ &shl ($tmp,16);
+ &xor ($out,$tmp);
+
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
+ else { &mov ($tmp,$s[3]); }
+ &shr ($tmp,24);
+ &movz ($tmp,&BP(0,$td,$tmp,1));
+ &shl ($tmp,24);
+ &xor ($out,$tmp);
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
+ if ($i==3) { &mov ($s[3],$__s0);
+ &lea ($td,&DWP(-2048,$td)); }
+}
+
+&function_begin_B("_x86_AES_decrypt");
+ # note that caller is expected to allocate stack frame for me!
+ &mov ($__key,$key); # save key
+
+ &xor ($s0,&DWP(0,$key)); # xor with key
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
+
+ if ($small_footprint) {
+ &lea ($acc,&DWP(-2,$acc,$acc));
+ &lea ($acc,&DWP(0,$key,$acc,8));
+ &mov ($__end,$acc); # end of key schedule
+ &set_label("loop",16);
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
+ &add ($key,16); # advance rd_key
+ &xor ($s0,&DWP(0,$key));
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
+ &jb (&label("loop"));
+ }
+ else {
+ &cmp ($acc,10);
+ &jle (&label("10rounds"));
+ &cmp ($acc,12);
+ &jle (&label("12rounds"));
+
+ &set_label("14rounds",4);
+ for ($i=1;$i<3;$i++) {
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
+ &xor ($s0,&DWP(16*$i+0,$key));
+ &xor ($s1,&DWP(16*$i+4,$key));
+ &xor ($s2,&DWP(16*$i+8,$key));
+ &xor ($s3,&DWP(16*$i+12,$key));
+ }
+ &add ($key,32);
+ &mov ($__key,$key); # advance rd_key
+ &set_label("12rounds",4);
+ for ($i=1;$i<3;$i++) {
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
+ &xor ($s0,&DWP(16*$i+0,$key));
+ &xor ($s1,&DWP(16*$i+4,$key));
+ &xor ($s2,&DWP(16*$i+8,$key));
+ &xor ($s3,&DWP(16*$i+12,$key));
+ }
+ &add ($key,32);
+ &mov ($__key,$key); # advance rd_key
+ &set_label("10rounds",4);
+ for ($i=1;$i<10;$i++) {
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
+ &xor ($s0,&DWP(16*$i+0,$key));
+ &xor ($s1,&DWP(16*$i+4,$key));
+ &xor ($s2,&DWP(16*$i+8,$key));
+ &xor ($s3,&DWP(16*$i+12,$key));
+ }
+ }
+
+ &declast(0,$tbl,$s0,$s3,$s2,$s1);
+ &declast(1,$tbl,$s1,$s0,$s3,$s2);
+ &declast(2,$tbl,$s2,$s1,$s0,$s3);
+ &declast(3,$tbl,$s3,$s2,$s1,$s0);
+
+ &add ($key,$small_footprint?16:160);
+ &xor ($s0,&DWP(0,$key));
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &ret ();
+
+&set_label("AES_Td",64); # Yes! I keep it in the code segment!
+ &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
+ &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
+ &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
+ &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
+ &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
+ &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
+ &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
+ &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
+ &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
+ &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
+ &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
+ &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
+ &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
+ &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
+ &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
+ &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
+ &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
+ &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
+ &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
+ &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
+ &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
+ &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
+ &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
+ &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
+ &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
+ &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
+ &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
+ &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
+ &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
+ &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
+ &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
+ &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
+ &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
+ &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
+ &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
+ &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
+ &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
+ &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
+ &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
+ &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
+ &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
+ &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
+ &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
+ &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
+ &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
+ &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
+ &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
+ &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
+ &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
+ &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
+ &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
+ &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
+ &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
+ &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
+ &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
+ &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
+ &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
+ &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
+ &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
+ &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
+ &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
+ &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
+ &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
+ &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
+
+#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
+
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
+
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
+
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
+&function_end_B("_x86_AES_decrypt");
+
+# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
+&function_begin("AES_decrypt");
+ &mov ($acc,&wparam(0)); # load inp
+ &mov ($key,&wparam(2)); # load key
+
+ &mov ($s0,"esp");
+ &sub ("esp",36);
+ &and ("esp",-64); # align to cache-line
+
+ # place stack frame just "above" the key schedule
+ &lea ($s1,&DWP(-64-63,$key));
+ &sub ($s1,"esp");
+ &neg ($s1);
+ &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
+ &sub ("esp",$s1);
+ &add ("esp",4); # 4 is reserved for caller's return address
+ &mov ($_esp,$s0); # save stack pointer
+
+ &call (&label("pic_point")); # make it PIC!
+ &set_label("pic_point");
+ &blindpop($tbl);
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
+ &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
+
+ # pick Td4 copy which can't "overlap" with stack frame or key schedule
+ &lea ($s1,&DWP(768-4,"esp"));
+ &sub ($s1,$tbl);
+ &and ($s1,0x300);
+ &lea ($tbl,&DWP(2048+128,$tbl,$s1));
+
+ if (!$x86only) {
+ &bt (&DWP(0,$s0),25); # check for SSE bit
+ &jnc (&label("x86"));
+
+ &movq ("mm0",&QWP(0,$acc));
+ &movq ("mm4",&QWP(8,$acc));
+ &call ("_sse_AES_decrypt_compact");
+ &mov ("esp",$_esp); # restore stack pointer
+ &mov ($acc,&wparam(1)); # load out
+ &movq (&QWP(0,$acc),"mm0"); # write output data
+ &movq (&QWP(8,$acc),"mm4");
+ &emms ();
+ &function_end_A();
+ }
+ &set_label("x86",16);
+ &mov ($_tbl,$tbl);
+ &mov ($s0,&DWP(0,$acc)); # load input data
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+ &call ("_x86_AES_decrypt_compact");
+ &mov ("esp",$_esp); # restore stack pointer
+ &mov ($acc,&wparam(1)); # load out
+ &mov (&DWP(0,$acc),$s0); # write output data
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+&function_end("AES_decrypt");
+
+# void AES_cbc_encrypt (const void char *inp, unsigned char *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+{
+# stack frame layout
+# -4(%esp) # return address 0(%esp)
+# 0(%esp) # s0 backing store 4(%esp)
+# 4(%esp) # s1 backing store 8(%esp)
+# 8(%esp) # s2 backing store 12(%esp)
+# 12(%esp) # s3 backing store 16(%esp)
+# 16(%esp) # key backup 20(%esp)
+# 20(%esp) # end of key schedule 24(%esp)
+# 24(%esp) # %ebp backup 28(%esp)
+# 28(%esp) # %esp backup
+my $_inp=&DWP(32,"esp"); # copy of wparam(0)
+my $_out=&DWP(36,"esp"); # copy of wparam(1)
+my $_len=&DWP(40,"esp"); # copy of wparam(2)
+my $_key=&DWP(44,"esp"); # copy of wparam(3)
+my $_ivp=&DWP(48,"esp"); # copy of wparam(4)
+my $_tmp=&DWP(52,"esp"); # volatile variable
+#
+my $ivec=&DWP(60,"esp"); # ivec[16]
+my $aes_key=&DWP(76,"esp"); # copy of aes_key
+my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
+
+&function_begin("AES_cbc_encrypt");
+ &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
+ &cmp ($s2,0);
+ &je (&label("drop_out"));
+
+ &call (&label("pic_point")); # make it PIC!
+ &set_label("pic_point");
+ &blindpop($tbl);
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
+
+ &cmp (&wparam(5),0);
+ &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
+ &jne (&label("picked_te"));
+ &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
+ &set_label("picked_te");
+
+ # one can argue if this is required
+ &pushf ();
+ &cld ();
+
+ &cmp ($s2,$speed_limit);
+ &jb (&label("slow_way"));
+ &test ($s2,15);
+ &jnz (&label("slow_way"));
+ if (!$x86only) {
+ &bt (&DWP(0,$s0),28); # check for hyper-threading bit
+ &jc (&label("slow_way"));
+ }
+ # pre-allocate aligned stack frame...
+ &lea ($acc,&DWP(-80-244,"esp"));
+ &and ($acc,-64);
+
+ # ... and make sure it doesn't alias with $tbl modulo 4096
+ &mov ($s0,$tbl);
+ &lea ($s1,&DWP(2048+256,$tbl));
+ &mov ($s3,$acc);
+ &and ($s0,0xfff); # s = %ebp&0xfff
+ &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
+ &and ($s3,0xfff); # p = %esp&0xfff
+
+ &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e);
+ &jb (&label("tbl_break_out"));
+ &sub ($s3,$s1);
+ &sub ($acc,$s3);
+ &jmp (&label("tbl_ok"));
+ &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;
+ &sub ($s3,$s0);
+ &and ($s3,0xfff);
+ &add ($s3,384);
+ &sub ($acc,$s3);
+ &set_label("tbl_ok",4);
+
+ &lea ($s3,&wparam(0)); # obtain pointer to parameter block
+ &exch ("esp",$acc); # allocate stack frame
+ &add ("esp",4); # reserve for return address!
+ &mov ($_tbl,$tbl); # save %ebp
+ &mov ($_esp,$acc); # save %esp
+
+ &mov ($s0,&DWP(0,$s3)); # load inp
+ &mov ($s1,&DWP(4,$s3)); # load out
+ #&mov ($s2,&DWP(8,$s3)); # load len
+ &mov ($key,&DWP(12,$s3)); # load key
+ &mov ($acc,&DWP(16,$s3)); # load ivp
+ &mov ($s3,&DWP(20,$s3)); # load enc flag
+
+ &mov ($_inp,$s0); # save copy of inp
+ &mov ($_out,$s1); # save copy of out
+ &mov ($_len,$s2); # save copy of len
+ &mov ($_key,$key); # save copy of key
+ &mov ($_ivp,$acc); # save copy of ivp
+
+ &mov ($mark,0); # copy of aes_key->rounds = 0;
+ # do we copy key schedule to stack?
+ &mov ($s1 eq "ebx" ? $s1 : "",$key);
+ &mov ($s2 eq "ecx" ? $s2 : "",244/4);
+ &sub ($s1,$tbl);
+ &mov ("esi",$key);
+ &and ($s1,0xfff);
+ &lea ("edi",$aes_key);
+ &cmp ($s1,2048+256);
+ &jb (&label("do_copy"));
+ &cmp ($s1,4096-244);
+ &jb (&label("skip_copy"));
+ &set_label("do_copy",4);
+ &mov ($_key,"edi");
+ &data_word(0xA5F3F689); # rep movsd
+ &set_label("skip_copy");
+
+ &mov ($key,16);
+ &set_label("prefetch_tbl",4);
+ &mov ($s0,&DWP(0,$tbl));
+ &mov ($s1,&DWP(32,$tbl));
+ &mov ($s2,&DWP(64,$tbl));
+ &mov ($acc,&DWP(96,$tbl));
+ &lea ($tbl,&DWP(128,$tbl));
+ &sub ($key,1);
+ &jnz (&label("prefetch_tbl"));
+ &sub ($tbl,2048);
+
+ &mov ($acc,$_inp);
+ &mov ($key,$_ivp);
+
+ &cmp ($s3,0);
+ &je (&label("fast_decrypt"));
+
+#----------------------------- ENCRYPT -----------------------------#
+ &mov ($s0,&DWP(0,$key)); # load iv
+ &mov ($s1,&DWP(4,$key));
+
+ &set_label("fast_enc_loop",16);
+ &mov ($s2,&DWP(8,$key));
+ &mov ($s3,&DWP(12,$key));
+
+ &xor ($s0,&DWP(0,$acc)); # xor input data
+ &xor ($s1,&DWP(4,$acc));
+ &xor ($s2,&DWP(8,$acc));
+ &xor ($s3,&DWP(12,$acc));
+
+ &mov ($key,$_key); # load key
+ &call ("_x86_AES_encrypt");
+
+ &mov ($acc,$_inp); # load inp
+ &mov ($key,$_out); # load out
+
+ &mov (&DWP(0,$key),$s0); # save output data
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($s2,$_len); # load len
+ &mov ($_inp,$acc); # save inp
+ &lea ($s3,&DWP(16,$key)); # advance out
+ &mov ($_out,$s3); # save out
+ &sub ($s2,16); # decrease len
+ &mov ($_len,$s2); # save len
+ &jnz (&label("fast_enc_loop"));
+ &mov ($acc,$_ivp); # load ivp
+ &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
+ &mov ($s3,&DWP(12,$key));
+ &mov (&DWP(0,$acc),$s0); # save ivec
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+
+ &cmp ($mark,0); # was the key schedule copied?
+ &mov ("edi",$_key);
+ &je (&label("skip_ezero"));
+ # zero copy of key schedule
+ &mov ("ecx",240/4);
+ &xor ("eax","eax");
+ &align (4);
+ &data_word(0xABF3F689); # rep stosd
+ &set_label("skip_ezero");
+ &mov ("esp",$_esp);
+ &popf ();
+ &set_label("drop_out");
+ &function_end_A();
+ &pushf (); # kludge, never executed
+
+#----------------------------- DECRYPT -----------------------------#
+&set_label("fast_decrypt",16);
+
+ &cmp ($acc,$_out);
+ &je (&label("fast_dec_in_place")); # in-place processing...
+
+ &mov ($_tmp,$key);
+
+ &align (4);
+ &set_label("fast_dec_loop",16);
+ &mov ($s0,&DWP(0,$acc)); # read input
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &mov ($key,$_key); # load key
+ &call ("_x86_AES_decrypt");
+
+ &mov ($key,$_tmp); # load ivp
+ &mov ($acc,$_len); # load len
+ &xor ($s0,&DWP(0,$key)); # xor iv
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &mov ($key,$_out); # load out
+ &mov ($acc,$_inp); # load inp
+
+ &mov (&DWP(0,$key),$s0); # write output
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ($s2,$_len); # load len
+ &mov ($_tmp,$acc); # save ivp
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ &lea ($key,&DWP(16,$key)); # advance out
+ &mov ($_out,$key); # save out
+ &sub ($s2,16); # decrease len
+ &mov ($_len,$s2); # save len
+ &jnz (&label("fast_dec_loop"));
+ &mov ($key,$_tmp); # load temp ivp
+ &mov ($acc,$_ivp); # load user ivp
+ &mov ($s0,&DWP(0,$key)); # load iv
+ &mov ($s1,&DWP(4,$key));
+ &mov ($s2,&DWP(8,$key));
+ &mov ($s3,&DWP(12,$key));
+ &mov (&DWP(0,$acc),$s0); # copy back to user
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+ &jmp (&label("fast_dec_out"));
+
+ &set_label("fast_dec_in_place",16);
+ &set_label("fast_dec_in_place_loop");
+ &mov ($s0,&DWP(0,$acc)); # read input
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &lea ($key,$ivec);
+ &mov (&DWP(0,$key),$s0); # copy to temp
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ($key,$_key); # load key
+ &call ("_x86_AES_decrypt");
+
+ &mov ($key,$_ivp); # load ivp
+ &mov ($acc,$_out); # load out
+ &xor ($s0,&DWP(0,$key)); # xor iv
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &mov (&DWP(0,$acc),$s0); # write output
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+
+ &lea ($acc,&DWP(16,$acc)); # advance out
+ &mov ($_out,$acc); # save out
+
+ &lea ($acc,$ivec);
+ &mov ($s0,&DWP(0,$acc)); # read temp
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &mov (&DWP(0,$key),$s0); # copy iv
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ($acc,$_inp); # load inp
+ &mov ($s2,$_len); # load len
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ &sub ($s2,16); # decrease len
+ &mov ($_len,$s2); # save len
+ &jnz (&label("fast_dec_in_place_loop"));
+
+ &set_label("fast_dec_out",4);
+ &cmp ($mark,0); # was the key schedule copied?
+ &mov ("edi",$_key);
+ &je (&label("skip_dzero"));
+ # zero copy of key schedule
+ &mov ("ecx",240/4);
+ &xor ("eax","eax");
+ &align (4);
+ &data_word(0xABF3F689); # rep stosd
+ &set_label("skip_dzero");
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+
+#--------------------------- SLOW ROUTINE ---------------------------#
+&set_label("slow_way",16);
+
+ &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
+ &mov ($key,&wparam(3)); # load key
+
+ # pre-allocate aligned stack frame...
+ &lea ($acc,&DWP(-80,"esp"));
+ &and ($acc,-64);
+
+ # ... and make sure it doesn't alias with $key modulo 1024
+ &lea ($s1,&DWP(-80-63,$key));
+ &sub ($s1,$acc);
+ &neg ($s1);
+ &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
+ &sub ($acc,$s1);
+
+ # pick S-box copy which can't overlap with stack frame or $key
+ &lea ($s1,&DWP(768,$acc));
+ &sub ($s1,$tbl);
+ &and ($s1,0x300);
+ &lea ($tbl,&DWP(2048+128,$tbl,$s1));
+
+ &lea ($s3,&wparam(0)); # pointer to parameter block
+
+ &exch ("esp",$acc);
+ &add ("esp",4); # reserve for return address!
+ &mov ($_tbl,$tbl); # save %ebp
+ &mov ($_esp,$acc); # save %esp
+ &mov ($_tmp,$s0); # save OPENSSL_ia32cap
+
+ &mov ($s0,&DWP(0,$s3)); # load inp
+ &mov ($s1,&DWP(4,$s3)); # load out
+ #&mov ($s2,&DWP(8,$s3)); # load len
+ #&mov ($key,&DWP(12,$s3)); # load key
+ &mov ($acc,&DWP(16,$s3)); # load ivp
+ &mov ($s3,&DWP(20,$s3)); # load enc flag
+
+ &mov ($_inp,$s0); # save copy of inp
+ &mov ($_out,$s1); # save copy of out
+ &mov ($_len,$s2); # save copy of len
+ &mov ($_key,$key); # save copy of key
+ &mov ($_ivp,$acc); # save copy of ivp
+
+ &mov ($key,$acc);
+ &mov ($acc,$s0);
+
+ &cmp ($s3,0);
+ &je (&label("slow_decrypt"));
+
+#--------------------------- SLOW ENCRYPT ---------------------------#
+ &cmp ($s2,16);
+ &mov ($s3,$s1);
+ &jb (&label("slow_enc_tail"));
+
+ if (!$x86only) {
+ &bt ($_tmp,25); # check for SSE bit
+ &jnc (&label("slow_enc_x86"));
+
+ &movq ("mm0",&QWP(0,$key)); # load iv
+ &movq ("mm4",&QWP(8,$key));
+
+ &set_label("slow_enc_loop_sse",16);
+ &pxor ("mm0",&QWP(0,$acc)); # xor input data
+ &pxor ("mm4",&QWP(8,$acc));
+
+ &mov ($key,$_key);
+ &call ("_sse_AES_encrypt_compact");
+
+ &mov ($acc,$_inp); # load inp
+ &mov ($key,$_out); # load out
+ &mov ($s2,$_len); # load len
+
+ &movq (&QWP(0,$key),"mm0"); # save output data
+ &movq (&QWP(8,$key),"mm4");
+
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ &lea ($s3,&DWP(16,$key)); # advance out
+ &mov ($_out,$s3); # save out
+ &sub ($s2,16); # decrease len
+ &cmp ($s2,16);
+ &mov ($_len,$s2); # save len
+ &jae (&label("slow_enc_loop_sse"));
+ &test ($s2,15);
+ &jnz (&label("slow_enc_tail"));
+ &mov ($acc,$_ivp); # load ivp
+ &movq (&QWP(0,$acc),"mm0"); # save ivec
+ &movq (&QWP(8,$acc),"mm4");
+ &emms ();
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+ }
+ &set_label("slow_enc_x86",16);
+ &mov ($s0,&DWP(0,$key)); # load iv
+ &mov ($s1,&DWP(4,$key));
+
+ &set_label("slow_enc_loop_x86",4);
+ &mov ($s2,&DWP(8,$key));
+ &mov ($s3,&DWP(12,$key));
+
+ &xor ($s0,&DWP(0,$acc)); # xor input data
+ &xor ($s1,&DWP(4,$acc));
+ &xor ($s2,&DWP(8,$acc));
+ &xor ($s3,&DWP(12,$acc));
+
+ &mov ($key,$_key); # load key
+ &call ("_x86_AES_encrypt_compact");
+
+ &mov ($acc,$_inp); # load inp
+ &mov ($key,$_out); # load out
+
+ &mov (&DWP(0,$key),$s0); # save output data
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ($s2,$_len); # load len
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ &lea ($s3,&DWP(16,$key)); # advance out
+ &mov ($_out,$s3); # save out
+ &sub ($s2,16); # decrease len
+ &cmp ($s2,16);
+ &mov ($_len,$s2); # save len
+ &jae (&label("slow_enc_loop_x86"));
+ &test ($s2,15);
+ &jnz (&label("slow_enc_tail"));
+ &mov ($acc,$_ivp); # load ivp
+ &mov ($s2,&DWP(8,$key)); # restore last dwords
+ &mov ($s3,&DWP(12,$key));
+ &mov (&DWP(0,$acc),$s0); # save ivec
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+
+ &set_label("slow_enc_tail",16);
+ &emms () if (!$x86only);
+ &mov ($key eq "edi"? $key:"",$s3); # load out to edi
+ &mov ($s1,16);
+ &sub ($s1,$s2);
+ &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp
+ &je (&label("enc_in_place"));
+ &align (4);
+ &data_word(0xA4F3F689); # rep movsb # copy input
+ &jmp (&label("enc_skip_in_place"));
+ &set_label("enc_in_place");
+ &lea ($key,&DWP(0,$key,$s2));
+ &set_label("enc_skip_in_place");
+ &mov ($s2,$s1);
+ &xor ($s0,$s0);
+ &align (4);
+ &data_word(0xAAF3F689); # rep stosb # zero tail
+
+ &mov ($key,$_ivp); # restore ivp
+ &mov ($acc,$s3); # output as input
+ &mov ($s0,&DWP(0,$key));
+ &mov ($s1,&DWP(4,$key));
+ &mov ($_len,16); # len=16
+ &jmp (&label("slow_enc_loop_x86")); # one more spin...
+
+#--------------------------- SLOW DECRYPT ---------------------------#
+&set_label("slow_decrypt",16);
+ if (!$x86only) {
+ &bt ($_tmp,25); # check for SSE bit
+ &jnc (&label("slow_dec_loop_x86"));
+
+ &set_label("slow_dec_loop_sse",4);
+ &movq ("mm0",&QWP(0,$acc)); # read input
+ &movq ("mm4",&QWP(8,$acc));
+
+ &mov ($key,$_key);
+ &call ("_sse_AES_decrypt_compact");
+
+ &mov ($acc,$_inp); # load inp
+ &lea ($s0,$ivec);
+ &mov ($s1,$_out); # load out
+ &mov ($s2,$_len); # load len
+ &mov ($key,$_ivp); # load ivp
+
+ &movq ("mm1",&QWP(0,$acc)); # re-read input
+ &movq ("mm5",&QWP(8,$acc));
+
+ &pxor ("mm0",&QWP(0,$key)); # xor iv
+ &pxor ("mm4",&QWP(8,$key));
+
+ &movq (&QWP(0,$key),"mm1"); # copy input to iv
+ &movq (&QWP(8,$key),"mm5");
+
+ &sub ($s2,16); # decrease len
+ &jc (&label("slow_dec_partial_sse"));
+
+ &movq (&QWP(0,$s1),"mm0"); # write output
+ &movq (&QWP(8,$s1),"mm4");
+
+ &lea ($s1,&DWP(16,$s1)); # advance out
+ &mov ($_out,$s1); # save out
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ &mov ($_len,$s2); # save len
+ &jnz (&label("slow_dec_loop_sse"));
+ &emms ();
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+
+ &set_label("slow_dec_partial_sse",16);
+ &movq (&QWP(0,$s0),"mm0"); # save output to temp
+ &movq (&QWP(8,$s0),"mm4");
+ &emms ();
+
+ &add ($s2 eq "ecx" ? "ecx":"",16);
+ &mov ("edi",$s1); # out
+ &mov ("esi",$s0); # temp
+ &align (4);
+ &data_word(0xA4F3F689); # rep movsb # copy partial output
+
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+ }
+ &set_label("slow_dec_loop_x86",16);
+ &mov ($s0,&DWP(0,$acc)); # read input
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &lea ($key,$ivec);
+ &mov (&DWP(0,$key),$s0); # copy to temp
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ($key,$_key); # load key
+ &call ("_x86_AES_decrypt_compact");
+
+ &mov ($key,$_ivp); # load ivp
+ &mov ($acc,$_len); # load len
+ &xor ($s0,&DWP(0,$key)); # xor iv
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &sub ($acc,16);
+ &jc (&label("slow_dec_partial_x86"));
+
+ &mov ($_len,$acc); # save len
+ &mov ($acc,$_out); # load out
+
+ &mov (&DWP(0,$acc),$s0); # write output
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+
+ &lea ($acc,&DWP(16,$acc)); # advance out
+ &mov ($_out,$acc); # save out
+
+ &lea ($acc,$ivec);
+ &mov ($s0,&DWP(0,$acc)); # read temp
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &mov (&DWP(0,$key),$s0); # copy it to iv
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ($acc,$_inp); # load inp
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ &jnz (&label("slow_dec_loop_x86"));
+ &mov ("esp",$_esp);
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+
+ &set_label("slow_dec_partial_x86",16);
+ &lea ($acc,$ivec);
+ &mov (&DWP(0,$acc),$s0); # save output to temp
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+
+ &mov ($acc,$_inp);
+ &mov ($s0,&DWP(0,$acc)); # re-read input
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &mov (&DWP(0,$key),$s0); # copy it to iv
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ("ecx",$_len);
+ &mov ("edi",$_out);
+ &lea ("esi",$ivec);
+ &align (4);
+ &data_word(0xA4F3F689); # rep movsb # copy partial output
+
+ &mov ("esp",$_esp);
+ &popf ();
+&function_end("AES_cbc_encrypt");
+}
+
+#------------------------------------------------------------------#
+
+sub enckey()
+{
+ &movz ("esi",&LB("edx")); # rk[i]>>0
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
+ &movz ("esi",&HB("edx")); # rk[i]>>8
+ &shl ("ebx",24);
+ &xor ("eax","ebx");
+
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
+ &shr ("edx",16);
+ &movz ("esi",&LB("edx")); # rk[i]>>16
+ &xor ("eax","ebx");
+
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
+ &movz ("esi",&HB("edx")); # rk[i]>>24
+ &shl ("ebx",8);
+ &xor ("eax","ebx");
+
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
+ &shl ("ebx",16);
+ &xor ("eax","ebx");
+
+ &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon
+}
+
+&function_begin("_x86_AES_set_encrypt_key");
+ &mov ("esi",&wparam(1)); # user supplied key
+ &mov ("edi",&wparam(3)); # private key schedule
+
+ &test ("esi",-1);
+ &jz (&label("badpointer"));
+ &test ("edi",-1);
+ &jz (&label("badpointer"));
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop($tbl);
+ &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
+ &lea ($tbl,&DWP(2048+128,$tbl));
+
+ # prefetch Te4
+ &mov ("eax",&DWP(0-128,$tbl));
+ &mov ("ebx",&DWP(32-128,$tbl));
+ &mov ("ecx",&DWP(64-128,$tbl));
+ &mov ("edx",&DWP(96-128,$tbl));
+ &mov ("eax",&DWP(128-128,$tbl));
+ &mov ("ebx",&DWP(160-128,$tbl));
+ &mov ("ecx",&DWP(192-128,$tbl));
+ &mov ("edx",&DWP(224-128,$tbl));
+
+ &mov ("ecx",&wparam(2)); # number of bits in key
+ &cmp ("ecx",128);
+ &je (&label("10rounds"));
+ &cmp ("ecx",192);
+ &je (&label("12rounds"));
+ &cmp ("ecx",256);
+ &je (&label("14rounds"));
+ &mov ("eax",-2); # invalid number of bits
+ &jmp (&label("exit"));
+
+ &set_label("10rounds");
+ &mov ("eax",&DWP(0,"esi")); # copy first 4 dwords
+ &mov ("ebx",&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("edx",&DWP(12,"esi"));
+ &mov (&DWP(0,"edi"),"eax");
+ &mov (&DWP(4,"edi"),"ebx");
+ &mov (&DWP(8,"edi"),"ecx");
+ &mov (&DWP(12,"edi"),"edx");
+
+ &xor ("ecx","ecx");
+ &jmp (&label("10shortcut"));
+
+ &align (4);
+ &set_label("10loop");
+ &mov ("eax",&DWP(0,"edi")); # rk[0]
+ &mov ("edx",&DWP(12,"edi")); # rk[3]
+ &set_label("10shortcut");
+ &enckey ();
+
+ &mov (&DWP(16,"edi"),"eax"); # rk[4]
+ &xor ("eax",&DWP(4,"edi"));
+ &mov (&DWP(20,"edi"),"eax"); # rk[5]
+ &xor ("eax",&DWP(8,"edi"));
+ &mov (&DWP(24,"edi"),"eax"); # rk[6]
+ &xor ("eax",&DWP(12,"edi"));
+ &mov (&DWP(28,"edi"),"eax"); # rk[7]
+ &inc ("ecx");
+ &add ("edi",16);
+ &cmp ("ecx",10);
+ &jl (&label("10loop"));
+
+ &mov (&DWP(80,"edi"),10); # setup number of rounds
+ &xor ("eax","eax");
+ &jmp (&label("exit"));
+
+ &set_label("12rounds");
+ &mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
+ &mov ("ebx",&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("edx",&DWP(12,"esi"));
+ &mov (&DWP(0,"edi"),"eax");
+ &mov (&DWP(4,"edi"),"ebx");
+ &mov (&DWP(8,"edi"),"ecx");
+ &mov (&DWP(12,"edi"),"edx");
+ &mov ("ecx",&DWP(16,"esi"));
+ &mov ("edx",&DWP(20,"esi"));
+ &mov (&DWP(16,"edi"),"ecx");
+ &mov (&DWP(20,"edi"),"edx");
+
+ &xor ("ecx","ecx");
+ &jmp (&label("12shortcut"));
+
+ &align (4);
+ &set_label("12loop");
+ &mov ("eax",&DWP(0,"edi")); # rk[0]
+ &mov ("edx",&DWP(20,"edi")); # rk[5]
+ &set_label("12shortcut");
+ &enckey ();
+
+ &mov (&DWP(24,"edi"),"eax"); # rk[6]
+ &xor ("eax",&DWP(4,"edi"));
+ &mov (&DWP(28,"edi"),"eax"); # rk[7]
+ &xor ("eax",&DWP(8,"edi"));
+ &mov (&DWP(32,"edi"),"eax"); # rk[8]
+ &xor ("eax",&DWP(12,"edi"));
+ &mov (&DWP(36,"edi"),"eax"); # rk[9]
+
+ &cmp ("ecx",7);
+ &je (&label("12break"));
+ &inc ("ecx");
+
+ &xor ("eax",&DWP(16,"edi"));
+ &mov (&DWP(40,"edi"),"eax"); # rk[10]
+ &xor ("eax",&DWP(20,"edi"));
+ &mov (&DWP(44,"edi"),"eax"); # rk[11]
+
+ &add ("edi",24);
+ &jmp (&label("12loop"));
+
+ &set_label("12break");
+ &mov (&DWP(72,"edi"),12); # setup number of rounds
+ &xor ("eax","eax");
+ &jmp (&label("exit"));
+
+ &set_label("14rounds");
+ &mov ("eax",&DWP(0,"esi")); # copy first 8 dwords
+ &mov ("ebx",&DWP(4,"esi"));
+ &mov ("ecx",&DWP(8,"esi"));
+ &mov ("edx",&DWP(12,"esi"));
+ &mov (&DWP(0,"edi"),"eax");
+ &mov (&DWP(4,"edi"),"ebx");
+ &mov (&DWP(8,"edi"),"ecx");
+ &mov (&DWP(12,"edi"),"edx");
+ &mov ("eax",&DWP(16,"esi"));
+ &mov ("ebx",&DWP(20,"esi"));
+ &mov ("ecx",&DWP(24,"esi"));
+ &mov ("edx",&DWP(28,"esi"));
+ &mov (&DWP(16,"edi"),"eax");
+ &mov (&DWP(20,"edi"),"ebx");
+ &mov (&DWP(24,"edi"),"ecx");
+ &mov (&DWP(28,"edi"),"edx");
+
+ &xor ("ecx","ecx");
+ &jmp (&label("14shortcut"));
+
+ &align (4);
+ &set_label("14loop");
+ &mov ("edx",&DWP(28,"edi")); # rk[7]
+ &set_label("14shortcut");
+ &mov ("eax",&DWP(0,"edi")); # rk[0]
+
+ &enckey ();
+
+ &mov (&DWP(32,"edi"),"eax"); # rk[8]
+ &xor ("eax",&DWP(4,"edi"));
+ &mov (&DWP(36,"edi"),"eax"); # rk[9]
+ &xor ("eax",&DWP(8,"edi"));
+ &mov (&DWP(40,"edi"),"eax"); # rk[10]
+ &xor ("eax",&DWP(12,"edi"));
+ &mov (&DWP(44,"edi"),"eax"); # rk[11]
+
+ &cmp ("ecx",6);
+ &je (&label("14break"));
+ &inc ("ecx");
+
+ &mov ("edx","eax");
+ &mov ("eax",&DWP(16,"edi")); # rk[4]
+ &movz ("esi",&LB("edx")); # rk[11]>>0
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
+ &movz ("esi",&HB("edx")); # rk[11]>>8
+ &xor ("eax","ebx");
+
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
+ &shr ("edx",16);
+ &shl ("ebx",8);
+ &movz ("esi",&LB("edx")); # rk[11]>>16
+ &xor ("eax","ebx");
+
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
+ &movz ("esi",&HB("edx")); # rk[11]>>24
+ &shl ("ebx",16);
+ &xor ("eax","ebx");
+
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
+ &shl ("ebx",24);
+ &xor ("eax","ebx");
+
+ &mov (&DWP(48,"edi"),"eax"); # rk[12]
+ &xor ("eax",&DWP(20,"edi"));
+ &mov (&DWP(52,"edi"),"eax"); # rk[13]
+ &xor ("eax",&DWP(24,"edi"));
+ &mov (&DWP(56,"edi"),"eax"); # rk[14]
+ &xor ("eax",&DWP(28,"edi"));
+ &mov (&DWP(60,"edi"),"eax"); # rk[15]
+
+ &add ("edi",32);
+ &jmp (&label("14loop"));
+
+ &set_label("14break");
+ &mov (&DWP(48,"edi"),14); # setup number of rounds
+ &xor ("eax","eax");
+ &jmp (&label("exit"));
+
+ &set_label("badpointer");
+ &mov ("eax",-1);
+ &set_label("exit");
+&function_end("_x86_AES_set_encrypt_key");
+
+# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# AES_KEY *key)
+&function_begin_B("AES_set_encrypt_key");
+ &call ("_x86_AES_set_encrypt_key");
+ &ret ();
+&function_end_B("AES_set_encrypt_key");
+
+sub deckey()
+{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
+ my $tmp = $tbl;
+
+ &mov ($tmp,0x80808080);
+ &and ($tmp,$tp1);
+ &lea ($tp2,&DWP(0,$tp1,$tp1));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
+ &sub ($acc,$tmp);
+ &and ($tp2,0xfefefefe);
+ &and ($acc,0x1b1b1b1b);
+ &xor ($tp2,$acc);
+ &mov ($tmp,0x80808080);
+
+ &and ($tmp,$tp2);
+ &lea ($tp4,&DWP(0,$tp2,$tp2));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
+ &sub ($acc,$tmp);
+ &and ($tp4,0xfefefefe);
+ &and ($acc,0x1b1b1b1b);
+ &xor ($tp2,$tp1); # tp2^tp1
+ &xor ($tp4,$acc);
+ &mov ($tmp,0x80808080);
+
+ &and ($tmp,$tp4);
+ &lea ($tp8,&DWP(0,$tp4,$tp4));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
+ &xor ($tp4,$tp1); # tp4^tp1
+ &sub ($acc,$tmp);
+ &and ($tp8,0xfefefefe);
+ &and ($acc,0x1b1b1b1b);
+ &rotl ($tp1,8); # = ROTATE(tp1,8)
+ &xor ($tp8,$acc);
+
+ &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
+
+ &xor ($tp1,$tp2);
+ &xor ($tp2,$tp8);
+ &xor ($tp1,$tp4);
+ &rotl ($tp2,24);
+ &xor ($tp4,$tp8);
+ &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
+ &rotl ($tp4,16);
+ &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
+ &rotl ($tp8,8);
+ &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
+ &mov ($tp2,$tmp);
+ &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
+
+ &mov (&DWP(4*$i,$key),$tp1);
+}
+
+# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# AES_KEY *key)
+&function_begin_B("AES_set_decrypt_key");
+ &call ("_x86_AES_set_encrypt_key");
+ &cmp ("eax",0);
+ &je (&label("proceed"));
+ &ret ();
+
+ &set_label("proceed");
+ &push ("ebp");
+ &push ("ebx");
+ &push ("esi");
+ &push ("edi");
+
+ &mov ("esi",&wparam(2));
+ &mov ("ecx",&DWP(240,"esi")); # pull number of rounds
+ &lea ("ecx",&DWP(0,"","ecx",4));
+ &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
+
+ &set_label("invert",4); # invert order of chunks
+ &mov ("eax",&DWP(0,"esi"));
+ &mov ("ebx",&DWP(4,"esi"));
+ &mov ("ecx",&DWP(0,"edi"));
+ &mov ("edx",&DWP(4,"edi"));
+ &mov (&DWP(0,"edi"),"eax");
+ &mov (&DWP(4,"edi"),"ebx");
+ &mov (&DWP(0,"esi"),"ecx");
+ &mov (&DWP(4,"esi"),"edx");
+ &mov ("eax",&DWP(8,"esi"));
+ &mov ("ebx",&DWP(12,"esi"));
+ &mov ("ecx",&DWP(8,"edi"));
+ &mov ("edx",&DWP(12,"edi"));
+ &mov (&DWP(8,"edi"),"eax");
+ &mov (&DWP(12,"edi"),"ebx");
+ &mov (&DWP(8,"esi"),"ecx");
+ &mov (&DWP(12,"esi"),"edx");
+ &add ("esi",16);
+ &sub ("edi",16);
+ &cmp ("esi","edi");
+ &jne (&label("invert"));
+
+ &mov ($key,&wparam(2));
+ &mov ($acc,&DWP(240,$key)); # pull number of rounds
+ &lea ($acc,&DWP(-2,$acc,$acc));
+ &lea ($acc,&DWP(0,$key,$acc,8));
+ &mov (&wparam(2),$acc);
+
+ &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
+ &set_label("permute",4); # permute the key schedule
+ &add ($key,16);
+ &deckey (0,$key,$s0,$s1,$s2,$s3);
+ &deckey (1,$key,$s1,$s2,$s3,$s0);
+ &deckey (2,$key,$s2,$s3,$s0,$s1);
+ &deckey (3,$key,$s3,$s0,$s1,$s2);
+ &cmp ($key,&wparam(2));
+ &jb (&label("permute"));
+
+ &xor ("eax","eax"); # return success
+&function_end("AES_set_decrypt_key");
+&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aes-armv4.pl b/openssl-1.1.0h/crypto/aes/asm/aes-armv4.pl
new file mode 100644
index 0000000..9981589
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aes-armv4.pl
@@ -0,0 +1,1245 @@
+#! /usr/bin/env perl
+# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for ARMv4
+
+# January 2007.
+#
+# Code uses single 1K S-box and is >2 times faster than code generated
+# by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which
+# allows to merge logical or arithmetic operation with shift or rotate
+# in one instruction and emit combined result every cycle. The module
+# is endian-neutral. The performance is ~42 cycles/byte for 128-bit
+# key [on single-issue Xscale PXA250 core].
+
+# May 2007.
+#
+# AES_set_[en|de]crypt_key is added.
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 12% improvement on
+# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~21.5 cycles per byte.
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+$s0="r0";
+$s1="r1";
+$s2="r2";
+$s3="r3";
+$t1="r4";
+$t2="r5";
+$t3="r6";
+$i1="r7";
+$i2="r8";
+$i3="r9";
+
+$tbl="r10";
+$key="r11";
+$rounds="r12";
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+.text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax unified
+.thumb
+#else
+.code 32
+#undef __thumb2__
+#endif
+
+.type AES_Te,%object
+.align 5
+AES_Te:
+.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
+.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
+.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
+.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
+.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
+.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
+.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
+.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
+.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
+.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
+.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
+.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
+.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
+.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
+.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
+.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
+.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
+.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
+.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
+.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
+.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
+.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
+.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
+.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
+.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
+.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
+.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
+.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
+.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
+.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
+.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
+.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
+.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
+.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
+.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
+.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
+.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
+.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
+.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
+.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
+.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
+.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
+.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
+.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
+.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
+.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
+.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
+.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
+.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
+.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
+.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
+.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
+.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
+.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
+.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
+.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
+.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
+.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
+.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
+.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
+.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
+.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
+.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
+.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
+@ Te4[256]
+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+@ rcon[]
+.word 0x01000000, 0x02000000, 0x04000000, 0x08000000
+.word 0x10000000, 0x20000000, 0x40000000, 0x80000000
+.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
+.size AES_Te,.-AES_Te
+
+@ void AES_encrypt(const unsigned char *in, unsigned char *out,
+@ const AES_KEY *key) {
+.global AES_encrypt
+.type AES_encrypt,%function
+.align 5
+AES_encrypt:
+#ifndef __thumb2__
+ sub r3,pc,#8 @ AES_encrypt
+#else
+ adr r3,.
+#endif
+ stmdb sp!,{r1,r4-r12,lr}
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr $tbl,AES_Te
+#else
+ sub $tbl,r3,#AES_encrypt-AES_Te @ Te
+#endif
+ mov $rounds,r0 @ inp
+ mov $key,r2
+#if __ARM_ARCH__<7
+ ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
+ ldrb $t1,[$rounds,#2] @ manner...
+ ldrb $t2,[$rounds,#1]
+ ldrb $t3,[$rounds,#0]
+ orr $s0,$s0,$t1,lsl#8
+ ldrb $s1,[$rounds,#7]
+ orr $s0,$s0,$t2,lsl#16
+ ldrb $t1,[$rounds,#6]
+ orr $s0,$s0,$t3,lsl#24
+ ldrb $t2,[$rounds,#5]
+ ldrb $t3,[$rounds,#4]
+ orr $s1,$s1,$t1,lsl#8
+ ldrb $s2,[$rounds,#11]
+ orr $s1,$s1,$t2,lsl#16
+ ldrb $t1,[$rounds,#10]
+ orr $s1,$s1,$t3,lsl#24
+ ldrb $t2,[$rounds,#9]
+ ldrb $t3,[$rounds,#8]
+ orr $s2,$s2,$t1,lsl#8
+ ldrb $s3,[$rounds,#15]
+ orr $s2,$s2,$t2,lsl#16
+ ldrb $t1,[$rounds,#14]
+ orr $s2,$s2,$t3,lsl#24
+ ldrb $t2,[$rounds,#13]
+ ldrb $t3,[$rounds,#12]
+ orr $s3,$s3,$t1,lsl#8
+ orr $s3,$s3,$t2,lsl#16
+ orr $s3,$s3,$t3,lsl#24
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+#endif
+ bl _armv4_AES_encrypt
+
+ ldr $rounds,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$rounds,#0]
+ str $s1,[$rounds,#4]
+ str $s2,[$rounds,#8]
+ str $s3,[$rounds,#12]
+#else
+ mov $t1,$s0,lsr#24 @ write output in endian-neutral
+ mov $t2,$s0,lsr#16 @ manner...
+ mov $t3,$s0,lsr#8
+ strb $t1,[$rounds,#0]
+ strb $t2,[$rounds,#1]
+ mov $t1,$s1,lsr#24
+ strb $t3,[$rounds,#2]
+ mov $t2,$s1,lsr#16
+ strb $s0,[$rounds,#3]
+ mov $t3,$s1,lsr#8
+ strb $t1,[$rounds,#4]
+ strb $t2,[$rounds,#5]
+ mov $t1,$s2,lsr#24
+ strb $t3,[$rounds,#6]
+ mov $t2,$s2,lsr#16
+ strb $s1,[$rounds,#7]
+ mov $t3,$s2,lsr#8
+ strb $t1,[$rounds,#8]
+ strb $t2,[$rounds,#9]
+ mov $t1,$s3,lsr#24
+ strb $t3,[$rounds,#10]
+ mov $t2,$s3,lsr#16
+ strb $s2,[$rounds,#11]
+ mov $t3,$s3,lsr#8
+ strb $t1,[$rounds,#12]
+ strb $t2,[$rounds,#13]
+ strb $t3,[$rounds,#14]
+ strb $s3,[$rounds,#15]
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
+ ldmia sp!,{r4-r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size AES_encrypt,.-AES_encrypt
+
+.type _armv4_AES_encrypt,%function
+.align 2
+_armv4_AES_encrypt:
+ str lr,[sp,#-4]! @ push lr
+ ldmia $key!,{$t1-$i1}
+ eor $s0,$s0,$t1
+ ldr $rounds,[$key,#240-16]
+ eor $s1,$s1,$t2
+ eor $s2,$s2,$t3
+ eor $s3,$s3,$i1
+ sub $rounds,$rounds,#1
+ mov lr,#255
+
+ and $i1,lr,$s0
+ and $i2,lr,$s0,lsr#8
+ and $i3,lr,$s0,lsr#16
+ mov $s0,$s0,lsr#24
+.Lenc_loop:
+ ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0]
+ and $i1,lr,$s1,lsr#16 @ i0
+ ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
+ and $i2,lr,$s1
+ ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
+ and $i3,lr,$s1,lsr#8
+ ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
+ mov $s1,$s1,lsr#24
+
+ ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16]
+ ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0]
+ ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8]
+ eor $s0,$s0,$i1,ror#8
+ ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
+ and $i1,lr,$s2,lsr#8 @ i0
+ eor $t2,$t2,$i2,ror#8
+ and $i2,lr,$s2,lsr#16 @ i1
+ eor $t3,$t3,$i3,ror#8
+ and $i3,lr,$s2
+ ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
+ eor $s1,$s1,$t1,ror#24
+ ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
+ mov $s2,$s2,lsr#24
+
+ ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
+ eor $s0,$s0,$i1,ror#16
+ ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
+ and $i1,lr,$s3 @ i0
+ eor $s1,$s1,$i2,ror#8
+ and $i2,lr,$s3,lsr#8 @ i1
+ eor $t3,$t3,$i3,ror#16
+ and $i3,lr,$s3,lsr#16 @ i2
+ ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
+ eor $s2,$s2,$t2,ror#16
+ ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
+ mov $s3,$s3,lsr#24
+
+ ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
+ eor $s0,$s0,$i1,ror#24
+ ldr $i1,[$key],#16
+ eor $s1,$s1,$i2,ror#16
+ ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
+ eor $s2,$s2,$i3,ror#8
+ ldr $t1,[$key,#-12]
+ eor $s3,$s3,$t3,ror#8
+
+ ldr $t2,[$key,#-8]
+ eor $s0,$s0,$i1
+ ldr $t3,[$key,#-4]
+ and $i1,lr,$s0
+ eor $s1,$s1,$t1
+ and $i2,lr,$s0,lsr#8
+ eor $s2,$s2,$t2
+ and $i3,lr,$s0,lsr#16
+ eor $s3,$s3,$t3
+ mov $s0,$s0,lsr#24
+
+ subs $rounds,$rounds,#1
+ bne .Lenc_loop
+
+ add $tbl,$tbl,#2
+
+ ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0]
+ and $i1,lr,$s1,lsr#16 @ i0
+ ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
+ and $i2,lr,$s1
+ ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
+ and $i3,lr,$s1,lsr#8
+ ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
+ mov $s1,$s1,lsr#24
+
+ ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16]
+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0]
+ ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8]
+ eor $s0,$i1,$s0,lsl#8
+ ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
+ and $i1,lr,$s2,lsr#8 @ i0
+ eor $t2,$i2,$t2,lsl#8
+ and $i2,lr,$s2,lsr#16 @ i1
+ eor $t3,$i3,$t3,lsl#8
+ and $i3,lr,$s2
+ ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
+ eor $s1,$t1,$s1,lsl#24
+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
+ mov $s2,$s2,lsr#24
+
+ ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
+ eor $s0,$i1,$s0,lsl#8
+ ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
+ and $i1,lr,$s3 @ i0
+ eor $s1,$s1,$i2,lsl#16
+ and $i2,lr,$s3,lsr#8 @ i1
+ eor $t3,$i3,$t3,lsl#8
+ and $i3,lr,$s3,lsr#16 @ i2
+ ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
+ eor $s2,$t2,$s2,lsl#24
+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
+ mov $s3,$s3,lsr#24
+
+ ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
+ eor $s0,$i1,$s0,lsl#8
+ ldr $i1,[$key,#0]
+ ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
+ eor $s1,$s1,$i2,lsl#8
+ ldr $t1,[$key,#4]
+ eor $s2,$s2,$i3,lsl#16
+ ldr $t2,[$key,#8]
+ eor $s3,$t3,$s3,lsl#24
+ ldr $t3,[$key,#12]
+
+ eor $s0,$s0,$i1
+ eor $s1,$s1,$t1
+ eor $s2,$s2,$t2
+ eor $s3,$s3,$t3
+
+ sub $tbl,$tbl,#2
+ ldr pc,[sp],#4 @ pop and return
+.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
+
+.global AES_set_encrypt_key
+.type AES_set_encrypt_key,%function
+.align 5
+AES_set_encrypt_key:
+_armv4_AES_set_encrypt_key:
+#ifndef __thumb2__
+ sub r3,pc,#8 @ AES_set_encrypt_key
+#else
+ adr r3,.
+#endif
+ teq r0,#0
+#ifdef __thumb2__
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
+ moveq r0,#-1
+ beq .Labrt
+ teq r2,#0
+#ifdef __thumb2__
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
+ moveq r0,#-1
+ beq .Labrt
+
+ teq r1,#128
+ beq .Lok
+ teq r1,#192
+ beq .Lok
+ teq r1,#256
+#ifdef __thumb2__
+ itt ne @ Thumb2 thing, sanity check in ARM
+#endif
+ movne r0,#-1
+ bne .Labrt
+
+.Lok: stmdb sp!,{r4-r12,lr}
+ mov $rounds,r0 @ inp
+ mov lr,r1 @ bits
+ mov $key,r2 @ key
+
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr $tbl,AES_Te+1024 @ Te4
+#else
+ sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
+#endif
+
+#if __ARM_ARCH__<7
+ ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
+ ldrb $t1,[$rounds,#2] @ manner...
+ ldrb $t2,[$rounds,#1]
+ ldrb $t3,[$rounds,#0]
+ orr $s0,$s0,$t1,lsl#8
+ ldrb $s1,[$rounds,#7]
+ orr $s0,$s0,$t2,lsl#16
+ ldrb $t1,[$rounds,#6]
+ orr $s0,$s0,$t3,lsl#24
+ ldrb $t2,[$rounds,#5]
+ ldrb $t3,[$rounds,#4]
+ orr $s1,$s1,$t1,lsl#8
+ ldrb $s2,[$rounds,#11]
+ orr $s1,$s1,$t2,lsl#16
+ ldrb $t1,[$rounds,#10]
+ orr $s1,$s1,$t3,lsl#24
+ ldrb $t2,[$rounds,#9]
+ ldrb $t3,[$rounds,#8]
+ orr $s2,$s2,$t1,lsl#8
+ ldrb $s3,[$rounds,#15]
+ orr $s2,$s2,$t2,lsl#16
+ ldrb $t1,[$rounds,#14]
+ orr $s2,$s2,$t3,lsl#24
+ ldrb $t2,[$rounds,#13]
+ ldrb $t3,[$rounds,#12]
+ orr $s3,$s3,$t1,lsl#8
+ str $s0,[$key],#16
+ orr $s3,$s3,$t2,lsl#16
+ str $s1,[$key,#-12]
+ orr $s3,$s3,$t3,lsl#24
+ str $s2,[$key,#-8]
+ str $s3,[$key,#-4]
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$key],#16
+ str $s1,[$key,#-12]
+ str $s2,[$key,#-8]
+ str $s3,[$key,#-4]
+#endif
+
+ teq lr,#128
+ bne .Lnot128
+ mov $rounds,#10
+ str $rounds,[$key,#240-16]
+ add $t3,$tbl,#256 @ rcon
+ mov lr,#255
+
+.L128_loop:
+ and $t2,lr,$s3,lsr#24
+ and $i1,lr,$s3,lsr#16
+ ldrb $t2,[$tbl,$t2]
+ and $i2,lr,$s3,lsr#8
+ ldrb $i1,[$tbl,$i1]
+ and $i3,lr,$s3
+ ldrb $i2,[$tbl,$i2]
+ orr $t2,$t2,$i1,lsl#24
+ ldrb $i3,[$tbl,$i3]
+ orr $t2,$t2,$i2,lsl#16
+ ldr $t1,[$t3],#4 @ rcon[i++]
+ orr $t2,$t2,$i3,lsl#8
+ eor $t2,$t2,$t1
+ eor $s0,$s0,$t2 @ rk[4]=rk[0]^...
+ eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4]
+ str $s0,[$key],#16
+ eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
+ str $s1,[$key,#-12]
+ eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
+ str $s2,[$key,#-8]
+ subs $rounds,$rounds,#1
+ str $s3,[$key,#-4]
+ bne .L128_loop
+ sub r2,$key,#176
+ b .Ldone
+
+.Lnot128:
+#if __ARM_ARCH__<7
+ ldrb $i2,[$rounds,#19]
+ ldrb $t1,[$rounds,#18]
+ ldrb $t2,[$rounds,#17]
+ ldrb $t3,[$rounds,#16]
+ orr $i2,$i2,$t1,lsl#8
+ ldrb $i3,[$rounds,#23]
+ orr $i2,$i2,$t2,lsl#16
+ ldrb $t1,[$rounds,#22]
+ orr $i2,$i2,$t3,lsl#24
+ ldrb $t2,[$rounds,#21]
+ ldrb $t3,[$rounds,#20]
+ orr $i3,$i3,$t1,lsl#8
+ orr $i3,$i3,$t2,lsl#16
+ str $i2,[$key],#8
+ orr $i3,$i3,$t3,lsl#24
+ str $i3,[$key,#-4]
+#else
+ ldr $i2,[$rounds,#16]
+ ldr $i3,[$rounds,#20]
+#ifdef __ARMEL__
+ rev $i2,$i2
+ rev $i3,$i3
+#endif
+ str $i2,[$key],#8
+ str $i3,[$key,#-4]
+#endif
+
+ teq lr,#192
+ bne .Lnot192
+ mov $rounds,#12
+ str $rounds,[$key,#240-24]
+ add $t3,$tbl,#256 @ rcon
+ mov lr,#255
+ mov $rounds,#8
+
+.L192_loop:
+ and $t2,lr,$i3,lsr#24
+ and $i1,lr,$i3,lsr#16
+ ldrb $t2,[$tbl,$t2]
+ and $i2,lr,$i3,lsr#8
+ ldrb $i1,[$tbl,$i1]
+ and $i3,lr,$i3
+ ldrb $i2,[$tbl,$i2]
+ orr $t2,$t2,$i1,lsl#24
+ ldrb $i3,[$tbl,$i3]
+ orr $t2,$t2,$i2,lsl#16
+ ldr $t1,[$t3],#4 @ rcon[i++]
+ orr $t2,$t2,$i3,lsl#8
+ eor $i3,$t2,$t1
+ eor $s0,$s0,$i3 @ rk[6]=rk[0]^...
+ eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6]
+ str $s0,[$key],#24
+ eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
+ str $s1,[$key,#-20]
+ eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
+ str $s2,[$key,#-16]
+ subs $rounds,$rounds,#1
+ str $s3,[$key,#-12]
+#ifdef __thumb2__
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
+ subeq r2,$key,#216
+ beq .Ldone
+
+ ldr $i1,[$key,#-32]
+ ldr $i2,[$key,#-28]
+ eor $i1,$i1,$s3 @ rk[10]=rk[4]^rk[9]
+ eor $i3,$i2,$i1 @ rk[11]=rk[5]^rk[10]
+ str $i1,[$key,#-8]
+ str $i3,[$key,#-4]
+ b .L192_loop
+
+.Lnot192:
+#if __ARM_ARCH__<7
+ ldrb $i2,[$rounds,#27]
+ ldrb $t1,[$rounds,#26]
+ ldrb $t2,[$rounds,#25]
+ ldrb $t3,[$rounds,#24]
+ orr $i2,$i2,$t1,lsl#8
+ ldrb $i3,[$rounds,#31]
+ orr $i2,$i2,$t2,lsl#16
+ ldrb $t1,[$rounds,#30]
+ orr $i2,$i2,$t3,lsl#24
+ ldrb $t2,[$rounds,#29]
+ ldrb $t3,[$rounds,#28]
+ orr $i3,$i3,$t1,lsl#8
+ orr $i3,$i3,$t2,lsl#16
+ str $i2,[$key],#8
+ orr $i3,$i3,$t3,lsl#24
+ str $i3,[$key,#-4]
+#else
+ ldr $i2,[$rounds,#24]
+ ldr $i3,[$rounds,#28]
+#ifdef __ARMEL__
+ rev $i2,$i2
+ rev $i3,$i3
+#endif
+ str $i2,[$key],#8
+ str $i3,[$key,#-4]
+#endif
+
+ mov $rounds,#14
+ str $rounds,[$key,#240-32]
+ add $t3,$tbl,#256 @ rcon
+ mov lr,#255
+ mov $rounds,#7
+
+.L256_loop:
+ and $t2,lr,$i3,lsr#24
+ and $i1,lr,$i3,lsr#16
+ ldrb $t2,[$tbl,$t2]
+ and $i2,lr,$i3,lsr#8
+ ldrb $i1,[$tbl,$i1]
+ and $i3,lr,$i3
+ ldrb $i2,[$tbl,$i2]
+ orr $t2,$t2,$i1,lsl#24
+ ldrb $i3,[$tbl,$i3]
+ orr $t2,$t2,$i2,lsl#16
+ ldr $t1,[$t3],#4 @ rcon[i++]
+ orr $t2,$t2,$i3,lsl#8
+ eor $i3,$t2,$t1
+ eor $s0,$s0,$i3 @ rk[8]=rk[0]^...
+ eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8]
+ str $s0,[$key],#32
+ eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
+ str $s1,[$key,#-28]
+ eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
+ str $s2,[$key,#-24]
+ subs $rounds,$rounds,#1
+ str $s3,[$key,#-20]
+#ifdef __thumb2__
+ itt eq @ Thumb2 thing, sanity check in ARM
+#endif
+ subeq r2,$key,#256
+ beq .Ldone
+
+ and $t2,lr,$s3
+ and $i1,lr,$s3,lsr#8
+ ldrb $t2,[$tbl,$t2]
+ and $i2,lr,$s3,lsr#16
+ ldrb $i1,[$tbl,$i1]
+ and $i3,lr,$s3,lsr#24
+ ldrb $i2,[$tbl,$i2]
+ orr $t2,$t2,$i1,lsl#8
+ ldrb $i3,[$tbl,$i3]
+ orr $t2,$t2,$i2,lsl#16
+ ldr $t1,[$key,#-48]
+ orr $t2,$t2,$i3,lsl#24
+
+ ldr $i1,[$key,#-44]
+ ldr $i2,[$key,#-40]
+ eor $t1,$t1,$t2 @ rk[12]=rk[4]^...
+ ldr $i3,[$key,#-36]
+ eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12]
+ str $t1,[$key,#-16]
+ eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
+ str $i1,[$key,#-12]
+ eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
+ str $i2,[$key,#-8]
+ str $i3,[$key,#-4]
+ b .L256_loop
+
+.align 2
+.Ldone: mov r0,#0
+ ldmia sp!,{r4-r12,lr}
+.Labrt:
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size AES_set_encrypt_key,.-AES_set_encrypt_key
+
+.global AES_set_decrypt_key
+.type AES_set_decrypt_key,%function
+.align 5
+AES_set_decrypt_key:
+ str lr,[sp,#-4]! @ push lr
+ bl _armv4_AES_set_encrypt_key
+ teq r0,#0
+ ldr lr,[sp],#4 @ pop lr
+ bne .Labrt
+
+ mov r0,r2 @ AES_set_encrypt_key preserves r2,
+ mov r1,r2 @ which is AES_KEY *key
+ b _armv4_AES_set_enc2dec_key
+.size AES_set_decrypt_key,.-AES_set_decrypt_key
+
+@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
+.global AES_set_enc2dec_key
+.type AES_set_enc2dec_key,%function
+.align 5
+AES_set_enc2dec_key:
+_armv4_AES_set_enc2dec_key:
+ stmdb sp!,{r4-r12,lr}
+
+ ldr $rounds,[r0,#240]
+ mov $i1,r0 @ input
+ add $i2,r0,$rounds,lsl#4
+ mov $key,r1 @ output
+ add $tbl,r1,$rounds,lsl#4
+ str $rounds,[r1,#240]
+
+.Linv: ldr $s0,[$i1],#16
+ ldr $s1,[$i1,#-12]
+ ldr $s2,[$i1,#-8]
+ ldr $s3,[$i1,#-4]
+ ldr $t1,[$i2],#-16
+ ldr $t2,[$i2,#16+4]
+ ldr $t3,[$i2,#16+8]
+ ldr $i3,[$i2,#16+12]
+ str $s0,[$tbl],#-16
+ str $s1,[$tbl,#16+4]
+ str $s2,[$tbl,#16+8]
+ str $s3,[$tbl,#16+12]
+ str $t1,[$key],#16
+ str $t2,[$key,#-12]
+ str $t3,[$key,#-8]
+ str $i3,[$key,#-4]
+ teq $i1,$i2
+ bne .Linv
+
+ ldr $s0,[$i1]
+ ldr $s1,[$i1,#4]
+ ldr $s2,[$i1,#8]
+ ldr $s3,[$i1,#12]
+ str $s0,[$key]
+ str $s1,[$key,#4]
+ str $s2,[$key,#8]
+ str $s3,[$key,#12]
+ sub $key,$key,$rounds,lsl#3
+___
+$mask80=$i1;
+$mask1b=$i2;
+$mask7f=$i3;
+$code.=<<___;
+ ldr $s0,[$key,#16]! @ prefetch tp1
+ mov $mask80,#0x80
+ mov $mask1b,#0x1b
+ orr $mask80,$mask80,#0x8000
+ orr $mask1b,$mask1b,#0x1b00
+ orr $mask80,$mask80,$mask80,lsl#16
+ orr $mask1b,$mask1b,$mask1b,lsl#16
+ sub $rounds,$rounds,#1
+ mvn $mask7f,$mask80
+ mov $rounds,$rounds,lsl#2 @ (rounds-1)*4
+
+.Lmix: and $t1,$s0,$mask80
+ and $s1,$s0,$mask7f
+ sub $t1,$t1,$t1,lsr#7
+ and $t1,$t1,$mask1b
+ eor $s1,$t1,$s1,lsl#1 @ tp2
+
+ and $t1,$s1,$mask80
+ and $s2,$s1,$mask7f
+ sub $t1,$t1,$t1,lsr#7
+ and $t1,$t1,$mask1b
+ eor $s2,$t1,$s2,lsl#1 @ tp4
+
+ and $t1,$s2,$mask80
+ and $s3,$s2,$mask7f
+ sub $t1,$t1,$t1,lsr#7
+ and $t1,$t1,$mask1b
+ eor $s3,$t1,$s3,lsl#1 @ tp8
+
+ eor $t1,$s1,$s2
+ eor $t2,$s0,$s3 @ tp9
+ eor $t1,$t1,$s3 @ tpe
+ eor $t1,$t1,$s1,ror#24
+ eor $t1,$t1,$t2,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8)
+ eor $t1,$t1,$s2,ror#16
+ eor $t1,$t1,$t2,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16)
+ eor $t1,$t1,$t2,ror#8 @ ^= ROTATE(tp9,24)
+
+ ldr $s0,[$key,#4] @ prefetch tp1
+ str $t1,[$key],#4
+ subs $rounds,$rounds,#1
+ bne .Lmix
+
+ mov r0,#0
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
+ ldmia sp!,{r4-r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size AES_set_enc2dec_key,.-AES_set_enc2dec_key
+
+.type AES_Td,%object
+.align 5
+AES_Td:
+.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
+.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
+.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
+.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
+.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
+.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
+.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
+.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
+.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
+.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
+.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
+.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
+.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
+.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
+.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
+.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
+.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
+.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
+.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
+.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
+.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
+.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
+.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
+.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
+.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
+.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
+.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
+.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
+.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
+.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
+.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
+.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
+.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
+.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
+.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
+.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
+.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
+.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
+.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
+.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
+.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
+.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
+.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
+.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
+.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
+.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
+.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
+.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
+.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
+.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
+.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
+.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
+.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
+.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
+.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
+.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
+.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
+.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
+.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
+.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
+.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
+.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
+.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
+.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
+@ Td4[256]
+.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+.size AES_Td,.-AES_Td
+
+@ void AES_decrypt(const unsigned char *in, unsigned char *out,
+@ const AES_KEY *key) {
+.global AES_decrypt
+.type AES_decrypt,%function
+.align 5
+AES_decrypt:
+#ifndef __thumb2__
+ sub r3,pc,#8 @ AES_decrypt
+#else
+ adr r3,.
+#endif
+ stmdb sp!,{r1,r4-r12,lr}
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr $tbl,AES_Td
+#else
+ sub $tbl,r3,#AES_decrypt-AES_Td @ Td
+#endif
+ mov $rounds,r0 @ inp
+ mov $key,r2
+#if __ARM_ARCH__<7
+ ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
+ ldrb $t1,[$rounds,#2] @ manner...
+ ldrb $t2,[$rounds,#1]
+ ldrb $t3,[$rounds,#0]
+ orr $s0,$s0,$t1,lsl#8
+ ldrb $s1,[$rounds,#7]
+ orr $s0,$s0,$t2,lsl#16
+ ldrb $t1,[$rounds,#6]
+ orr $s0,$s0,$t3,lsl#24
+ ldrb $t2,[$rounds,#5]
+ ldrb $t3,[$rounds,#4]
+ orr $s1,$s1,$t1,lsl#8
+ ldrb $s2,[$rounds,#11]
+ orr $s1,$s1,$t2,lsl#16
+ ldrb $t1,[$rounds,#10]
+ orr $s1,$s1,$t3,lsl#24
+ ldrb $t2,[$rounds,#9]
+ ldrb $t3,[$rounds,#8]
+ orr $s2,$s2,$t1,lsl#8
+ ldrb $s3,[$rounds,#15]
+ orr $s2,$s2,$t2,lsl#16
+ ldrb $t1,[$rounds,#14]
+ orr $s2,$s2,$t3,lsl#24
+ ldrb $t2,[$rounds,#13]
+ ldrb $t3,[$rounds,#12]
+ orr $s3,$s3,$t1,lsl#8
+ orr $s3,$s3,$t2,lsl#16
+ orr $s3,$s3,$t3,lsl#24
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+#endif
+ bl _armv4_AES_decrypt
+
+ ldr $rounds,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$rounds,#0]
+ str $s1,[$rounds,#4]
+ str $s2,[$rounds,#8]
+ str $s3,[$rounds,#12]
+#else
+ mov $t1,$s0,lsr#24 @ write output in endian-neutral
+ mov $t2,$s0,lsr#16 @ manner...
+ mov $t3,$s0,lsr#8
+ strb $t1,[$rounds,#0]
+ strb $t2,[$rounds,#1]
+ mov $t1,$s1,lsr#24
+ strb $t3,[$rounds,#2]
+ mov $t2,$s1,lsr#16
+ strb $s0,[$rounds,#3]
+ mov $t3,$s1,lsr#8
+ strb $t1,[$rounds,#4]
+ strb $t2,[$rounds,#5]
+ mov $t1,$s2,lsr#24
+ strb $t3,[$rounds,#6]
+ mov $t2,$s2,lsr#16
+ strb $s1,[$rounds,#7]
+ mov $t3,$s2,lsr#8
+ strb $t1,[$rounds,#8]
+ strb $t2,[$rounds,#9]
+ mov $t1,$s3,lsr#24
+ strb $t3,[$rounds,#10]
+ mov $t2,$s3,lsr#16
+ strb $s2,[$rounds,#11]
+ mov $t3,$s3,lsr#8
+ strb $t1,[$rounds,#12]
+ strb $t2,[$rounds,#13]
+ strb $t3,[$rounds,#14]
+ strb $s3,[$rounds,#15]
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
+ ldmia sp!,{r4-r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size AES_decrypt,.-AES_decrypt
+
+.type _armv4_AES_decrypt,%function
+.align 2
+_armv4_AES_decrypt:
+ str lr,[sp,#-4]! @ push lr
+ ldmia $key!,{$t1-$i1}
+ eor $s0,$s0,$t1
+ ldr $rounds,[$key,#240-16]
+ eor $s1,$s1,$t2
+ eor $s2,$s2,$t3
+ eor $s3,$s3,$i1
+ sub $rounds,$rounds,#1
+ mov lr,#255
+
+ and $i1,lr,$s0,lsr#16
+ and $i2,lr,$s0,lsr#8
+ and $i3,lr,$s0
+ mov $s0,$s0,lsr#24
+.Ldec_loop:
+ ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16]
+ and $i1,lr,$s1 @ i0
+ ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
+ and $i2,lr,$s1,lsr#16
+ ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
+ and $i3,lr,$s1,lsr#8
+ ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
+ mov $s1,$s1,lsr#24
+
+ ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0]
+ ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16]
+ ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8]
+ eor $s0,$s0,$i1,ror#24
+ ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
+ and $i1,lr,$s2,lsr#8 @ i0
+ eor $t2,$i2,$t2,ror#8
+ and $i2,lr,$s2 @ i1
+ eor $t3,$i3,$t3,ror#8
+ and $i3,lr,$s2,lsr#16
+ ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
+ eor $s1,$s1,$t1,ror#8
+ ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
+ mov $s2,$s2,lsr#24
+
+ ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
+ eor $s0,$s0,$i1,ror#16
+ ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
+ and $i1,lr,$s3,lsr#16 @ i0
+ eor $s1,$s1,$i2,ror#24
+ and $i2,lr,$s3,lsr#8 @ i1
+ eor $t3,$i3,$t3,ror#8
+ and $i3,lr,$s3 @ i2
+ ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
+ eor $s2,$s2,$t2,ror#8
+ ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
+ mov $s3,$s3,lsr#24
+
+ ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
+ eor $s0,$s0,$i1,ror#8
+ ldr $i1,[$key],#16
+ eor $s1,$s1,$i2,ror#16
+ ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
+ eor $s2,$s2,$i3,ror#24
+
+ ldr $t1,[$key,#-12]
+ eor $s0,$s0,$i1
+ ldr $t2,[$key,#-8]
+ eor $s3,$s3,$t3,ror#8
+ ldr $t3,[$key,#-4]
+ and $i1,lr,$s0,lsr#16
+ eor $s1,$s1,$t1
+ and $i2,lr,$s0,lsr#8
+ eor $s2,$s2,$t2
+ and $i3,lr,$s0
+ eor $s3,$s3,$t3
+ mov $s0,$s0,lsr#24
+
+ subs $rounds,$rounds,#1
+ bne .Ldec_loop
+
+ add $tbl,$tbl,#1024
+
+ ldr $t2,[$tbl,#0] @ prefetch Td4
+ ldr $t3,[$tbl,#32]
+ ldr $t1,[$tbl,#64]
+ ldr $t2,[$tbl,#96]
+ ldr $t3,[$tbl,#128]
+ ldr $t1,[$tbl,#160]
+ ldr $t2,[$tbl,#192]
+ ldr $t3,[$tbl,#224]
+
+ ldrb $s0,[$tbl,$s0] @ Td4[s0>>24]
+ ldrb $t1,[$tbl,$i1] @ Td4[s0>>16]
+ and $i1,lr,$s1 @ i0
+ ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
+ and $i2,lr,$s1,lsr#16
+ ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
+ and $i3,lr,$s1,lsr#8
+
+ add $s1,$tbl,$s1,lsr#24
+ ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
+ ldrb $s1,[$s1] @ Td4[s1>>24]
+ ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
+ eor $s0,$i1,$s0,lsl#24
+ ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
+ eor $s1,$t1,$s1,lsl#8
+ and $i1,lr,$s2,lsr#8 @ i0
+ eor $t2,$t2,$i2,lsl#8
+ and $i2,lr,$s2 @ i1
+ ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
+ eor $t3,$t3,$i3,lsl#8
+ ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
+ and $i3,lr,$s2,lsr#16
+
+ add $s2,$tbl,$s2,lsr#24
+ ldrb $s2,[$s2] @ Td4[s2>>24]
+ eor $s0,$s0,$i1,lsl#8
+ ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
+ eor $s1,$i2,$s1,lsl#16
+ and $i1,lr,$s3,lsr#16 @ i0
+ eor $s2,$t2,$s2,lsl#16
+ and $i2,lr,$s3,lsr#8 @ i1
+ ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
+ eor $t3,$t3,$i3,lsl#16
+ ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
+ and $i3,lr,$s3 @ i2
+
+ add $s3,$tbl,$s3,lsr#24
+ ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
+ ldrb $s3,[$s3] @ Td4[s3>>24]
+ eor $s0,$s0,$i1,lsl#16
+ ldr $i1,[$key,#0]
+ eor $s1,$s1,$i2,lsl#8
+ ldr $t1,[$key,#4]
+ eor $s2,$i3,$s2,lsl#8
+ ldr $t2,[$key,#8]
+ eor $s3,$t3,$s3,lsl#24
+ ldr $t3,[$key,#12]
+
+ eor $s0,$s0,$i1
+ eor $s1,$s1,$t1
+ eor $s2,$s2,$t2
+ eor $s3,$s3,$t3
+
+ sub $tbl,$tbl,#1024
+ ldr pc,[sp],#4 @ pop and return
+.size _armv4_AES_decrypt,.-_armv4_AES_decrypt
+.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx\tlr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+print $code;
+close STDOUT; # enforce flush
diff --git a/openssl-1.1.0h/crypto/aes/asm/aes-c64xplus.pl b/openssl-1.1.0h/crypto/aes/asm/aes-c64xplus.pl
new file mode 100644
index 0000000..19d2cc1
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aes-c64xplus.pl
@@ -0,0 +1,1382 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [Endian-neutral] AES for C64x+.
+#
+# Even though SPLOOPs are scheduled for 13 cycles, and thus expected
+# performance is ~8.5 cycles per byte processed with 128-bit key,
+# measured performance turned to be ~10 cycles per byte. Discrepancy
+# must be caused by limitations of L1D memory banking(*), see SPRU871
+# TI publication for further details. If any consolation it's still
+# ~20% faster than TI's linear assembly module anyway... Compared to
+# aes_core.c compiled with cl6x 6.0 with -mv6400+ -o2 options this
+# code is 3.75x faster and almost 3x smaller (tables included).
+#
+# (*) This means that there might be subtle correlation between data
+# and timing and one can wonder if it can be ... attacked:-(
+# On the other hand this also means that *if* one chooses to
+# implement *4* T-tables variant [instead of 1 T-table as in
+# this implementation, or in addition to], then one ought to
+# *interleave* them. Even though it complicates addressing,
+# references to interleaved tables would be guaranteed not to
+# clash. I reckon that it should be possible to break 8 cycles
+# per byte "barrier," i.e. improve by ~20%, naturally at the
+# cost of 8x increased pressure on L1D. 8x because you'd have
+# to interleave both Te and Td tables...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($TEA,$TEB)=("A5","B5");
+($KPA,$KPB)=("A3","B1");
+@K=("A6","B6","A7","B7");
+@s=("A8","B8","A9","B9");
+@Te0=@Td0=("A16","B16","A17","B17");
+@Te1=@Td1=("A18","B18","A19","B19");
+@Te2=@Td2=("A20","B20","A21","B21");
+@Te3=@Td3=("A22","B22","A23","B23");
+
+$code=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .nocmp
+ .asg AES_encrypt,_AES_encrypt
+ .asg AES_decrypt,_AES_decrypt
+ .asg AES_set_encrypt_key,_AES_set_encrypt_key
+ .asg AES_set_decrypt_key,_AES_set_decrypt_key
+ .asg AES_ctr32_encrypt,_AES_ctr32_encrypt
+ .endif
+
+ .asg B3,RA
+ .asg A4,INP
+ .asg B4,OUT
+ .asg A6,KEY
+ .asg A4,RET
+ .asg B15,SP
+
+ .eval 24,EXT0
+ .eval 16,EXT1
+ .eval 8,EXT2
+ .eval 0,EXT3
+ .eval 8,TBL1
+ .eval 16,TBL2
+ .eval 24,TBL3
+
+ .if .BIG_ENDIAN
+ .eval 24-EXT0,EXT0
+ .eval 24-EXT1,EXT1
+ .eval 24-EXT2,EXT2
+ .eval 24-EXT3,EXT3
+ .eval 32-TBL1,TBL1
+ .eval 32-TBL2,TBL2
+ .eval 32-TBL3,TBL3
+ .endif
+
+ .global _AES_encrypt
+_AES_encrypt:
+ .asmfunc
+ MVK 1,B2
+__encrypt:
+ .if __TI_EABI__
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL \$PCR_OFFSET(AES_Te,__encrypt),$TEA
+|| ADDKPC __encrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH \$PCR_OFFSET(AES_Te,__encrypt),$TEA
+|| ADD 0,KEY,$KPA
+|| ADD 4,KEY,$KPB
+ .else
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL (AES_Te-__encrypt),$TEA
+|| ADDKPC __encrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH (AES_Te-__encrypt),$TEA
+|| ADD 0,KEY,$KPA
+|| ADD 4,KEY,$KPB
+ .endif
+ LDW *$KPA++[2],$Te0[0] ; zero round key
+|| LDW *$KPB++[2],$Te0[1]
+|| MVK 60,A0
+|| ADD B0,$TEA,$TEA ; AES_Te
+ LDW *KEY[A0],B0 ; rounds
+|| MVK 1024,A0 ; sizeof(AES_Te)
+ LDW *$KPA++[2],$Te0[2]
+|| LDW *$KPB++[2],$Te0[3]
+|| MV $TEA,$TEB
+ NOP
+ .if .BIG_ENDIAN
+ MV A9,$s[0]
+|| MV A8,$s[1]
+|| MV B9,$s[2]
+|| MV B8,$s[3]
+ .else
+ MV A8,$s[0]
+|| MV A9,$s[1]
+|| MV B8,$s[2]
+|| MV B9,$s[3]
+ .endif
+ XOR $Te0[0],$s[0],$s[0]
+|| XOR $Te0[1],$s[1],$s[1]
+|| LDW *$KPA++[2],$K[0] ; 1st round key
+|| LDW *$KPB++[2],$K[1]
+ SUB B0,2,B0
+
+ SPLOOPD 13
+|| MVC B0,ILC
+|| LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+;;====================================================================
+ EXTU $s[1],EXT1,24,$Te1[1]
+|| EXTU $s[0],EXT3,24,$Te3[0]
+ LDW *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0
+|| LDW *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1
+|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[1],EXT3,24,$Te3[1]
+|| EXTU $s[0],EXT1,24,$Te1[0]
+ LDW *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2
+|| LDW *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3
+|| EXTU $s[2],EXT2,24,$Te2[2]
+|| EXTU $s[3],EXT2,24,$Te2[3]
+ LDW *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0
+|| LDW *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1
+|| EXTU $s[3],EXT3,24,$Te3[3]
+|| EXTU $s[2],EXT1,24,$Te1[2]
+ LDW *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0
+|| LDW *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1
+|| EXTU $s[0],EXT2,24,$Te2[0]
+|| EXTU $s[1],EXT2,24,$Te2[1]
+ LDW *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2
+|| LDW *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3
+|| EXTU $s[3],EXT1,24,$Te1[3]
+|| EXTU $s[2],EXT3,24,$Te3[2]
+ LDW *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2
+|| LDW *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3
+|| ROTL $Te1[1],TBL1,$Te3[0] ; t0
+|| ROTL $Te3[0],TBL3,$Te1[1] ; t1
+|| EXTU $s[0],EXT0,24,$Te0[0]
+|| EXTU $s[1],EXT0,24,$Te0[1]
+ LDW *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0
+|| LDW *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1
+|| ROTL $Te3[1],TBL3,$Te1[0] ; t2
+|| ROTL $Te1[0],TBL1,$Te3[1] ; t3
+|| EXTU $s[2],EXT0,24,$Te0[2]
+|| EXTU $s[3],EXT0,24,$Te0[3]
+ LDW *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2
+|| LDW *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3
+|| ROTL $Te2[2],TBL2,$Te2[2] ; t0
+|| ROTL $Te2[3],TBL2,$Te2[3] ; t1
+|| XOR $K[0],$Te3[0],$s[0]
+|| XOR $K[1],$Te1[1],$s[1]
+ ROTL $Te3[3],TBL3,$Te1[2] ; t0
+|| ROTL $Te1[2],TBL1,$Te3[3] ; t1
+|| XOR $K[2],$Te1[0],$s[2]
+|| XOR $K[3],$Te3[1],$s[3]
+|| LDW *$KPA++[2],$K[0] ; next round key
+|| LDW *$KPB++[2],$K[1]
+ ROTL $Te2[0],TBL2,$Te2[0] ; t2
+|| ROTL $Te2[1],TBL2,$Te2[1] ; t3
+|| XOR $s[0],$Te2[2],$s[0]
+|| XOR $s[1],$Te2[3],$s[1]
+|| LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+ ROTL $Te1[3],TBL1,$Te3[2] ; t2
+|| ROTL $Te3[2],TBL3,$Te1[3] ; t3
+|| XOR $s[0],$Te1[2],$s[0]
+|| XOR $s[1],$Te3[3],$s[1]
+ XOR $s[2],$Te2[0],$s[2]
+|| XOR $s[3],$Te2[1],$s[3]
+|| XOR $s[0],$Te0[0],$s[0]
+|| XOR $s[1],$Te0[1],$s[1]
+ SPKERNEL
+|| XOR.L $s[2],$Te3[2],$s[2]
+|| XOR.L $s[3],$Te1[3],$s[3]
+;;====================================================================
+ ADD.D ${TEA},A0,${TEA} ; point to Te4
+|| ADD.D ${TEB},A0,${TEB}
+|| EXTU $s[1],EXT1,24,$Te1[1]
+|| EXTU $s[0],EXT3,24,$Te3[0]
+ LDBU *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0
+|| LDBU *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1
+|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[0],EXT0,24,$Te0[0]
+|| EXTU $s[1],EXT0,24,$Te0[1]
+ LDBU *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0
+|| LDBU *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1
+|| EXTU $s[3],EXT3,24,$Te3[3]
+|| EXTU $s[2],EXT1,24,$Te1[2]
+ LDBU *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0
+|| LDBU *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1
+|| EXTU $s[2],EXT2,24,$Te2[2]
+|| EXTU $s[3],EXT2,24,$Te2[3]
+ LDBU *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0
+|| LDBU *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1
+|| EXTU $s[1],EXT3,24,$Te3[1]
+|| EXTU $s[0],EXT1,24,$Te1[0]
+ LDBU *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2
+|| LDBU *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3
+|| EXTU $s[3],EXT1,24,$Te1[3]
+|| EXTU $s[2],EXT3,24,$Te3[2]
+ LDBU *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2
+|| LDBU *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3
+|| EXTU $s[2],EXT0,24,$Te0[2]
+|| EXTU $s[3],EXT0,24,$Te0[3]
+ LDBU *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2
+|| LDBU *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3
+|| EXTU $s[0],EXT2,24,$Te2[0]
+|| EXTU $s[1],EXT2,24,$Te2[1]
+ LDBU *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2
+|| LDBU *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3
+
+ .if .BIG_ENDIAN
+ PACK2 $Te0[0],$Te1[1],$Te0[0]
+|| PACK2 $Te0[1],$Te1[2],$Te0[1]
+ PACK2 $Te2[2],$Te3[3],$Te2[2]
+|| PACK2 $Te2[3],$Te3[0],$Te2[3]
+ PACKL4 $Te0[0],$Te2[2],$Te0[0]
+|| PACKL4 $Te0[1],$Te2[3],$Te0[1]
+ XOR $K[0],$Te0[0],$Te0[0] ; s[0]
+|| XOR $K[1],$Te0[1],$Te0[1] ; s[1]
+
+ PACK2 $Te0[2],$Te1[3],$Te0[2]
+|| PACK2 $Te0[3],$Te1[0],$Te0[3]
+ PACK2 $Te2[0],$Te3[1],$Te2[0]
+|| PACK2 $Te2[1],$Te3[2],$Te2[1]
+|| BNOP RA
+ PACKL4 $Te0[2],$Te2[0],$Te0[2]
+|| PACKL4 $Te0[3],$Te2[1],$Te0[3]
+ XOR $K[2],$Te0[2],$Te0[2] ; s[2]
+|| XOR $K[3],$Te0[3],$Te0[3] ; s[3]
+
+ MV $Te0[0],A9
+|| MV $Te0[1],A8
+ MV $Te0[2],B9
+|| MV $Te0[3],B8
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .else
+ PACK2 $Te1[1],$Te0[0],$Te1[1]
+|| PACK2 $Te1[2],$Te0[1],$Te1[2]
+ PACK2 $Te3[3],$Te2[2],$Te3[3]
+|| PACK2 $Te3[0],$Te2[3],$Te3[0]
+ PACKL4 $Te3[3],$Te1[1],$Te1[1]
+|| PACKL4 $Te3[0],$Te1[2],$Te1[2]
+ XOR $K[0],$Te1[1],$Te1[1] ; s[0]
+|| XOR $K[1],$Te1[2],$Te1[2] ; s[1]
+
+ PACK2 $Te1[3],$Te0[2],$Te1[3]
+|| PACK2 $Te1[0],$Te0[3],$Te1[0]
+ PACK2 $Te3[1],$Te2[0],$Te3[1]
+|| PACK2 $Te3[2],$Te2[1],$Te3[2]
+|| BNOP RA
+ PACKL4 $Te3[1],$Te1[3],$Te1[3]
+|| PACKL4 $Te3[2],$Te1[0],$Te1[0]
+ XOR $K[2],$Te1[3],$Te1[3] ; s[2]
+|| XOR $K[3],$Te1[0],$Te1[0] ; s[3]
+
+ MV $Te1[1],A8
+|| MV $Te1[2],A9
+ MV $Te1[3],B8
+|| MV $Te1[0],B9
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .endif
+ .endasmfunc
+
+ .global _AES_decrypt
+_AES_decrypt:
+ .asmfunc
+ MVK 1,B2
+__decrypt:
+ .if __TI_EABI__
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+|| ADDKPC __decrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH \$PCR_OFFSET(AES_Td,__decrypt),$TEA
+|| ADD 0,KEY,$KPA
+|| ADD 4,KEY,$KPB
+ .else
+ [B2] LDNDW *INP++,A9:A8 ; load input
+|| MVKL (AES_Td-__decrypt),$TEA
+|| ADDKPC __decrypt,B0
+ [B2] LDNDW *INP++,B9:B8
+|| MVKH (AES_Td-__decrypt),$TEA
+|| ADD 0,KEY,$KPA
+|| ADD 4,KEY,$KPB
+ .endif
+ LDW *$KPA++[2],$Td0[0] ; zero round key
+|| LDW *$KPB++[2],$Td0[1]
+|| MVK 60,A0
+|| ADD B0,$TEA,$TEA ; AES_Td
+ LDW *KEY[A0],B0 ; rounds
+|| MVK 1024,A0 ; sizeof(AES_Td)
+ LDW *$KPA++[2],$Td0[2]
+|| LDW *$KPB++[2],$Td0[3]
+|| MV $TEA,$TEB
+ NOP
+ .if .BIG_ENDIAN
+ MV A9,$s[0]
+|| MV A8,$s[1]
+|| MV B9,$s[2]
+|| MV B8,$s[3]
+ .else
+ MV A8,$s[0]
+|| MV A9,$s[1]
+|| MV B8,$s[2]
+|| MV B9,$s[3]
+ .endif
+ XOR $Td0[0],$s[0],$s[0]
+|| XOR $Td0[1],$s[1],$s[1]
+|| LDW *$KPA++[2],$K[0] ; 1st round key
+|| LDW *$KPB++[2],$K[1]
+ SUB B0,2,B0
+
+ SPLOOPD 13
+|| MVC B0,ILC
+|| LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+;;====================================================================
+ EXTU $s[1],EXT3,24,$Td3[1]
+|| EXTU $s[0],EXT1,24,$Td1[0]
+ LDW *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0
+|| LDW *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1
+|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[1],EXT1,24,$Td1[1]
+|| EXTU $s[0],EXT3,24,$Td3[0]
+ LDW *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2
+|| LDW *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3
+|| EXTU $s[2],EXT2,24,$Td2[2]
+|| EXTU $s[3],EXT2,24,$Td2[3]
+ LDW *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0
+|| LDW *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1
+|| EXTU $s[3],EXT1,24,$Td1[3]
+|| EXTU $s[2],EXT3,24,$Td3[2]
+ LDW *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0
+|| LDW *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1
+|| EXTU $s[0],EXT2,24,$Td2[0]
+|| EXTU $s[1],EXT2,24,$Td2[1]
+ LDW *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2
+|| LDW *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3
+|| EXTU $s[3],EXT3,24,$Td3[3]
+|| EXTU $s[2],EXT1,24,$Td1[2]
+ LDW *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2
+|| LDW *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3
+|| ROTL $Td3[1],TBL3,$Td1[0] ; t0
+|| ROTL $Td1[0],TBL1,$Td3[1] ; t1
+|| EXTU $s[0],EXT0,24,$Td0[0]
+|| EXTU $s[1],EXT0,24,$Td0[1]
+ LDW *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0
+|| LDW *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1
+|| ROTL $Td1[1],TBL1,$Td3[0] ; t2
+|| ROTL $Td3[0],TBL3,$Td1[1] ; t3
+|| EXTU $s[2],EXT0,24,$Td0[2]
+|| EXTU $s[3],EXT0,24,$Td0[3]
+ LDW *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2
+|| LDW *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3
+|| ROTL $Td2[2],TBL2,$Td2[2] ; t0
+|| ROTL $Td2[3],TBL2,$Td2[3] ; t1
+|| XOR $K[0],$Td1[0],$s[0]
+|| XOR $K[1],$Td3[1],$s[1]
+ ROTL $Td1[3],TBL1,$Td3[2] ; t0
+|| ROTL $Td3[2],TBL3,$Td1[3] ; t1
+|| XOR $K[2],$Td3[0],$s[2]
+|| XOR $K[3],$Td1[1],$s[3]
+|| LDW *$KPA++[2],$K[0] ; next round key
+|| LDW *$KPB++[2],$K[1]
+ ROTL $Td2[0],TBL2,$Td2[0] ; t2
+|| ROTL $Td2[1],TBL2,$Td2[1] ; t3
+|| XOR $s[0],$Td2[2],$s[0]
+|| XOR $s[1],$Td2[3],$s[1]
+|| LDW *$KPA++[2],$K[2]
+|| LDW *$KPB++[2],$K[3]
+ ROTL $Td3[3],TBL3,$Td1[2] ; t2
+|| ROTL $Td1[2],TBL1,$Td3[3] ; t3
+|| XOR $s[0],$Td3[2],$s[0]
+|| XOR $s[1],$Td1[3],$s[1]
+ XOR $s[2],$Td2[0],$s[2]
+|| XOR $s[3],$Td2[1],$s[3]
+|| XOR $s[0],$Td0[0],$s[0]
+|| XOR $s[1],$Td0[1],$s[1]
+ SPKERNEL
+|| XOR.L $s[2],$Td1[2],$s[2]
+|| XOR.L $s[3],$Td3[3],$s[3]
+;;====================================================================
+ ADD.D ${TEA},A0,${TEA} ; point to Td4
+|| ADD.D ${TEB},A0,${TEB}
+|| EXTU $s[1],EXT3,24,$Td3[1]
+|| EXTU $s[0],EXT1,24,$Td1[0]
+ LDBU *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0
+|| LDBU *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1
+|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled
+|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled
+|| EXTU $s[0],EXT0,24,$Td0[0]
+|| EXTU $s[1],EXT0,24,$Td0[1]
+ LDBU *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0
+|| LDBU *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1
+|| EXTU $s[2],EXT2,24,$Td2[2]
+|| EXTU $s[3],EXT2,24,$Td2[3]
+ LDBU *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0
+|| LDBU *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1
+|| EXTU $s[3],EXT1,24,$Td1[3]
+|| EXTU $s[2],EXT3,24,$Td3[2]
+ LDBU *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0
+|| LDBU *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1
+|| EXTU $s[1],EXT1,24,$Td1[1]
+|| EXTU $s[0],EXT3,24,$Td3[0]
+ LDBU *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2
+|| LDBU *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3
+|| EXTU $s[0],EXT2,24,$Td2[0]
+|| EXTU $s[1],EXT2,24,$Td2[1]
+ LDBU *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2
+|| LDBU *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3
+|| EXTU $s[3],EXT3,24,$Td3[3]
+|| EXTU $s[2],EXT1,24,$Td1[2]
+ LDBU *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2
+|| LDBU *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3
+|| EXTU $s[2],EXT0,24,$Td0[2]
+|| EXTU $s[3],EXT0,24,$Td0[3]
+ LDBU *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2
+|| LDBU *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3
+
+ .if .BIG_ENDIAN
+ PACK2 $Td0[0],$Td1[3],$Td0[0]
+|| PACK2 $Td0[1],$Td1[0],$Td0[1]
+ PACK2 $Td2[2],$Td3[1],$Td2[2]
+|| PACK2 $Td2[3],$Td3[2],$Td2[3]
+ PACKL4 $Td0[0],$Td2[2],$Td0[0]
+|| PACKL4 $Td0[1],$Td2[3],$Td0[1]
+ XOR $K[0],$Td0[0],$Td0[0] ; s[0]
+|| XOR $K[1],$Td0[1],$Td0[1] ; s[1]
+
+ PACK2 $Td0[2],$Td1[1],$Td0[2]
+|| PACK2 $Td0[3],$Td1[2],$Td0[3]
+ PACK2 $Td2[0],$Td3[3],$Td2[0]
+|| PACK2 $Td2[1],$Td3[0],$Td2[1]
+|| BNOP RA
+ PACKL4 $Td0[2],$Td2[0],$Td0[2]
+|| PACKL4 $Td0[3],$Td2[1],$Td0[3]
+ XOR $K[2],$Td0[2],$Td0[2] ; s[2]
+|| XOR $K[3],$Td0[3],$Td0[3] ; s[3]
+
+ MV $Td0[0],A9
+|| MV $Td0[1],A8
+ MV $Td0[2],B9
+|| MV $Td0[3],B8
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .else
+ PACK2 $Td1[3],$Td0[0],$Td1[3]
+|| PACK2 $Td1[0],$Td0[1],$Td1[0]
+ PACK2 $Td3[1],$Td2[2],$Td3[1]
+|| PACK2 $Td3[2],$Td2[3],$Td3[2]
+ PACKL4 $Td3[1],$Td1[3],$Td1[3]
+|| PACKL4 $Td3[2],$Td1[0],$Td1[0]
+ XOR $K[0],$Td1[3],$Td1[3] ; s[0]
+|| XOR $K[1],$Td1[0],$Td1[0] ; s[1]
+
+ PACK2 $Td1[1],$Td0[2],$Td1[1]
+|| PACK2 $Td1[2],$Td0[3],$Td1[2]
+ PACK2 $Td3[3],$Td2[0],$Td3[3]
+|| PACK2 $Td3[0],$Td2[1],$Td3[0]
+|| BNOP RA
+ PACKL4 $Td3[3],$Td1[1],$Td1[1]
+|| PACKL4 $Td3[0],$Td1[2],$Td1[2]
+ XOR $K[2],$Td1[1],$Td1[1] ; s[2]
+|| XOR $K[3],$Td1[2],$Td1[2] ; s[3]
+
+ MV $Td1[3],A8
+|| MV $Td1[0],A9
+ MV $Td1[1],B8
+|| MV $Td1[2],B9
+|| [B2] STNDW A9:A8,*OUT++
+ [B2] STNDW B9:B8,*OUT++
+ .endif
+ .endasmfunc
+___
+{
+my @K=(@K,@s); # extended key
+my @Te4=map("B$_",(16..19));
+
+my @Kx9=@Te0; # used in AES_set_decrypt_key
+my @KxB=@Te1;
+my @KxD=@Te2;
+my @KxE=@Te3;
+
+$code.=<<___;
+ .asg OUT,BITS
+
+ .global _AES_set_encrypt_key
+_AES_set_encrypt_key:
+__set_encrypt_key:
+ .asmfunc
+ MV INP,A0
+|| SHRU BITS,5,BITS ; 128-192-256 -> 4-6-8
+|| MV KEY,A1
+ [!A0] B RA
+||[!A0] MVK -1,RET
+||[!A0] MVK 1,A1 ; only one B RA
+ [!A1] B RA
+||[!A1] MVK -1,RET
+||[!A1] MVK 0,A0
+|| MVK 0,B0
+|| MVK 0,A1
+ [A0] LDNDW *INP++,A9:A8
+|| [A0] CMPEQ 4,BITS,B0
+|| [A0] CMPLT 3,BITS,A1
+ [B0] B key128?
+|| [A1] LDNDW *INP++,B9:B8
+|| [A0] CMPEQ 6,BITS,B0
+|| [A0] CMPLT 5,BITS,A1
+ [B0] B key192?
+|| [A1] LDNDW *INP++,B17:B16
+|| [A0] CMPEQ 8,BITS,B0
+|| [A0] CMPLT 7,BITS,A1
+ [B0] B key256?
+|| [A1] LDNDW *INP++,B19:B18
+
+ .if __TI_EABI__
+ [A0] ADD 0,KEY,$KPA
+|| [A0] ADD 4,KEY,$KPB
+|| [A0] MVKL \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+|| [A0] ADDKPC __set_encrypt_key,B6
+ [A0] MVKH \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+ [A0] ADD B6,$TEA,$TEA ; AES_Te4
+ .else
+ [A0] ADD 0,KEY,$KPA
+|| [A0] ADD 4,KEY,$KPB
+|| [A0] MVKL (AES_Te4-__set_encrypt_key),$TEA
+|| [A0] ADDKPC __set_encrypt_key,B6
+ [A0] MVKH (AES_Te4-__set_encrypt_key),$TEA
+ [A0] ADD B6,$TEA,$TEA ; AES_Te4
+ .endif
+ NOP
+ NOP
+
+ BNOP RA,5
+|| MVK -2,RET ; unknown bit length
+|| MVK 0,B0 ; redundant
+;;====================================================================
+;;====================================================================
+key128?:
+ .if .BIG_ENDIAN
+ MV A9,$K[0]
+|| MV A8,$K[1]
+|| MV B9,$Te4[2]
+|| MV B8,$K[3]
+ .else
+ MV A8,$K[0]
+|| MV A9,$K[1]
+|| MV B8,$Te4[2]
+|| MV B9,$K[3]
+ .endif
+
+ MVK 256,A0
+|| MVK 9,B0
+
+ SPLOOPD 14
+|| MVC B0,ILC
+|| MV $TEA,$TEB
+|| ADD $TEA,A0,A30 ; rcon
+;;====================================================================
+ LDW *A30++[1],A31 ; rcon[i]
+|| MV $Te4[2],$K[2]
+|| EXTU $K[3],EXT1,24,$Te4[0]
+ LDBU *${TEB}[$Te4[0]],$Te4[0]
+|| MV $K[3],A0
+|| EXTU $K[3],EXT2,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT3,24,A0
+|| EXTU $K[3],EXT0,24,$Te4[3]
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ .endif
+
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+
+ XOR A31,$K[0],$K[0] ; ^=rcon[i]
+ .if .BIG_ENDIAN
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+ .else
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+ .endif
+ XOR $Te4[3],$K[0],$Te4[0] ; K[0]
+ XOR $Te4[0],$K[1],$K[1] ; K[1]
+ MV $Te4[0],$K[0]
+|| XOR $K[1],$K[2],$Te4[2] ; K[2]
+ XOR $Te4[2],$K[3],$K[3] ; K[3]
+ SPKERNEL
+;;====================================================================
+ BNOP RA
+ MV $Te4[2],$K[2]
+|| STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ MVK 10,B0 ; rounds
+ STW B0,*++${KPB}[15]
+ MVK 0,RET
+;;====================================================================
+;;====================================================================
+key192?:
+ .if .BIG_ENDIAN
+ MV A9,$K[0]
+|| MV A8,$K[1]
+|| MV B9,$K[2]
+|| MV B8,$K[3]
+ MV B17,$Te4[2]
+|| MV B16,$K[5]
+ .else
+ MV A8,$K[0]
+|| MV A9,$K[1]
+|| MV B8,$K[2]
+|| MV B9,$K[3]
+ MV B16,$Te4[2]
+|| MV B17,$K[5]
+ .endif
+
+ MVK 256,A0
+|| MVK 6,B0
+ MV $TEA,$TEB
+|| ADD $TEA,A0,A30 ; rcon
+;;====================================================================
+loop192?:
+ LDW *A30++[1],A31 ; rcon[i]
+|| MV $Te4[2],$K[4]
+|| EXTU $K[5],EXT1,24,$Te4[0]
+ LDBU *${TEB}[$Te4[0]],$Te4[0]
+|| MV $K[5],A0
+|| EXTU $K[5],EXT2,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT3,24,A0
+|| EXTU $K[5],EXT0,24,$Te4[3]
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ .endif
+
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ STW $K[4],*$KPA++[2]
+|| STW $K[5],*$KPB++[2]
+
+ XOR A31,$K[0],$K[0] ; ^=rcon[i]
+ .if .BIG_ENDIAN
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+ .else
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+ .endif
+ BDEC loop192?,B0
+|| XOR $Te4[3],$K[0],$Te4[0] ; K[0]
+ XOR $Te4[0],$K[1],$K[1] ; K[1]
+ MV $Te4[0],$K[0]
+|| XOR $K[1],$K[2],$Te4[2] ; K[2]
+ XOR $Te4[2],$K[3],$K[3] ; K[3]
+ MV $Te4[2],$K[2]
+|| XOR $K[3],$K[4],$Te4[2] ; K[4]
+ XOR $Te4[2],$K[5],$K[5] ; K[5]
+;;====================================================================
+ BNOP RA
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ MVK 12,B0 ; rounds
+ STW B0,*++${KPB}[7]
+ MVK 0,RET
+;;====================================================================
+;;====================================================================
+key256?:
+ .if .BIG_ENDIAN
+ MV A9,$K[0]
+|| MV A8,$K[1]
+|| MV B9,$K[2]
+|| MV B8,$K[3]
+ MV B17,$K[4]
+|| MV B16,$K[5]
+|| MV B19,$Te4[2]
+|| MV B18,$K[7]
+ .else
+ MV A8,$K[0]
+|| MV A9,$K[1]
+|| MV B8,$K[2]
+|| MV B9,$K[3]
+ MV B16,$K[4]
+|| MV B17,$K[5]
+|| MV B18,$Te4[2]
+|| MV B19,$K[7]
+ .endif
+
+ MVK 256,A0
+|| MVK 6,B0
+ MV $TEA,$TEB
+|| ADD $TEA,A0,A30 ; rcon
+;;====================================================================
+loop256?:
+ LDW *A30++[1],A31 ; rcon[i]
+|| MV $Te4[2],$K[6]
+|| EXTU $K[7],EXT1,24,$Te4[0]
+ LDBU *${TEB}[$Te4[0]],$Te4[0]
+|| MV $K[7],A0
+|| EXTU $K[7],EXT2,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT3,24,A0
+|| EXTU $K[7],EXT0,24,$Te4[3]
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ .endif
+
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ STW $K[4],*$KPA++[2]
+|| STW $K[5],*$KPB++[2]
+ STW $K[6],*$KPA++[2]
+|| STW $K[7],*$KPB++[2]
+|| XOR A31,$K[0],$K[0] ; ^=rcon[i]
+ .if .BIG_ENDIAN
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+||[!B0] B done256?
+ .else
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+|| PACK2 $Te4[3],A0,$Te4[3]
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+||[!B0] B done256?
+ .endif
+ XOR $Te4[3],$K[0],$Te4[0] ; K[0]
+ XOR $Te4[0],$K[1],$K[1] ; K[1]
+ MV $Te4[0],$K[0]
+|| XOR $K[1],$K[2],$Te4[2] ; K[2]
+ XOR $Te4[2],$K[3],$K[3] ; K[3]
+
+ MV $Te4[2],$K[2]
+|| [B0] EXTU $K[3],EXT0,24,$Te4[0]
+|| [B0] SUB B0,1,B0
+ LDBU *${TEB}[$Te4[0]],$Te4[0]
+|| MV $K[3],A0
+|| EXTU $K[3],EXT1,24,$Te4[1]
+ LDBU *${TEB}[$Te4[1]],$Te4[1]
+|| EXTU A0,EXT2,24,A0
+|| EXTU $K[3],EXT3,24,$Te4[3]
+
+ .if .BIG_ENDIAN
+ LDBU *${TEA}[A0],$Te4[3]
+|| LDBU *${TEB}[$Te4[3]],A0
+ NOP 3
+ PACK2 $Te4[0],$Te4[1],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+|| B loop256?
+ PACKL4 $Te4[1],$Te4[3],$Te4[3]
+ .else
+ LDBU *${TEA}[A0],A0
+|| LDBU *${TEB}[$Te4[3]],$Te4[3]
+ NOP 3
+ PACK2 $Te4[1],$Te4[0],$Te4[1]
+ PACK2 $Te4[3],A0,$Te4[3]
+|| B loop256?
+ PACKL4 $Te4[3],$Te4[1],$Te4[3]
+ .endif
+
+ XOR $Te4[3],$K[4],$Te4[0] ; K[4]
+ XOR $Te4[0],$K[5],$K[5] ; K[5]
+ MV $Te4[0],$K[4]
+|| XOR $K[5],$K[6],$Te4[2] ; K[6]
+ XOR $Te4[2],$K[7],$K[7] ; K[7]
+;;====================================================================
+done256?:
+ BNOP RA
+ STW $K[0],*$KPA++[2]
+|| STW $K[1],*$KPB++[2]
+ STW $K[2],*$KPA++[2]
+|| STW $K[3],*$KPB++[2]
+ MVK 14,B0 ; rounds
+ STW B0,*--${KPB}[1]
+ MVK 0,RET
+ .endasmfunc
+
+ .global _AES_set_decrypt_key
+_AES_set_decrypt_key:
+ .asmfunc
+ B __set_encrypt_key ; guarantee local call
+ MV KEY,B30 ; B30 is not modified
+ MV RA, B31 ; B31 is not modified
+ ADDKPC ret?,RA,2
+ret?: ; B0 holds rounds or zero
+ [!B0] BNOP B31 ; return if zero
+ [B0] SHL B0,4,A0 ; offset to last round key
+ [B0] SHRU B0,1,B1
+ [B0] SUB B1,1,B1
+ [B0] MVK 0x0000001B,B3 ; AES polynomial
+ [B0] MVKH 0x07000000,B3
+
+ SPLOOPD 9 ; flip round keys
+|| MVC B1,ILC
+|| MV B30,$KPA
+|| ADD B30,A0,$KPB
+|| MVK 16,A0 ; sizeof(round key)
+;;====================================================================
+ LDW *${KPA}[0],A16
+|| LDW *${KPB}[0],B16
+ LDW *${KPA}[1],A17
+|| LDW *${KPB}[1],B17
+ LDW *${KPA}[2],A18
+|| LDW *${KPB}[2],B18
+ LDW *${KPA}[3],A19
+|| ADD $KPA,A0,$KPA
+|| LDW *${KPB}[3],B19
+|| SUB $KPB,A0,$KPB
+ NOP
+ STW B16,*${KPA}[-4]
+|| STW A16,*${KPB}[4]
+ STW B17,*${KPA}[-3]
+|| STW A17,*${KPB}[5]
+ STW B18,*${KPA}[-2]
+|| STW A18,*${KPB}[6]
+ STW B19,*${KPA}[-1]
+|| STW A19,*${KPB}[7]
+ SPKERNEL
+;;====================================================================
+ SUB B0,1,B0 ; skip last round
+|| ADD B30,A0,$KPA ; skip first round
+|| ADD B30,A0,$KPB
+|| MVC GFPGFR,B30 ; save GFPGFR
+ LDW *${KPA}[0],$K[0]
+|| LDW *${KPB}[1],$K[1]
+|| MVC B3,GFPGFR
+ LDW *${KPA}[2],$K[2]
+|| LDW *${KPB}[3],$K[3]
+ MVK 0x00000909,A24
+|| MVK 0x00000B0B,B24
+ MVKH 0x09090000,A24
+|| MVKH 0x0B0B0000,B24
+ MVC B0,ILC
+|| SUB B0,1,B0
+
+ GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
+|| GMPY4 $K[1],A24,$Kx9[1]
+|| MVK 0x00000D0D,A25
+|| MVK 0x00000E0E,B25
+ GMPY4 $K[2],A24,$Kx9[2]
+|| GMPY4 $K[3],A24,$Kx9[3]
+|| MVKH 0x0D0D0000,A25
+|| MVKH 0x0E0E0000,B25
+
+ GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
+|| GMPY4 $K[1],B24,$KxB[1]
+ GMPY4 $K[2],B24,$KxB[2]
+|| GMPY4 $K[3],B24,$KxB[3]
+
+ SPLOOP 11 ; InvMixColumns
+;;====================================================================
+ GMPY4 $K[0],A25,$KxD[0] ; ·0x0D
+|| GMPY4 $K[1],A25,$KxD[1]
+|| SWAP2 $Kx9[0],$Kx9[0] ; rotate by 16
+|| SWAP2 $Kx9[1],$Kx9[1]
+|| MV $K[0],$s[0] ; this or DINT
+|| MV $K[1],$s[1]
+|| [B0] LDW *${KPA}[4],$K[0]
+|| [B0] LDW *${KPB}[5],$K[1]
+ GMPY4 $K[2],A25,$KxD[2]
+|| GMPY4 $K[3],A25,$KxD[3]
+|| SWAP2 $Kx9[2],$Kx9[2]
+|| SWAP2 $Kx9[3],$Kx9[3]
+|| MV $K[2],$s[2]
+|| MV $K[3],$s[3]
+|| [B0] LDW *${KPA}[6],$K[2]
+|| [B0] LDW *${KPB}[7],$K[3]
+
+ GMPY4 $s[0],B25,$KxE[0] ; ·0x0E
+|| GMPY4 $s[1],B25,$KxE[1]
+|| XOR $Kx9[0],$KxB[0],$KxB[0]
+|| XOR $Kx9[1],$KxB[1],$KxB[1]
+ GMPY4 $s[2],B25,$KxE[2]
+|| GMPY4 $s[3],B25,$KxE[3]
+|| XOR $Kx9[2],$KxB[2],$KxB[2]
+|| XOR $Kx9[3],$KxB[3],$KxB[3]
+
+ ROTL $KxB[0],TBL3,$KxB[0]
+|| ROTL $KxB[1],TBL3,$KxB[1]
+|| SWAP2 $KxD[0],$KxD[0] ; rotate by 16
+|| SWAP2 $KxD[1],$KxD[1]
+ ROTL $KxB[2],TBL3,$KxB[2]
+|| ROTL $KxB[3],TBL3,$KxB[3]
+|| SWAP2 $KxD[2],$KxD[2]
+|| SWAP2 $KxD[3],$KxD[3]
+
+ XOR $KxE[0],$KxD[0],$KxE[0]
+|| XOR $KxE[1],$KxD[1],$KxE[1]
+|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; ·0x09
+|| [B0] GMPY4 $K[1],A24,$Kx9[1]
+|| ADDAW $KPA,4,$KPA
+ XOR $KxE[2],$KxD[2],$KxE[2]
+|| XOR $KxE[3],$KxD[3],$KxE[3]
+|| [B0] GMPY4 $K[2],A24,$Kx9[2]
+|| [B0] GMPY4 $K[3],A24,$Kx9[3]
+|| ADDAW $KPB,4,$KPB
+
+ XOR $KxB[0],$KxE[0],$KxE[0]
+|| XOR $KxB[1],$KxE[1],$KxE[1]
+|| [B0] GMPY4 $K[0],B24,$KxB[0] ; ·0x0B
+|| [B0] GMPY4 $K[1],B24,$KxB[1]
+ XOR $KxB[2],$KxE[2],$KxE[2]
+|| XOR $KxB[3],$KxE[3],$KxE[3]
+|| [B0] GMPY4 $K[2],B24,$KxB[2]
+|| [B0] GMPY4 $K[3],B24,$KxB[3]
+|| STW $KxE[0],*${KPA}[-4]
+|| STW $KxE[1],*${KPB}[-3]
+ STW $KxE[2],*${KPA}[-2]
+|| STW $KxE[3],*${KPB}[-1]
+|| [B0] SUB B0,1,B0
+ SPKERNEL
+;;====================================================================
+ BNOP B31,3
+ MVC B30,GFPGFR ; restore GFPGFR(*)
+ MVK 0,RET
+ .endasmfunc
+___
+# (*) Even though ABI doesn't specify GFPGFR as non-volatile, there
+# are code samples out there that *assume* its default value.
+}
+{
+my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8");
+$code.=<<___;
+ .global _AES_ctr32_encrypt
+_AES_ctr32_encrypt:
+ .asmfunc
+ LDNDW *${ivp}[0],A31:A30 ; load counter value
+|| MV $blocks,A2 ; reassign $blocks
+|| DMV RA,$key,B27:B26 ; reassign RA and $key
+ LDNDW *${ivp}[1],B31:B30
+|| MVK 0,B2 ; don't let __encrypt load input
+|| MVK 0,A1 ; and postpone writing output
+ .if .BIG_ENDIAN
+ NOP
+ .else
+ NOP 4
+ SWAP2 B31,B31 ; keep least significant 32 bits
+ SWAP4 B31,B31 ; in host byte order
+ .endif
+ctr32_loop?:
+ [A2] BNOP __encrypt
+|| [A1] XOR A29,A9,A9 ; input^Ek(counter)
+|| [A1] XOR A28,A8,A8
+|| [A2] LDNDW *INP++,A29:A28 ; load input
+ [!A2] BNOP B27 ; return
+|| [A1] XOR B29,B9,B9
+|| [A1] XOR B28,B8,B8
+|| [A2] LDNDW *INP++,B29:B28
+ .if .BIG_ENDIAN
+ [A1] STNDW A9:A8,*OUT++ ; save output
+|| [A2] DMV A31,A30,A9:A8 ; pass counter value to __encrypt
+ [A1] STNDW B9:B8,*OUT++
+|| [A2] DMV B31,B30,B9:B8
+|| [A2] ADD B30,1,B30 ; counter++
+ .else
+ [A1] STNDW A9:A8,*OUT++ ; save output
+|| [A2] DMV A31,A30,A9:A8
+|| [A2] SWAP2 B31,B0
+|| [A2] ADD B31,1,B31 ; counter++
+ [A1] STNDW B9:B8,*OUT++
+|| [A2] MV B30,B8
+|| [A2] SWAP4 B0,B9
+ .endif
+ [A2] ADDKPC ctr32_loop?,RA ; return to ctr32_loop?
+|| [A2] MV B26,KEY ; pass $key
+|| [A2] SUB A2,1,A2 ; $blocks--
+||[!A1] MVK 1,A1
+ NOP
+ NOP
+ .endasmfunc
+___
+}
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+ .if __TI_EABI__
+ .sect ".text:aes_asm.const"
+ .else
+ .sect ".const:aes_asm"
+ .endif
+ .align 128
+AES_Te:
+ .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84
+ .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
+ .byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
+ .byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
+ .byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
+ .byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
+ .byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
+ .byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
+ .byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
+ .byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
+ .byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
+ .byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
+ .byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
+ .byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
+ .byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
+ .byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
+ .byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
+ .byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
+ .byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
+ .byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
+ .byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
+ .byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
+ .byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
+ .byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
+ .byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
+ .byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
+ .byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
+ .byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
+ .byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
+ .byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
+ .byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
+ .byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
+ .byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
+ .byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
+ .byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
+ .byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
+ .byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
+ .byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
+ .byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
+ .byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
+ .byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
+ .byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
+ .byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
+ .byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
+ .byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
+ .byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
+ .byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
+ .byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
+ .byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
+ .byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
+ .byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
+ .byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
+ .byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
+ .byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
+ .byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
+ .byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
+ .byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
+ .byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
+ .byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
+ .byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
+ .byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
+ .byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
+ .byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
+ .byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
+ .byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
+ .byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
+ .byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
+ .byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
+ .byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
+ .byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
+ .byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
+ .byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
+ .byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
+ .byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
+ .byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
+ .byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
+ .byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
+ .byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
+ .byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
+ .byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
+ .byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
+ .byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
+ .byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
+ .byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
+ .byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
+ .byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
+ .byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
+ .byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
+ .byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
+ .byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
+ .byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
+ .byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
+ .byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
+ .byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
+ .byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
+ .byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
+ .byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
+ .byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
+ .byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
+ .byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
+ .byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
+ .byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
+ .byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
+ .byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
+ .byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
+ .byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
+ .byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
+ .byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
+ .byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
+ .byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
+ .byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
+ .byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
+ .byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
+ .byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
+ .byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
+ .byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
+ .byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
+ .byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
+ .byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
+ .byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
+ .byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
+ .byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
+ .byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
+ .byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
+ .byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
+ .byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
+ .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
+ .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
+AES_Te4:
+ .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+ .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+ .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+ .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+ .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+ .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+ .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+ .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+ .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+ .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+ .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+ .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+ .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+ .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+ .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+ .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+ .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+ .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+ .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+ .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+ .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+ .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+ .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+ .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+ .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+ .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+ .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+ .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+ .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+ .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+ .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+ .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+rcon:
+ .byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00
+ .byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
+ .byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
+ .byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
+ .byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
+ .align 128
+AES_Td:
+ .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53
+ .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
+ .byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
+ .byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
+ .byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
+ .byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
+ .byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
+ .byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
+ .byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
+ .byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
+ .byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
+ .byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
+ .byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
+ .byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
+ .byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
+ .byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
+ .byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
+ .byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
+ .byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
+ .byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
+ .byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
+ .byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
+ .byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
+ .byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
+ .byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
+ .byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
+ .byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
+ .byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
+ .byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
+ .byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
+ .byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
+ .byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
+ .byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
+ .byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
+ .byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
+ .byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
+ .byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
+ .byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
+ .byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
+ .byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
+ .byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
+ .byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
+ .byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
+ .byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
+ .byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
+ .byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
+ .byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
+ .byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
+ .byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
+ .byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
+ .byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
+ .byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
+ .byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
+ .byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
+ .byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
+ .byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
+ .byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
+ .byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
+ .byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
+ .byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
+ .byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
+ .byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
+ .byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
+ .byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
+ .byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
+ .byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
+ .byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
+ .byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
+ .byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
+ .byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
+ .byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
+ .byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
+ .byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
+ .byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
+ .byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
+ .byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
+ .byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
+ .byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
+ .byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
+ .byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
+ .byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
+ .byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
+ .byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
+ .byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
+ .byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
+ .byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
+ .byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
+ .byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
+ .byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
+ .byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
+ .byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
+ .byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
+ .byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
+ .byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
+ .byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
+ .byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
+ .byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
+ .byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
+ .byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
+ .byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
+ .byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
+ .byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
+ .byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
+ .byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
+ .byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
+ .byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
+ .byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
+ .byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
+ .byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
+ .byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
+ .byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
+ .byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
+ .byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
+ .byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
+ .byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
+ .byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
+ .byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
+ .byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
+ .byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
+ .byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
+ .byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
+ .byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
+ .byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
+ .byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
+ .byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
+ .byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
+ .byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
+ .byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
+AES_Td4:
+ .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+ .cstring "AES for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aes-ia64.S b/openssl-1.1.0h/crypto/aes/asm/aes-ia64.S
new file mode 100644
index 0000000..f7f1f63
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aes-ia64.S
@@ -0,0 +1,1130 @@
+// Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// ====================================================================
+// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+// project. Rights for redistribution and usage in source and binary
+// forms are granted according to the OpenSSL license.
+// ====================================================================
+//
+// What's wrong with compiler generated code? Compiler never uses
+// variable 'shr' which is pairable with 'extr'/'dep' instructions.
+// Then it uses 'zxt' which is an I-type, but can be replaced with
+// 'and' which in turn can be assigned to M-port [there're double as
+// much M-ports as there're I-ports on Itanium 2]. By sacrificing few
+// registers for small constants (255, 24 and 16) to be used with
+// 'shr' and 'and' instructions I can achieve better ILP, Instruction
+// Level Parallelism, and performance. This code outperforms GCC 3.3
+// generated code by over factor of 2 (two), GCC 3.4 - by 70% and
+// HP C - by 40%. Measured best-case scenario, i.e. aligned
+// big-endian input, ECB timing on Itanium 2 is (18 + 13*rounds)
+// ticks per block, or 9.25 CPU cycles per byte for 128 bit key.
+
+// Version 1.2 mitigates the hazard of cache-timing attacks by
+// a) compressing S-boxes from 8KB to 2KB+256B, b) scheduling
+// references to S-boxes for L2 cache latency, c) prefetching T[ed]4
+// prior last round. As result performance dropped to (26 + 15*rounds)
+// ticks per block or 11 cycles per byte processed with 128-bit key.
+// This is ~16% deterioration. For reference Itanium 2 L1 cache has
+// 64 bytes line size and L2 - 128 bytes...
+
+.ident "aes-ia64.S, version 1.2"
+.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
+.explicit
+.text
+
+rk0=r8; rk1=r9;
+
+pfssave=r2;
+lcsave=r10;
+prsave=r3;
+maskff=r11;
+twenty4=r14;
+sixteen=r15;
+
+te00=r16; te11=r17; te22=r18; te33=r19;
+te01=r20; te12=r21; te23=r22; te30=r23;
+te02=r24; te13=r25; te20=r26; te31=r27;
+te03=r28; te10=r29; te21=r30; te32=r31;
+
+// these are rotating...
+t0=r32; s0=r33;
+t1=r34; s1=r35;
+t2=r36; s2=r37;
+t3=r38; s3=r39;
+
+te0=r40; te1=r41; te2=r42; te3=r43;
+
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+# define ADDP addp4
+#else
+# define ADDP add
+#endif
+
+// Offsets from Te0
+#define TE0 0
+#define TE2 2
+#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
+#define TE1 3
+#define TE3 1
+#else
+#define TE1 1
+#define TE3 3
+#endif
+
+// This implies that AES_KEY comprises 32-bit key schedule elements
+// even on LP64 platforms.
+#ifndef KSZ
+# define KSZ 4
+# define LDKEY ld4
+#endif
+
+.proc _ia64_AES_encrypt#
+// Input: rk0-rk1
+// te0
+// te3 as AES_KEY->rounds!!!
+// s0-s3
+// maskff,twenty4,sixteen
+// Output: r16,r20,r24,r28 as s0-s3
+// Clobber: r16-r31,rk0-rk1,r32-r43
+.align 32
+_ia64_AES_encrypt:
+ .prologue
+ .altrp b6
+ .body
+{ .mmi; alloc r16=ar.pfs,12,0,0,8
+ LDKEY t0=[rk0],2*KSZ
+ mov pr.rot=1<<16 }
+{ .mmi; LDKEY t1=[rk1],2*KSZ
+ add te1=TE1,te0
+ add te3=-3,te3 };;
+{ .mib; LDKEY t2=[rk0],2*KSZ
+ mov ar.ec=2 }
+{ .mib; LDKEY t3=[rk1],2*KSZ
+ add te2=TE2,te0
+ brp.loop.imp .Le_top,.Le_end-16 };;
+
+{ .mmi; xor s0=s0,t0
+ xor s1=s1,t1
+ mov ar.lc=te3 }
+{ .mmi; xor s2=s2,t2
+ xor s3=s3,t3
+ add te3=TE3,te0 };;
+
+.align 32
+.Le_top:
+{ .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
+ (p0) and te33=s3,maskff // 0/0:s3&0xff
+ (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
+{ .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
+ (p0) and te30=s0,maskff // 0/1:s0&0xff
+ (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24
+{ .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
+ (p0) shladd te33=te33,3,te3 // 1/0:te0+s0>>24
+ (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
+{ .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
+ (p0) shladd te30=te30,3,te3 // 1/1:te3+s0
+ (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24
+{ .mmi; (p0) ld4 te33=[te33] // 2/0:te3[s3&0xff]
+ (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff
+ (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
+{ .mmi; (p0) ld4 te30=[te30] // 2/1:te3[s0]
+ (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8
+ (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24
+{ .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8]
+ (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8
+ (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
+{ .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8]
+ (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24
+ (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24
+{ .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8]
+ (p0) shladd te21=te21,3,te2 // 4/3:te3+s2
+ (p0) extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff
+{ .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24]
+ (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24
+ (p0) shr.u te13=s3,sixteen };; // 4/2:s3>>16
+{ .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8]
+ (p0) shladd te11=te11,3,te1 // 5/0:te1+s1>>16
+ (p0) extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff
+{ .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24]
+ (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24
+ (p0) and te31=s1,maskff };; // 5/2:s1&0xff
+{ .mmi; (p0) ld4 te11=[te11] // 6/0:te1[s1>>16]
+ (p0) shladd te12=te12,3,te1 // 6/1:te1+s2>>16
+ (p0) extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff
+{ .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24]
+ (p0) shladd te03=te03,3,te0 // 6/3:te1+s0>>16
+ (p0) and te32=s2,maskff };; // 6/3:s2&0xff
+
+{ .mmi; (p0) ld4 te12=[te12] // 7/1:te1[s2>>16]
+ (p0) shladd te31=te31,3,te3 // 7/2:te3+s1&0xff
+ (p0) and te13=te13,maskff} // 7/2:s3>>16&0xff
+{ .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24]
+ (p0) shladd te32=te32,3,te3 // 7/3:te3+s2
+ (p0) xor t0=t0,te33 };; // 7/0:
+{ .mmi; (p0) ld4 te31=[te31] // 8/2:te3[s1]
+ (p0) shladd te13=te13,3,te1 // 8/2:te1+s3>>16
+ (p0) xor t0=t0,te22 } // 8/0:
+{ .mmi; (p0) ld4 te32=[te32] // 8/3:te3[s2]
+ (p0) shladd te10=te10,3,te1 // 8/3:te1+s0>>16
+ (p0) xor t1=t1,te30 };; // 8/1:
+{ .mmi; (p0) ld4 te13=[te13] // 9/2:te1[s3>>16]
+ (p0) ld4 te10=[te10] // 9/3:te1[s0>>16]
+ (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling
+{ .mmi; (p0) xor t1=t1,te23 // 10[9]/1:
+ (p0) xor t2=t2,te20 // 10[9]/2:
+ (p0) xor t3=t3,te21 };; // 10[9]/3:
+{ .mmi; (p0) xor t0=t0,te11 // 11[10]/0:done!
+ (p0) xor t1=t1,te01 // 11[10]/1:
+ (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling
+{ .mmi; (p0) xor t3=t3,te03 // 12[10]/3:
+ (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17)
+{ .mmi; (p0) xor t1=t1,te12 // 13[11]/1:done!
+ (p0) xor t2=t2,te31 // 13[11]/2:
+ (p0) xor t3=t3,te32 } // 13[11]/3:
+{ .mmi; (p17) add te0=2048,te0 // 13[11]/
+ (p17) add te1=2048+64-TE1,te1};; // 13[11]/
+{ .mib; (p0) xor t2=t2,te13 // 14[12]/2:done!
+ (p17) add te2=2048+128-TE2,te2} // 14[12]/
+{ .mib; (p0) xor t3=t3,te10 // 14[12]/3:done!
+ (p17) add te3=2048+192-TE3,te3 // 14[12]/
+ br.ctop.sptk .Le_top };;
+.Le_end:
+
+
+{ .mmi; ld8 te12=[te0] // prefetch Te4
+ ld8 te31=[te1] }
+{ .mmi; ld8 te10=[te2]
+ ld8 te32=[te3] }
+
+{ .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
+ and te33=s3,maskff // 0/0:s3&0xff
+ extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
+{ .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
+ and te30=s0,maskff // 0/1:s0&0xff
+ shr.u te00=s0,twenty4 };; // 0/0:s0>>24
+{ .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
+ add te33=te33,te0 // 1/0:te0+s0>>24
+ extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
+{ .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
+ add te30=te30,te0 // 1/1:te0+s0
+ shr.u te01=s1,twenty4 };; // 1/1:s1>>24
+{ .mmi; ld1 te33=[te33] // 2/0:te0[s3&0xff]
+ add te22=te22,te0 // 2/0:te0+s2>>8&0xff
+ extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
+{ .mmi; ld1 te30=[te30] // 2/1:te0[s0]
+ add te23=te23,te0 // 2/1:te0+s3>>8
+ shr.u te02=s2,twenty4 };; // 2/2:s2>>24
+{ .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8]
+ add te20=te20,te0 // 3/2:te0+s0>>8
+ extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
+{ .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8]
+ add te00=te00,te0 // 3/0:te0+s0>>24
+ shr.u te03=s3,twenty4 };; // 3/3:s3>>24
+{ .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8]
+ add te21=te21,te0 // 4/3:te0+s2
+ extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff
+{ .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24]
+ add te01=te01,te0 // 4/1:te0+s1>>24
+ shr.u te13=s3,sixteen };; // 4/2:s3>>16
+{ .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8]
+ add te11=te11,te0 // 5/0:te0+s1>>16
+ extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff
+{ .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24]
+ add te02=te02,te0 // 5/2:te0+s2>>24
+ and te31=s1,maskff };; // 5/2:s1&0xff
+{ .mmi; ld1 te11=[te11] // 6/0:te0[s1>>16]
+ add te12=te12,te0 // 6/1:te0+s2>>16
+ extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff
+{ .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24]
+ add te03=te03,te0 // 6/3:te0+s0>>16
+ and te32=s2,maskff };; // 6/3:s2&0xff
+
+{ .mmi; ld1 te12=[te12] // 7/1:te0[s2>>16]
+ add te31=te31,te0 // 7/2:te0+s1&0xff
+ dep te33=te22,te33,8,8} // 7/0:
+{ .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24]
+ add te32=te32,te0 // 7/3:te0+s2
+ and te13=te13,maskff};; // 7/2:s3>>16&0xff
+{ .mmi; ld1 te31=[te31] // 8/2:te0[s1]
+ add te13=te13,te0 // 8/2:te0+s3>>16
+ dep te30=te23,te30,8,8} // 8/1:
+{ .mmi; ld1 te32=[te32] // 8/3:te0[s2]
+ add te10=te10,te0 // 8/3:te0+s0>>16
+ shl te00=te00,twenty4};; // 8/0:
+{ .mii; ld1 te13=[te13] // 9/2:te0[s3>>16]
+ dep te33=te11,te33,16,8 // 9/0:
+ shl te01=te01,twenty4};; // 9/1:
+{ .mii; ld1 te10=[te10] // 10/3:te0[s0>>16]
+ dep te31=te20,te31,8,8 // 10/2:
+ shl te02=te02,twenty4};; // 10/2:
+{ .mii; xor t0=t0,te33 // 11/0:
+ dep te32=te21,te32,8,8 // 11/3:
+ shl te12=te12,sixteen};; // 11/1:
+{ .mii; xor r16=t0,te00 // 12/0:done!
+ dep te31=te13,te31,16,8 // 12/2:
+ shl te03=te03,twenty4};; // 12/3:
+{ .mmi; xor t1=t1,te01 // 13/1:
+ xor t2=t2,te02 // 13/2:
+ dep te32=te10,te32,16,8};; // 13/3:
+{ .mmi; xor t1=t1,te30 // 14/1:
+ xor r24=t2,te31 // 14/2:done!
+ xor t3=t3,te32 };; // 14/3:
+{ .mib; xor r20=t1,te12 // 15/1:done!
+ xor r28=t3,te03 // 15/3:done!
+ br.ret.sptk b6 };;
+.endp _ia64_AES_encrypt#
+
+// void AES_encrypt (const void *in,void *out,const AES_KEY *key);
+.global AES_encrypt#
+.proc AES_encrypt#
+.align 32
+AES_encrypt:
+ .prologue
+ .save ar.pfs,pfssave
+{ .mmi; alloc pfssave=ar.pfs,3,1,12,0
+ and out0=3,in0
+ mov r3=ip }
+{ .mmi; ADDP in0=0,in0
+ mov loc0=psr.um
+ ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds
+
+{ .mmi; ld4 out11=[out11] // AES_KEY->rounds
+ add out8=(AES_Te#-AES_encrypt#),r3 // Te0
+ .save pr,prsave
+ mov prsave=pr }
+{ .mmi; rum 1<<3 // clear um.ac
+ .save ar.lc,lcsave
+ mov lcsave=ar.lc };;
+
+ .body
+#if defined(_HPUX_SOURCE) // HPUX is big-endian, cut 15+15 cycles...
+{ .mib; cmp.ne p6,p0=out0,r0
+ add out0=4,in0
+(p6) br.dpnt.many .Le_i_unaligned };;
+
+{ .mmi; ld4 out1=[in0],8 // s0
+ and out9=3,in1
+ mov twenty4=24 }
+{ .mmi; ld4 out3=[out0],8 // s1
+ ADDP rk0=0,in2
+ mov sixteen=16 };;
+{ .mmi; ld4 out5=[in0] // s2
+ cmp.ne p6,p0=out9,r0
+ mov maskff=0xff }
+{ .mmb; ld4 out7=[out0] // s3
+ ADDP rk1=KSZ,in2
+ br.call.sptk.many b6=_ia64_AES_encrypt };;
+
+{ .mib; ADDP in0=4,in1
+ ADDP in1=0,in1
+(p6) br.spnt .Le_o_unaligned };;
+
+{ .mii; mov psr.um=loc0
+ mov ar.pfs=pfssave
+ mov ar.lc=lcsave };;
+{ .mmi; st4 [in1]=r16,8 // s0
+ st4 [in0]=r20,8 // s1
+ mov pr=prsave,0x1ffff };;
+{ .mmb; st4 [in1]=r24 // s2
+ st4 [in0]=r28 // s3
+ br.ret.sptk.many b0 };;
+#endif
+
+.align 32
+.Le_i_unaligned:
+{ .mmi; add out0=1,in0
+ add out2=2,in0
+ add out4=3,in0 };;
+{ .mmi; ld1 r16=[in0],4
+ ld1 r17=[out0],4 }//;;
+{ .mmi; ld1 r18=[out2],4
+ ld1 out1=[out4],4 };; // s0
+{ .mmi; ld1 r20=[in0],4
+ ld1 r21=[out0],4 }//;;
+{ .mmi; ld1 r22=[out2],4
+ ld1 out3=[out4],4 };; // s1
+{ .mmi; ld1 r24=[in0],4
+ ld1 r25=[out0],4 }//;;
+{ .mmi; ld1 r26=[out2],4
+ ld1 out5=[out4],4 };; // s2
+{ .mmi; ld1 r28=[in0]
+ ld1 r29=[out0] }//;;
+{ .mmi; ld1 r30=[out2]
+ ld1 out7=[out4] };; // s3
+
+{ .mii;
+ dep out1=r16,out1,24,8 //;;
+ dep out3=r20,out3,24,8 }//;;
+{ .mii; ADDP rk0=0,in2
+ dep out5=r24,out5,24,8 //;;
+ dep out7=r28,out7,24,8 };;
+{ .mii; ADDP rk1=KSZ,in2
+ dep out1=r17,out1,16,8 //;;
+ dep out3=r21,out3,16,8 }//;;
+{ .mii; mov twenty4=24
+ dep out5=r25,out5,16,8 //;;
+ dep out7=r29,out7,16,8 };;
+{ .mii; mov sixteen=16
+ dep out1=r18,out1,8,8 //;;
+ dep out3=r22,out3,8,8 }//;;
+{ .mii; mov maskff=0xff
+ dep out5=r26,out5,8,8 //;;
+ dep out7=r30,out7,8,8 };;
+
+{ .mib; br.call.sptk.many b6=_ia64_AES_encrypt };;
+
+.Le_o_unaligned:
+{ .mii; ADDP out0=0,in1
+ extr.u r17=r16,8,8 // s0
+ shr.u r19=r16,twenty4 }//;;
+{ .mii; ADDP out1=1,in1
+ extr.u r18=r16,16,8
+ shr.u r23=r20,twenty4 }//;; // s1
+{ .mii; ADDP out2=2,in1
+ extr.u r21=r20,8,8
+ shr.u r22=r20,sixteen }//;;
+{ .mii; ADDP out3=3,in1
+ extr.u r25=r24,8,8 // s2
+ shr.u r27=r24,twenty4 };;
+{ .mii; st1 [out3]=r16,4
+ extr.u r26=r24,16,8
+ shr.u r31=r28,twenty4 }//;; // s3
+{ .mii; st1 [out2]=r17,4
+ extr.u r29=r28,8,8
+ shr.u r30=r28,sixteen }//;;
+
+{ .mmi; st1 [out1]=r18,4
+ st1 [out0]=r19,4 };;
+{ .mmi; st1 [out3]=r20,4
+ st1 [out2]=r21,4 }//;;
+{ .mmi; st1 [out1]=r22,4
+ st1 [out0]=r23,4 };;
+{ .mmi; st1 [out3]=r24,4
+ st1 [out2]=r25,4
+ mov pr=prsave,0x1ffff }//;;
+{ .mmi; st1 [out1]=r26,4
+ st1 [out0]=r27,4
+ mov ar.pfs=pfssave };;
+{ .mmi; st1 [out3]=r28
+ st1 [out2]=r29
+ mov ar.lc=lcsave }//;;
+{ .mmi; st1 [out1]=r30
+ st1 [out0]=r31 }
+{ .mfb; mov psr.um=loc0 // restore user mask
+ br.ret.sptk.many b0 };;
+.endp AES_encrypt#
+
+// *AES_decrypt are autogenerated by the following script:
+#if 0
+#!/usr/bin/env perl
+print "// *AES_decrypt are autogenerated by the following script:\n#if 0\n";
+open(PROG,'<'.$0); while(<PROG>) { print; } close(PROG);
+print "#endif\n";
+while(<>) {
+ $process=1 if (/\.proc\s+_ia64_AES_encrypt/);
+ next if (!$process);
+
+ #s/te00=s0/td00=s0/; s/te00/td00/g;
+ s/te11=s1/td13=s3/; s/te11/td13/g;
+ #s/te22=s2/td22=s2/; s/te22/td22/g;
+ s/te33=s3/td31=s1/; s/te33/td31/g;
+
+ #s/te01=s1/td01=s1/; s/te01/td01/g;
+ s/te12=s2/td10=s0/; s/te12/td10/g;
+ #s/te23=s3/td23=s3/; s/te23/td23/g;
+ s/te30=s0/td32=s2/; s/te30/td32/g;
+
+ #s/te02=s2/td02=s2/; s/te02/td02/g;
+ s/te13=s3/td11=s1/; s/te13/td11/g;
+ #s/te20=s0/td20=s0/; s/te20/td20/g;
+ s/te31=s1/td33=s3/; s/te31/td33/g;
+
+ #s/te03=s3/td03=s3/; s/te03/td03/g;
+ s/te10=s0/td12=s2/; s/te10/td12/g;
+ #s/te21=s1/td21=s1/; s/te21/td21/g;
+ s/te32=s2/td30=s0/; s/te32/td30/g;
+
+ s/td/te/g;
+
+ s/AES_encrypt/AES_decrypt/g;
+ s/\.Le_/.Ld_/g;
+ s/AES_Te#/AES_Td#/g;
+
+ print;
+
+ exit if (/\.endp\s+AES_decrypt/);
+}
+#endif
+.proc _ia64_AES_decrypt#
+// Input: rk0-rk1
+// te0
+// te3 as AES_KEY->rounds!!!
+// s0-s3
+// maskff,twenty4,sixteen
+// Output: r16,r20,r24,r28 as s0-s3
+// Clobber: r16-r31,rk0-rk1,r32-r43
+.align 32
+_ia64_AES_decrypt:
+ .prologue
+ .altrp b6
+ .body
+{ .mmi; alloc r16=ar.pfs,12,0,0,8
+ LDKEY t0=[rk0],2*KSZ
+ mov pr.rot=1<<16 }
+{ .mmi; LDKEY t1=[rk1],2*KSZ
+ add te1=TE1,te0
+ add te3=-3,te3 };;
+{ .mib; LDKEY t2=[rk0],2*KSZ
+ mov ar.ec=2 }
+{ .mib; LDKEY t3=[rk1],2*KSZ
+ add te2=TE2,te0
+ brp.loop.imp .Ld_top,.Ld_end-16 };;
+
+{ .mmi; xor s0=s0,t0
+ xor s1=s1,t1
+ mov ar.lc=te3 }
+{ .mmi; xor s2=s2,t2
+ xor s3=s3,t3
+ add te3=TE3,te0 };;
+
+.align 32
+.Ld_top:
+{ .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
+ (p0) and te31=s1,maskff // 0/0:s3&0xff
+ (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
+{ .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
+ (p0) and te32=s2,maskff // 0/1:s0&0xff
+ (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24
+{ .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
+ (p0) shladd te31=te31,3,te3 // 1/0:te0+s0>>24
+ (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
+{ .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
+ (p0) shladd te32=te32,3,te3 // 1/1:te3+s0
+ (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24
+{ .mmi; (p0) ld4 te31=[te31] // 2/0:te3[s3&0xff]
+ (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff
+ (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
+{ .mmi; (p0) ld4 te32=[te32] // 2/1:te3[s0]
+ (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8
+ (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24
+{ .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8]
+ (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8
+ (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
+{ .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8]
+ (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24
+ (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24
+{ .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8]
+ (p0) shladd te21=te21,3,te2 // 4/3:te3+s2
+ (p0) extr.u te13=s3,16,8 } // 4/0:s1>>16&0xff
+{ .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24]
+ (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24
+ (p0) shr.u te11=s1,sixteen };; // 4/2:s3>>16
+{ .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8]
+ (p0) shladd te13=te13,3,te1 // 5/0:te1+s1>>16
+ (p0) extr.u te10=s0,16,8 } // 5/1:s2>>16&0xff
+{ .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24]
+ (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24
+ (p0) and te33=s3,maskff };; // 5/2:s1&0xff
+{ .mmi; (p0) ld4 te13=[te13] // 6/0:te1[s1>>16]
+ (p0) shladd te10=te10,3,te1 // 6/1:te1+s2>>16
+ (p0) extr.u te12=s2,16,8 } // 6/3:s0>>16&0xff
+{ .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24]
+ (p0) shladd te03=te03,3,te0 // 6/3:te1+s0>>16
+ (p0) and te30=s0,maskff };; // 6/3:s2&0xff
+
+{ .mmi; (p0) ld4 te10=[te10] // 7/1:te1[s2>>16]
+ (p0) shladd te33=te33,3,te3 // 7/2:te3+s1&0xff
+ (p0) and te11=te11,maskff} // 7/2:s3>>16&0xff
+{ .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24]
+ (p0) shladd te30=te30,3,te3 // 7/3:te3+s2
+ (p0) xor t0=t0,te31 };; // 7/0:
+{ .mmi; (p0) ld4 te33=[te33] // 8/2:te3[s1]
+ (p0) shladd te11=te11,3,te1 // 8/2:te1+s3>>16
+ (p0) xor t0=t0,te22 } // 8/0:
+{ .mmi; (p0) ld4 te30=[te30] // 8/3:te3[s2]
+ (p0) shladd te12=te12,3,te1 // 8/3:te1+s0>>16
+ (p0) xor t1=t1,te32 };; // 8/1:
+{ .mmi; (p0) ld4 te11=[te11] // 9/2:te1[s3>>16]
+ (p0) ld4 te12=[te12] // 9/3:te1[s0>>16]
+ (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling
+{ .mmi; (p0) xor t1=t1,te23 // 10[9]/1:
+ (p0) xor t2=t2,te20 // 10[9]/2:
+ (p0) xor t3=t3,te21 };; // 10[9]/3:
+{ .mmi; (p0) xor t0=t0,te13 // 11[10]/0:done!
+ (p0) xor t1=t1,te01 // 11[10]/1:
+ (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling
+{ .mmi; (p0) xor t3=t3,te03 // 12[10]/3:
+ (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17)
+{ .mmi; (p0) xor t1=t1,te10 // 13[11]/1:done!
+ (p0) xor t2=t2,te33 // 13[11]/2:
+ (p0) xor t3=t3,te30 } // 13[11]/3:
+{ .mmi; (p17) add te0=2048,te0 // 13[11]/
+ (p17) add te1=2048+64-TE1,te1};; // 13[11]/
+{ .mib; (p0) xor t2=t2,te11 // 14[12]/2:done!
+ (p17) add te2=2048+128-TE2,te2} // 14[12]/
+{ .mib; (p0) xor t3=t3,te12 // 14[12]/3:done!
+ (p17) add te3=2048+192-TE3,te3 // 14[12]/
+ br.ctop.sptk .Ld_top };;
+.Ld_end:
+
+
+{ .mmi; ld8 te10=[te0] // prefetch Td4
+ ld8 te33=[te1] }
+{ .mmi; ld8 te12=[te2]
+ ld8 te30=[te3] }
+
+{ .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
+ and te31=s1,maskff // 0/0:s3&0xff
+ extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
+{ .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
+ and te32=s2,maskff // 0/1:s0&0xff
+ shr.u te00=s0,twenty4 };; // 0/0:s0>>24
+{ .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
+ add te31=te31,te0 // 1/0:te0+s0>>24
+ extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
+{ .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
+ add te32=te32,te0 // 1/1:te0+s0
+ shr.u te01=s1,twenty4 };; // 1/1:s1>>24
+{ .mmi; ld1 te31=[te31] // 2/0:te0[s3&0xff]
+ add te22=te22,te0 // 2/0:te0+s2>>8&0xff
+ extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
+{ .mmi; ld1 te32=[te32] // 2/1:te0[s0]
+ add te23=te23,te0 // 2/1:te0+s3>>8
+ shr.u te02=s2,twenty4 };; // 2/2:s2>>24
+{ .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8]
+ add te20=te20,te0 // 3/2:te0+s0>>8
+ extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
+{ .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8]
+ add te00=te00,te0 // 3/0:te0+s0>>24
+ shr.u te03=s3,twenty4 };; // 3/3:s3>>24
+{ .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8]
+ add te21=te21,te0 // 4/3:te0+s2
+ extr.u te13=s3,16,8 } // 4/0:s1>>16&0xff
+{ .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24]
+ add te01=te01,te0 // 4/1:te0+s1>>24
+ shr.u te11=s1,sixteen };; // 4/2:s3>>16
+{ .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8]
+ add te13=te13,te0 // 5/0:te0+s1>>16
+ extr.u te10=s0,16,8 } // 5/1:s2>>16&0xff
+{ .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24]
+ add te02=te02,te0 // 5/2:te0+s2>>24
+ and te33=s3,maskff };; // 5/2:s1&0xff
+{ .mmi; ld1 te13=[te13] // 6/0:te0[s1>>16]
+ add te10=te10,te0 // 6/1:te0+s2>>16
+ extr.u te12=s2,16,8 } // 6/3:s0>>16&0xff
+{ .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24]
+ add te03=te03,te0 // 6/3:te0+s0>>16
+ and te30=s0,maskff };; // 6/3:s2&0xff
+
+{ .mmi; ld1 te10=[te10] // 7/1:te0[s2>>16]
+ add te33=te33,te0 // 7/2:te0+s1&0xff
+ dep te31=te22,te31,8,8} // 7/0:
+{ .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24]
+ add te30=te30,te0 // 7/3:te0+s2
+ and te11=te11,maskff};; // 7/2:s3>>16&0xff
+{ .mmi; ld1 te33=[te33] // 8/2:te0[s1]
+ add te11=te11,te0 // 8/2:te0+s3>>16
+ dep te32=te23,te32,8,8} // 8/1:
+{ .mmi; ld1 te30=[te30] // 8/3:te0[s2]
+ add te12=te12,te0 // 8/3:te0+s0>>16
+ shl te00=te00,twenty4};; // 8/0:
+{ .mii; ld1 te11=[te11] // 9/2:te0[s3>>16]
+ dep te31=te13,te31,16,8 // 9/0:
+ shl te01=te01,twenty4};; // 9/1:
+{ .mii; ld1 te12=[te12] // 10/3:te0[s0>>16]
+ dep te33=te20,te33,8,8 // 10/2:
+ shl te02=te02,twenty4};; // 10/2:
+{ .mii; xor t0=t0,te31 // 11/0:
+ dep te30=te21,te30,8,8 // 11/3:
+ shl te10=te10,sixteen};; // 11/1:
+{ .mii; xor r16=t0,te00 // 12/0:done!
+ dep te33=te11,te33,16,8 // 12/2:
+ shl te03=te03,twenty4};; // 12/3:
+{ .mmi; xor t1=t1,te01 // 13/1:
+ xor t2=t2,te02 // 13/2:
+ dep te30=te12,te30,16,8};; // 13/3:
+{ .mmi; xor t1=t1,te32 // 14/1:
+ xor r24=t2,te33 // 14/2:done!
+ xor t3=t3,te30 };; // 14/3:
+{ .mib; xor r20=t1,te10 // 15/1:done!
+ xor r28=t3,te03 // 15/3:done!
+ br.ret.sptk b6 };;
+.endp _ia64_AES_decrypt#
+
+// void AES_decrypt (const void *in,void *out,const AES_KEY *key);
+.global AES_decrypt#
+.proc AES_decrypt#
+.align 32
+AES_decrypt:
+ .prologue
+ .save ar.pfs,pfssave
+{ .mmi; alloc pfssave=ar.pfs,3,1,12,0
+ and out0=3,in0
+ mov r3=ip }
+{ .mmi; ADDP in0=0,in0
+ mov loc0=psr.um
+ ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds
+
+{ .mmi; ld4 out11=[out11] // AES_KEY->rounds
+ add out8=(AES_Td#-AES_decrypt#),r3 // Te0
+ .save pr,prsave
+ mov prsave=pr }
+{ .mmi; rum 1<<3 // clear um.ac
+ .save ar.lc,lcsave
+ mov lcsave=ar.lc };;
+
+ .body
+#if defined(_HPUX_SOURCE) // HPUX is big-endian, cut 15+15 cycles...
+{ .mib; cmp.ne p6,p0=out0,r0
+ add out0=4,in0
+(p6) br.dpnt.many .Ld_i_unaligned };;
+
+{ .mmi; ld4 out1=[in0],8 // s0
+ and out9=3,in1
+ mov twenty4=24 }
+{ .mmi; ld4 out3=[out0],8 // s1
+ ADDP rk0=0,in2
+ mov sixteen=16 };;
+{ .mmi; ld4 out5=[in0] // s2
+ cmp.ne p6,p0=out9,r0
+ mov maskff=0xff }
+{ .mmb; ld4 out7=[out0] // s3
+ ADDP rk1=KSZ,in2
+ br.call.sptk.many b6=_ia64_AES_decrypt };;
+
+{ .mib; ADDP in0=4,in1
+ ADDP in1=0,in1
+(p6) br.spnt .Ld_o_unaligned };;
+
+{ .mii; mov psr.um=loc0
+ mov ar.pfs=pfssave
+ mov ar.lc=lcsave };;
+{ .mmi; st4 [in1]=r16,8 // s0
+ st4 [in0]=r20,8 // s1
+ mov pr=prsave,0x1ffff };;
+{ .mmb; st4 [in1]=r24 // s2
+ st4 [in0]=r28 // s3
+ br.ret.sptk.many b0 };;
+#endif
+
+.align 32
+.Ld_i_unaligned:
+{ .mmi; add out0=1,in0
+ add out2=2,in0
+ add out4=3,in0 };;
+{ .mmi; ld1 r16=[in0],4
+ ld1 r17=[out0],4 }//;;
+{ .mmi; ld1 r18=[out2],4
+ ld1 out1=[out4],4 };; // s0
+{ .mmi; ld1 r20=[in0],4
+ ld1 r21=[out0],4 }//;;
+{ .mmi; ld1 r22=[out2],4
+ ld1 out3=[out4],4 };; // s1
+{ .mmi; ld1 r24=[in0],4
+ ld1 r25=[out0],4 }//;;
+{ .mmi; ld1 r26=[out2],4
+ ld1 out5=[out4],4 };; // s2
+{ .mmi; ld1 r28=[in0]
+ ld1 r29=[out0] }//;;
+{ .mmi; ld1 r30=[out2]
+ ld1 out7=[out4] };; // s3
+
+{ .mii;
+ dep out1=r16,out1,24,8 //;;
+ dep out3=r20,out3,24,8 }//;;
+{ .mii; ADDP rk0=0,in2
+ dep out5=r24,out5,24,8 //;;
+ dep out7=r28,out7,24,8 };;
+{ .mii; ADDP rk1=KSZ,in2
+ dep out1=r17,out1,16,8 //;;
+ dep out3=r21,out3,16,8 }//;;
+{ .mii; mov twenty4=24
+ dep out5=r25,out5,16,8 //;;
+ dep out7=r29,out7,16,8 };;
+{ .mii; mov sixteen=16
+ dep out1=r18,out1,8,8 //;;
+ dep out3=r22,out3,8,8 }//;;
+{ .mii; mov maskff=0xff
+ dep out5=r26,out5,8,8 //;;
+ dep out7=r30,out7,8,8 };;
+
+{ .mib; br.call.sptk.many b6=_ia64_AES_decrypt };;
+
+.Ld_o_unaligned:
+{ .mii; ADDP out0=0,in1
+ extr.u r17=r16,8,8 // s0
+ shr.u r19=r16,twenty4 }//;;
+{ .mii; ADDP out1=1,in1
+ extr.u r18=r16,16,8
+ shr.u r23=r20,twenty4 }//;; // s1
+{ .mii; ADDP out2=2,in1
+ extr.u r21=r20,8,8
+ shr.u r22=r20,sixteen }//;;
+{ .mii; ADDP out3=3,in1
+ extr.u r25=r24,8,8 // s2
+ shr.u r27=r24,twenty4 };;
+{ .mii; st1 [out3]=r16,4
+ extr.u r26=r24,16,8
+ shr.u r31=r28,twenty4 }//;; // s3
+{ .mii; st1 [out2]=r17,4
+ extr.u r29=r28,8,8
+ shr.u r30=r28,sixteen }//;;
+
+{ .mmi; st1 [out1]=r18,4
+ st1 [out0]=r19,4 };;
+{ .mmi; st1 [out3]=r20,4
+ st1 [out2]=r21,4 }//;;
+{ .mmi; st1 [out1]=r22,4
+ st1 [out0]=r23,4 };;
+{ .mmi; st1 [out3]=r24,4
+ st1 [out2]=r25,4
+ mov pr=prsave,0x1ffff }//;;
+{ .mmi; st1 [out1]=r26,4
+ st1 [out0]=r27,4
+ mov ar.pfs=pfssave };;
+{ .mmi; st1 [out3]=r28
+ st1 [out2]=r29
+ mov ar.lc=lcsave }//;;
+{ .mmi; st1 [out1]=r30
+ st1 [out0]=r31 }
+{ .mfb; mov psr.um=loc0 // restore user mask
+ br.ret.sptk.many b0 };;
+.endp AES_decrypt#
+
+// leave it in .text segment...
+.align 64
+.global AES_Te#
+.type AES_Te#,@object
+AES_Te: data4 0xc66363a5,0xc66363a5, 0xf87c7c84,0xf87c7c84
+ data4 0xee777799,0xee777799, 0xf67b7b8d,0xf67b7b8d
+ data4 0xfff2f20d,0xfff2f20d, 0xd66b6bbd,0xd66b6bbd
+ data4 0xde6f6fb1,0xde6f6fb1, 0x91c5c554,0x91c5c554
+ data4 0x60303050,0x60303050, 0x02010103,0x02010103
+ data4 0xce6767a9,0xce6767a9, 0x562b2b7d,0x562b2b7d
+ data4 0xe7fefe19,0xe7fefe19, 0xb5d7d762,0xb5d7d762
+ data4 0x4dababe6,0x4dababe6, 0xec76769a,0xec76769a
+ data4 0x8fcaca45,0x8fcaca45, 0x1f82829d,0x1f82829d
+ data4 0x89c9c940,0x89c9c940, 0xfa7d7d87,0xfa7d7d87
+ data4 0xeffafa15,0xeffafa15, 0xb25959eb,0xb25959eb
+ data4 0x8e4747c9,0x8e4747c9, 0xfbf0f00b,0xfbf0f00b
+ data4 0x41adadec,0x41adadec, 0xb3d4d467,0xb3d4d467
+ data4 0x5fa2a2fd,0x5fa2a2fd, 0x45afafea,0x45afafea
+ data4 0x239c9cbf,0x239c9cbf, 0x53a4a4f7,0x53a4a4f7
+ data4 0xe4727296,0xe4727296, 0x9bc0c05b,0x9bc0c05b
+ data4 0x75b7b7c2,0x75b7b7c2, 0xe1fdfd1c,0xe1fdfd1c
+ data4 0x3d9393ae,0x3d9393ae, 0x4c26266a,0x4c26266a
+ data4 0x6c36365a,0x6c36365a, 0x7e3f3f41,0x7e3f3f41
+ data4 0xf5f7f702,0xf5f7f702, 0x83cccc4f,0x83cccc4f
+ data4 0x6834345c,0x6834345c, 0x51a5a5f4,0x51a5a5f4
+ data4 0xd1e5e534,0xd1e5e534, 0xf9f1f108,0xf9f1f108
+ data4 0xe2717193,0xe2717193, 0xabd8d873,0xabd8d873
+ data4 0x62313153,0x62313153, 0x2a15153f,0x2a15153f
+ data4 0x0804040c,0x0804040c, 0x95c7c752,0x95c7c752
+ data4 0x46232365,0x46232365, 0x9dc3c35e,0x9dc3c35e
+ data4 0x30181828,0x30181828, 0x379696a1,0x379696a1
+ data4 0x0a05050f,0x0a05050f, 0x2f9a9ab5,0x2f9a9ab5
+ data4 0x0e070709,0x0e070709, 0x24121236,0x24121236
+ data4 0x1b80809b,0x1b80809b, 0xdfe2e23d,0xdfe2e23d
+ data4 0xcdebeb26,0xcdebeb26, 0x4e272769,0x4e272769
+ data4 0x7fb2b2cd,0x7fb2b2cd, 0xea75759f,0xea75759f
+ data4 0x1209091b,0x1209091b, 0x1d83839e,0x1d83839e
+ data4 0x582c2c74,0x582c2c74, 0x341a1a2e,0x341a1a2e
+ data4 0x361b1b2d,0x361b1b2d, 0xdc6e6eb2,0xdc6e6eb2
+ data4 0xb45a5aee,0xb45a5aee, 0x5ba0a0fb,0x5ba0a0fb
+ data4 0xa45252f6,0xa45252f6, 0x763b3b4d,0x763b3b4d
+ data4 0xb7d6d661,0xb7d6d661, 0x7db3b3ce,0x7db3b3ce
+ data4 0x5229297b,0x5229297b, 0xdde3e33e,0xdde3e33e
+ data4 0x5e2f2f71,0x5e2f2f71, 0x13848497,0x13848497
+ data4 0xa65353f5,0xa65353f5, 0xb9d1d168,0xb9d1d168
+ data4 0x00000000,0x00000000, 0xc1eded2c,0xc1eded2c
+ data4 0x40202060,0x40202060, 0xe3fcfc1f,0xe3fcfc1f
+ data4 0x79b1b1c8,0x79b1b1c8, 0xb65b5bed,0xb65b5bed
+ data4 0xd46a6abe,0xd46a6abe, 0x8dcbcb46,0x8dcbcb46
+ data4 0x67bebed9,0x67bebed9, 0x7239394b,0x7239394b
+ data4 0x944a4ade,0x944a4ade, 0x984c4cd4,0x984c4cd4
+ data4 0xb05858e8,0xb05858e8, 0x85cfcf4a,0x85cfcf4a
+ data4 0xbbd0d06b,0xbbd0d06b, 0xc5efef2a,0xc5efef2a
+ data4 0x4faaaae5,0x4faaaae5, 0xedfbfb16,0xedfbfb16
+ data4 0x864343c5,0x864343c5, 0x9a4d4dd7,0x9a4d4dd7
+ data4 0x66333355,0x66333355, 0x11858594,0x11858594
+ data4 0x8a4545cf,0x8a4545cf, 0xe9f9f910,0xe9f9f910
+ data4 0x04020206,0x04020206, 0xfe7f7f81,0xfe7f7f81
+ data4 0xa05050f0,0xa05050f0, 0x783c3c44,0x783c3c44
+ data4 0x259f9fba,0x259f9fba, 0x4ba8a8e3,0x4ba8a8e3
+ data4 0xa25151f3,0xa25151f3, 0x5da3a3fe,0x5da3a3fe
+ data4 0x804040c0,0x804040c0, 0x058f8f8a,0x058f8f8a
+ data4 0x3f9292ad,0x3f9292ad, 0x219d9dbc,0x219d9dbc
+ data4 0x70383848,0x70383848, 0xf1f5f504,0xf1f5f504
+ data4 0x63bcbcdf,0x63bcbcdf, 0x77b6b6c1,0x77b6b6c1
+ data4 0xafdada75,0xafdada75, 0x42212163,0x42212163
+ data4 0x20101030,0x20101030, 0xe5ffff1a,0xe5ffff1a
+ data4 0xfdf3f30e,0xfdf3f30e, 0xbfd2d26d,0xbfd2d26d
+ data4 0x81cdcd4c,0x81cdcd4c, 0x180c0c14,0x180c0c14
+ data4 0x26131335,0x26131335, 0xc3ecec2f,0xc3ecec2f
+ data4 0xbe5f5fe1,0xbe5f5fe1, 0x359797a2,0x359797a2
+ data4 0x884444cc,0x884444cc, 0x2e171739,0x2e171739
+ data4 0x93c4c457,0x93c4c457, 0x55a7a7f2,0x55a7a7f2
+ data4 0xfc7e7e82,0xfc7e7e82, 0x7a3d3d47,0x7a3d3d47
+ data4 0xc86464ac,0xc86464ac, 0xba5d5de7,0xba5d5de7
+ data4 0x3219192b,0x3219192b, 0xe6737395,0xe6737395
+ data4 0xc06060a0,0xc06060a0, 0x19818198,0x19818198
+ data4 0x9e4f4fd1,0x9e4f4fd1, 0xa3dcdc7f,0xa3dcdc7f
+ data4 0x44222266,0x44222266, 0x542a2a7e,0x542a2a7e
+ data4 0x3b9090ab,0x3b9090ab, 0x0b888883,0x0b888883
+ data4 0x8c4646ca,0x8c4646ca, 0xc7eeee29,0xc7eeee29
+ data4 0x6bb8b8d3,0x6bb8b8d3, 0x2814143c,0x2814143c
+ data4 0xa7dede79,0xa7dede79, 0xbc5e5ee2,0xbc5e5ee2
+ data4 0x160b0b1d,0x160b0b1d, 0xaddbdb76,0xaddbdb76
+ data4 0xdbe0e03b,0xdbe0e03b, 0x64323256,0x64323256
+ data4 0x743a3a4e,0x743a3a4e, 0x140a0a1e,0x140a0a1e
+ data4 0x924949db,0x924949db, 0x0c06060a,0x0c06060a
+ data4 0x4824246c,0x4824246c, 0xb85c5ce4,0xb85c5ce4
+ data4 0x9fc2c25d,0x9fc2c25d, 0xbdd3d36e,0xbdd3d36e
+ data4 0x43acacef,0x43acacef, 0xc46262a6,0xc46262a6
+ data4 0x399191a8,0x399191a8, 0x319595a4,0x319595a4
+ data4 0xd3e4e437,0xd3e4e437, 0xf279798b,0xf279798b
+ data4 0xd5e7e732,0xd5e7e732, 0x8bc8c843,0x8bc8c843
+ data4 0x6e373759,0x6e373759, 0xda6d6db7,0xda6d6db7
+ data4 0x018d8d8c,0x018d8d8c, 0xb1d5d564,0xb1d5d564
+ data4 0x9c4e4ed2,0x9c4e4ed2, 0x49a9a9e0,0x49a9a9e0
+ data4 0xd86c6cb4,0xd86c6cb4, 0xac5656fa,0xac5656fa
+ data4 0xf3f4f407,0xf3f4f407, 0xcfeaea25,0xcfeaea25
+ data4 0xca6565af,0xca6565af, 0xf47a7a8e,0xf47a7a8e
+ data4 0x47aeaee9,0x47aeaee9, 0x10080818,0x10080818
+ data4 0x6fbabad5,0x6fbabad5, 0xf0787888,0xf0787888
+ data4 0x4a25256f,0x4a25256f, 0x5c2e2e72,0x5c2e2e72
+ data4 0x381c1c24,0x381c1c24, 0x57a6a6f1,0x57a6a6f1
+ data4 0x73b4b4c7,0x73b4b4c7, 0x97c6c651,0x97c6c651
+ data4 0xcbe8e823,0xcbe8e823, 0xa1dddd7c,0xa1dddd7c
+ data4 0xe874749c,0xe874749c, 0x3e1f1f21,0x3e1f1f21
+ data4 0x964b4bdd,0x964b4bdd, 0x61bdbddc,0x61bdbddc
+ data4 0x0d8b8b86,0x0d8b8b86, 0x0f8a8a85,0x0f8a8a85
+ data4 0xe0707090,0xe0707090, 0x7c3e3e42,0x7c3e3e42
+ data4 0x71b5b5c4,0x71b5b5c4, 0xcc6666aa,0xcc6666aa
+ data4 0x904848d8,0x904848d8, 0x06030305,0x06030305
+ data4 0xf7f6f601,0xf7f6f601, 0x1c0e0e12,0x1c0e0e12
+ data4 0xc26161a3,0xc26161a3, 0x6a35355f,0x6a35355f
+ data4 0xae5757f9,0xae5757f9, 0x69b9b9d0,0x69b9b9d0
+ data4 0x17868691,0x17868691, 0x99c1c158,0x99c1c158
+ data4 0x3a1d1d27,0x3a1d1d27, 0x279e9eb9,0x279e9eb9
+ data4 0xd9e1e138,0xd9e1e138, 0xebf8f813,0xebf8f813
+ data4 0x2b9898b3,0x2b9898b3, 0x22111133,0x22111133
+ data4 0xd26969bb,0xd26969bb, 0xa9d9d970,0xa9d9d970
+ data4 0x078e8e89,0x078e8e89, 0x339494a7,0x339494a7
+ data4 0x2d9b9bb6,0x2d9b9bb6, 0x3c1e1e22,0x3c1e1e22
+ data4 0x15878792,0x15878792, 0xc9e9e920,0xc9e9e920
+ data4 0x87cece49,0x87cece49, 0xaa5555ff,0xaa5555ff
+ data4 0x50282878,0x50282878, 0xa5dfdf7a,0xa5dfdf7a
+ data4 0x038c8c8f,0x038c8c8f, 0x59a1a1f8,0x59a1a1f8
+ data4 0x09898980,0x09898980, 0x1a0d0d17,0x1a0d0d17
+ data4 0x65bfbfda,0x65bfbfda, 0xd7e6e631,0xd7e6e631
+ data4 0x844242c6,0x844242c6, 0xd06868b8,0xd06868b8
+ data4 0x824141c3,0x824141c3, 0x299999b0,0x299999b0
+ data4 0x5a2d2d77,0x5a2d2d77, 0x1e0f0f11,0x1e0f0f11
+ data4 0x7bb0b0cb,0x7bb0b0cb, 0xa85454fc,0xa85454fc
+ data4 0x6dbbbbd6,0x6dbbbbd6, 0x2c16163a,0x2c16163a
+// Te4:
+ data1 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+ data1 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+ data1 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+ data1 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+ data1 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+ data1 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+ data1 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+ data1 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+ data1 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+ data1 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+ data1 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+ data1 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+ data1 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+ data1 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+ data1 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+ data1 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+ data1 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+ data1 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+ data1 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+ data1 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+ data1 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+ data1 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+ data1 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+ data1 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+ data1 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+ data1 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+ data1 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+ data1 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+ data1 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+ data1 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+ data1 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+ data1 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+.size AES_Te#,2048+256 // HP-UX assembler fails to ".-AES_Te#"
+
+.align 64
+.global AES_Td#
+.type AES_Td#,@object
+AES_Td: data4 0x51f4a750,0x51f4a750, 0x7e416553,0x7e416553
+ data4 0x1a17a4c3,0x1a17a4c3, 0x3a275e96,0x3a275e96
+ data4 0x3bab6bcb,0x3bab6bcb, 0x1f9d45f1,0x1f9d45f1
+ data4 0xacfa58ab,0xacfa58ab, 0x4be30393,0x4be30393
+ data4 0x2030fa55,0x2030fa55, 0xad766df6,0xad766df6
+ data4 0x88cc7691,0x88cc7691, 0xf5024c25,0xf5024c25
+ data4 0x4fe5d7fc,0x4fe5d7fc, 0xc52acbd7,0xc52acbd7
+ data4 0x26354480,0x26354480, 0xb562a38f,0xb562a38f
+ data4 0xdeb15a49,0xdeb15a49, 0x25ba1b67,0x25ba1b67
+ data4 0x45ea0e98,0x45ea0e98, 0x5dfec0e1,0x5dfec0e1
+ data4 0xc32f7502,0xc32f7502, 0x814cf012,0x814cf012
+ data4 0x8d4697a3,0x8d4697a3, 0x6bd3f9c6,0x6bd3f9c6
+ data4 0x038f5fe7,0x038f5fe7, 0x15929c95,0x15929c95
+ data4 0xbf6d7aeb,0xbf6d7aeb, 0x955259da,0x955259da
+ data4 0xd4be832d,0xd4be832d, 0x587421d3,0x587421d3
+ data4 0x49e06929,0x49e06929, 0x8ec9c844,0x8ec9c844
+ data4 0x75c2896a,0x75c2896a, 0xf48e7978,0xf48e7978
+ data4 0x99583e6b,0x99583e6b, 0x27b971dd,0x27b971dd
+ data4 0xbee14fb6,0xbee14fb6, 0xf088ad17,0xf088ad17
+ data4 0xc920ac66,0xc920ac66, 0x7dce3ab4,0x7dce3ab4
+ data4 0x63df4a18,0x63df4a18, 0xe51a3182,0xe51a3182
+ data4 0x97513360,0x97513360, 0x62537f45,0x62537f45
+ data4 0xb16477e0,0xb16477e0, 0xbb6bae84,0xbb6bae84
+ data4 0xfe81a01c,0xfe81a01c, 0xf9082b94,0xf9082b94
+ data4 0x70486858,0x70486858, 0x8f45fd19,0x8f45fd19
+ data4 0x94de6c87,0x94de6c87, 0x527bf8b7,0x527bf8b7
+ data4 0xab73d323,0xab73d323, 0x724b02e2,0x724b02e2
+ data4 0xe31f8f57,0xe31f8f57, 0x6655ab2a,0x6655ab2a
+ data4 0xb2eb2807,0xb2eb2807, 0x2fb5c203,0x2fb5c203
+ data4 0x86c57b9a,0x86c57b9a, 0xd33708a5,0xd33708a5
+ data4 0x302887f2,0x302887f2, 0x23bfa5b2,0x23bfa5b2
+ data4 0x02036aba,0x02036aba, 0xed16825c,0xed16825c
+ data4 0x8acf1c2b,0x8acf1c2b, 0xa779b492,0xa779b492
+ data4 0xf307f2f0,0xf307f2f0, 0x4e69e2a1,0x4e69e2a1
+ data4 0x65daf4cd,0x65daf4cd, 0x0605bed5,0x0605bed5
+ data4 0xd134621f,0xd134621f, 0xc4a6fe8a,0xc4a6fe8a
+ data4 0x342e539d,0x342e539d, 0xa2f355a0,0xa2f355a0
+ data4 0x058ae132,0x058ae132, 0xa4f6eb75,0xa4f6eb75
+ data4 0x0b83ec39,0x0b83ec39, 0x4060efaa,0x4060efaa
+ data4 0x5e719f06,0x5e719f06, 0xbd6e1051,0xbd6e1051
+ data4 0x3e218af9,0x3e218af9, 0x96dd063d,0x96dd063d
+ data4 0xdd3e05ae,0xdd3e05ae, 0x4de6bd46,0x4de6bd46
+ data4 0x91548db5,0x91548db5, 0x71c45d05,0x71c45d05
+ data4 0x0406d46f,0x0406d46f, 0x605015ff,0x605015ff
+ data4 0x1998fb24,0x1998fb24, 0xd6bde997,0xd6bde997
+ data4 0x894043cc,0x894043cc, 0x67d99e77,0x67d99e77
+ data4 0xb0e842bd,0xb0e842bd, 0x07898b88,0x07898b88
+ data4 0xe7195b38,0xe7195b38, 0x79c8eedb,0x79c8eedb
+ data4 0xa17c0a47,0xa17c0a47, 0x7c420fe9,0x7c420fe9
+ data4 0xf8841ec9,0xf8841ec9, 0x00000000,0x00000000
+ data4 0x09808683,0x09808683, 0x322bed48,0x322bed48
+ data4 0x1e1170ac,0x1e1170ac, 0x6c5a724e,0x6c5a724e
+ data4 0xfd0efffb,0xfd0efffb, 0x0f853856,0x0f853856
+ data4 0x3daed51e,0x3daed51e, 0x362d3927,0x362d3927
+ data4 0x0a0fd964,0x0a0fd964, 0x685ca621,0x685ca621
+ data4 0x9b5b54d1,0x9b5b54d1, 0x24362e3a,0x24362e3a
+ data4 0x0c0a67b1,0x0c0a67b1, 0x9357e70f,0x9357e70f
+ data4 0xb4ee96d2,0xb4ee96d2, 0x1b9b919e,0x1b9b919e
+ data4 0x80c0c54f,0x80c0c54f, 0x61dc20a2,0x61dc20a2
+ data4 0x5a774b69,0x5a774b69, 0x1c121a16,0x1c121a16
+ data4 0xe293ba0a,0xe293ba0a, 0xc0a02ae5,0xc0a02ae5
+ data4 0x3c22e043,0x3c22e043, 0x121b171d,0x121b171d
+ data4 0x0e090d0b,0x0e090d0b, 0xf28bc7ad,0xf28bc7ad
+ data4 0x2db6a8b9,0x2db6a8b9, 0x141ea9c8,0x141ea9c8
+ data4 0x57f11985,0x57f11985, 0xaf75074c,0xaf75074c
+ data4 0xee99ddbb,0xee99ddbb, 0xa37f60fd,0xa37f60fd
+ data4 0xf701269f,0xf701269f, 0x5c72f5bc,0x5c72f5bc
+ data4 0x44663bc5,0x44663bc5, 0x5bfb7e34,0x5bfb7e34
+ data4 0x8b432976,0x8b432976, 0xcb23c6dc,0xcb23c6dc
+ data4 0xb6edfc68,0xb6edfc68, 0xb8e4f163,0xb8e4f163
+ data4 0xd731dcca,0xd731dcca, 0x42638510,0x42638510
+ data4 0x13972240,0x13972240, 0x84c61120,0x84c61120
+ data4 0x854a247d,0x854a247d, 0xd2bb3df8,0xd2bb3df8
+ data4 0xaef93211,0xaef93211, 0xc729a16d,0xc729a16d
+ data4 0x1d9e2f4b,0x1d9e2f4b, 0xdcb230f3,0xdcb230f3
+ data4 0x0d8652ec,0x0d8652ec, 0x77c1e3d0,0x77c1e3d0
+ data4 0x2bb3166c,0x2bb3166c, 0xa970b999,0xa970b999
+ data4 0x119448fa,0x119448fa, 0x47e96422,0x47e96422
+ data4 0xa8fc8cc4,0xa8fc8cc4, 0xa0f03f1a,0xa0f03f1a
+ data4 0x567d2cd8,0x567d2cd8, 0x223390ef,0x223390ef
+ data4 0x87494ec7,0x87494ec7, 0xd938d1c1,0xd938d1c1
+ data4 0x8ccaa2fe,0x8ccaa2fe, 0x98d40b36,0x98d40b36
+ data4 0xa6f581cf,0xa6f581cf, 0xa57ade28,0xa57ade28
+ data4 0xdab78e26,0xdab78e26, 0x3fadbfa4,0x3fadbfa4
+ data4 0x2c3a9de4,0x2c3a9de4, 0x5078920d,0x5078920d
+ data4 0x6a5fcc9b,0x6a5fcc9b, 0x547e4662,0x547e4662
+ data4 0xf68d13c2,0xf68d13c2, 0x90d8b8e8,0x90d8b8e8
+ data4 0x2e39f75e,0x2e39f75e, 0x82c3aff5,0x82c3aff5
+ data4 0x9f5d80be,0x9f5d80be, 0x69d0937c,0x69d0937c
+ data4 0x6fd52da9,0x6fd52da9, 0xcf2512b3,0xcf2512b3
+ data4 0xc8ac993b,0xc8ac993b, 0x10187da7,0x10187da7
+ data4 0xe89c636e,0xe89c636e, 0xdb3bbb7b,0xdb3bbb7b
+ data4 0xcd267809,0xcd267809, 0x6e5918f4,0x6e5918f4
+ data4 0xec9ab701,0xec9ab701, 0x834f9aa8,0x834f9aa8
+ data4 0xe6956e65,0xe6956e65, 0xaaffe67e,0xaaffe67e
+ data4 0x21bccf08,0x21bccf08, 0xef15e8e6,0xef15e8e6
+ data4 0xbae79bd9,0xbae79bd9, 0x4a6f36ce,0x4a6f36ce
+ data4 0xea9f09d4,0xea9f09d4, 0x29b07cd6,0x29b07cd6
+ data4 0x31a4b2af,0x31a4b2af, 0x2a3f2331,0x2a3f2331
+ data4 0xc6a59430,0xc6a59430, 0x35a266c0,0x35a266c0
+ data4 0x744ebc37,0x744ebc37, 0xfc82caa6,0xfc82caa6
+ data4 0xe090d0b0,0xe090d0b0, 0x33a7d815,0x33a7d815
+ data4 0xf104984a,0xf104984a, 0x41ecdaf7,0x41ecdaf7
+ data4 0x7fcd500e,0x7fcd500e, 0x1791f62f,0x1791f62f
+ data4 0x764dd68d,0x764dd68d, 0x43efb04d,0x43efb04d
+ data4 0xccaa4d54,0xccaa4d54, 0xe49604df,0xe49604df
+ data4 0x9ed1b5e3,0x9ed1b5e3, 0x4c6a881b,0x4c6a881b
+ data4 0xc12c1fb8,0xc12c1fb8, 0x4665517f,0x4665517f
+ data4 0x9d5eea04,0x9d5eea04, 0x018c355d,0x018c355d
+ data4 0xfa877473,0xfa877473, 0xfb0b412e,0xfb0b412e
+ data4 0xb3671d5a,0xb3671d5a, 0x92dbd252,0x92dbd252
+ data4 0xe9105633,0xe9105633, 0x6dd64713,0x6dd64713
+ data4 0x9ad7618c,0x9ad7618c, 0x37a10c7a,0x37a10c7a
+ data4 0x59f8148e,0x59f8148e, 0xeb133c89,0xeb133c89
+ data4 0xcea927ee,0xcea927ee, 0xb761c935,0xb761c935
+ data4 0xe11ce5ed,0xe11ce5ed, 0x7a47b13c,0x7a47b13c
+ data4 0x9cd2df59,0x9cd2df59, 0x55f2733f,0x55f2733f
+ data4 0x1814ce79,0x1814ce79, 0x73c737bf,0x73c737bf
+ data4 0x53f7cdea,0x53f7cdea, 0x5ffdaa5b,0x5ffdaa5b
+ data4 0xdf3d6f14,0xdf3d6f14, 0x7844db86,0x7844db86
+ data4 0xcaaff381,0xcaaff381, 0xb968c43e,0xb968c43e
+ data4 0x3824342c,0x3824342c, 0xc2a3405f,0xc2a3405f
+ data4 0x161dc372,0x161dc372, 0xbce2250c,0xbce2250c
+ data4 0x283c498b,0x283c498b, 0xff0d9541,0xff0d9541
+ data4 0x39a80171,0x39a80171, 0x080cb3de,0x080cb3de
+ data4 0xd8b4e49c,0xd8b4e49c, 0x6456c190,0x6456c190
+ data4 0x7bcb8461,0x7bcb8461, 0xd532b670,0xd532b670
+ data4 0x486c5c74,0x486c5c74, 0xd0b85742,0xd0b85742
+// Td4:
+ data1 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ data1 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ data1 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ data1 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ data1 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ data1 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ data1 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ data1 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ data1 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ data1 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ data1 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ data1 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ data1 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ data1 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ data1 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ data1 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ data1 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ data1 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ data1 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ data1 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ data1 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ data1 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ data1 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ data1 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ data1 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ data1 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ data1 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ data1 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ data1 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ data1 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ data1 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ data1 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+.size AES_Td#,2048+256 // HP-UX assembler fails to ".-AES_Td#"
diff --git a/openssl-1.1.0h/crypto/aes/asm/aes-mips.pl b/openssl-1.1.0h/crypto/aes/asm/aes-mips.pl
new file mode 100644
index 0000000..439578d
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aes-mips.pl
@@ -0,0 +1,2131 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for MIPS
+
+# October 2010
+#
+# Code uses 1K[+256B] S-box and on single-issue core [such as R5000]
+# spends ~68 cycles per byte processed with 128-bit key. This is ~16%
+# faster than gcc-generated code, which is not very impressive. But
+# recall that compressed S-box requires extra processing, namely
+# additional rotations. Rotations are implemented with lwl/lwr pairs,
+# which is normally used for loading unaligned data. Another cool
+# thing about this module is its endian neutrality, which means that
+# it processes data without ever changing byte order...
+
+# September 2012
+#
+# Add MIPS32R2 (~10% less instructions) and SmartMIPS ASE (further
+# ~25% less instructions) code. Note that there is no run-time switch,
+# instead, code path is chosen upon pre-process time, pass -mips32r2
+# or/and -msmartmips.
+
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+ $PTR_LA="dla";
+ $PTR_ADD="dadd"; # incidentally works even on n32
+ $PTR_SUB="dsub"; # incidentally works even on n32
+ $PTR_INS="dins";
+ $REG_S="sd";
+ $REG_L="ld";
+ $PTR_SLL="dsll"; # incidentally works even on n32
+ $SZREG=8;
+} else {
+ $PTR_LA="la";
+ $PTR_ADD="add";
+ $PTR_SUB="sub";
+ $PTR_INS="ins";
+ $REG_S="sw";
+ $REG_L="lw";
+ $PTR_SLL="sll";
+ $SZREG=4;
+}
+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC});
+
+for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
+open STDOUT,">$output";
+
+if (!defined($big_endian))
+{ $big_endian=(unpack('L',pack('N',1))==1); }
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($MSB,$LSB)=(0,3); # automatically converted to little-endian
+
+$code.=<<___;
+.text
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2)
+#define _MIPS_ARCH_MIPS32R2
+#endif
+
+#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
+.option pic2
+#endif
+.set noat
+___
+
+{{{
+my $FRAMESIZE=16*$SZREG;
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000";
+
+my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
+my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));
+my ($key0,$cnt)=($gp,$fp);
+
+# instuction ordering is "stolen" from output from MIPSpro assembler
+# invoked with -mips3 -O3 arguments...
+$code.=<<___;
+.align 5
+.ent _mips_AES_encrypt
+_mips_AES_encrypt:
+ .frame $sp,0,$ra
+ .set reorder
+ lw $t0,0($key)
+ lw $t1,4($key)
+ lw $t2,8($key)
+ lw $t3,12($key)
+ lw $cnt,240($key)
+ $PTR_ADD $key0,$key,16
+
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ sub $cnt,1
+#if defined(__mips_smartmips)
+ ext $i0,$s1,16,8
+.Loop_enc:
+ ext $i1,$s2,16,8
+ ext $i2,$s3,16,8
+ ext $i3,$s0,16,8
+ lwxs $t0,$i0($Tbl) # Te1[s1>>16]
+ ext $i0,$s2,8,8
+ lwxs $t1,$i1($Tbl) # Te1[s2>>16]
+ ext $i1,$s3,8,8
+ lwxs $t2,$i2($Tbl) # Te1[s3>>16]
+ ext $i2,$s0,8,8
+ lwxs $t3,$i3($Tbl) # Te1[s0>>16]
+ ext $i3,$s1,8,8
+
+ lwxs $t4,$i0($Tbl) # Te2[s2>>8]
+ ext $i0,$s3,0,8
+ lwxs $t5,$i1($Tbl) # Te2[s3>>8]
+ ext $i1,$s0,0,8
+ lwxs $t6,$i2($Tbl) # Te2[s0>>8]
+ ext $i2,$s1,0,8
+ lwxs $t7,$i3($Tbl) # Te2[s1>>8]
+ ext $i3,$s2,0,8
+
+ lwxs $t8,$i0($Tbl) # Te3[s3]
+ ext $i0,$s0,24,8
+ lwxs $t9,$i1($Tbl) # Te3[s0]
+ ext $i1,$s1,24,8
+ lwxs $t10,$i2($Tbl) # Te3[s1]
+ ext $i2,$s2,24,8
+ lwxs $t11,$i3($Tbl) # Te3[s2]
+ ext $i3,$s3,24,8
+
+ rotr $t0,$t0,8
+ rotr $t1,$t1,8
+ rotr $t2,$t2,8
+ rotr $t3,$t3,8
+
+ rotr $t4,$t4,16
+ rotr $t5,$t5,16
+ rotr $t6,$t6,16
+ rotr $t7,$t7,16
+
+ xor $t0,$t4
+ lwxs $t4,$i0($Tbl) # Te0[s0>>24]
+ xor $t1,$t5
+ lwxs $t5,$i1($Tbl) # Te0[s1>>24]
+ xor $t2,$t6
+ lwxs $t6,$i2($Tbl) # Te0[s2>>24]
+ xor $t3,$t7
+ lwxs $t7,$i3($Tbl) # Te0[s3>>24]
+
+ rotr $t8,$t8,24
+ lw $s0,0($key0)
+ rotr $t9,$t9,24
+ lw $s1,4($key0)
+ rotr $t10,$t10,24
+ lw $s2,8($key0)
+ rotr $t11,$t11,24
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ sub $cnt,1
+ $PTR_ADD $key0,16
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+ .set noreorder
+ bnez $cnt,.Loop_enc
+ ext $i0,$s1,16,8
+
+ _xtr $i0,$s1,16-2
+#else
+ _xtr $i0,$s1,16-2
+.Loop_enc:
+ _xtr $i1,$s2,16-2
+ _xtr $i2,$s3,16-2
+ _xtr $i3,$s0,16-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ lw $t0,0($i0) # Te1[s1>>16]
+ _xtr $i0,$s2,8-2
+ lw $t1,0($i1) # Te1[s2>>16]
+ _xtr $i1,$s3,8-2
+ lw $t2,0($i2) # Te1[s3>>16]
+ _xtr $i2,$s0,8-2
+ lw $t3,0($i3) # Te1[s0>>16]
+ _xtr $i3,$s1,8-2
+#else
+ lwl $t0,3($i0) # Te1[s1>>16]
+ lwl $t1,3($i1) # Te1[s2>>16]
+ lwl $t2,3($i2) # Te1[s3>>16]
+ lwl $t3,3($i3) # Te1[s0>>16]
+ lwr $t0,2($i0) # Te1[s1>>16]
+ _xtr $i0,$s2,8-2
+ lwr $t1,2($i1) # Te1[s2>>16]
+ _xtr $i1,$s3,8-2
+ lwr $t2,2($i2) # Te1[s3>>16]
+ _xtr $i2,$s0,8-2
+ lwr $t3,2($i3) # Te1[s0>>16]
+ _xtr $i3,$s1,8-2
+#endif
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ rotr $t0,$t0,8
+ rotr $t1,$t1,8
+ rotr $t2,$t2,8
+ rotr $t3,$t3,8
+# if defined(_MIPSEL)
+ lw $t4,0($i0) # Te2[s2>>8]
+ _xtr $i0,$s3,0-2
+ lw $t5,0($i1) # Te2[s3>>8]
+ _xtr $i1,$s0,0-2
+ lw $t6,0($i2) # Te2[s0>>8]
+ _xtr $i2,$s1,0-2
+ lw $t7,0($i3) # Te2[s1>>8]
+ _xtr $i3,$s2,0-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lw $t8,0($i0) # Te3[s3]
+ $PTR_INS $i0,$s0,2,8
+ lw $t9,0($i1) # Te3[s0]
+ $PTR_INS $i1,$s1,2,8
+ lw $t10,0($i2) # Te3[s1]
+ $PTR_INS $i2,$s2,2,8
+ lw $t11,0($i3) # Te3[s2]
+ $PTR_INS $i3,$s3,2,8
+# else
+ lw $t4,0($i0) # Te2[s2>>8]
+ $PTR_INS $i0,$s3,2,8
+ lw $t5,0($i1) # Te2[s3>>8]
+ $PTR_INS $i1,$s0,2,8
+ lw $t6,0($i2) # Te2[s0>>8]
+ $PTR_INS $i2,$s1,2,8
+ lw $t7,0($i3) # Te2[s1>>8]
+ $PTR_INS $i3,$s2,2,8
+
+ lw $t8,0($i0) # Te3[s3]
+ _xtr $i0,$s0,24-2
+ lw $t9,0($i1) # Te3[s0]
+ _xtr $i1,$s1,24-2
+ lw $t10,0($i2) # Te3[s1]
+ _xtr $i2,$s2,24-2
+ lw $t11,0($i3) # Te3[s2]
+ _xtr $i3,$s3,24-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+# endif
+ rotr $t4,$t4,16
+ rotr $t5,$t5,16
+ rotr $t6,$t6,16
+ rotr $t7,$t7,16
+
+ rotr $t8,$t8,24
+ rotr $t9,$t9,24
+ rotr $t10,$t10,24
+ rotr $t11,$t11,24
+#else
+ lwl $t4,2($i0) # Te2[s2>>8]
+ lwl $t5,2($i1) # Te2[s3>>8]
+ lwl $t6,2($i2) # Te2[s0>>8]
+ lwl $t7,2($i3) # Te2[s1>>8]
+ lwr $t4,1($i0) # Te2[s2>>8]
+ _xtr $i0,$s3,0-2
+ lwr $t5,1($i1) # Te2[s3>>8]
+ _xtr $i1,$s0,0-2
+ lwr $t6,1($i2) # Te2[s0>>8]
+ _xtr $i2,$s1,0-2
+ lwr $t7,1($i3) # Te2[s1>>8]
+ _xtr $i3,$s2,0-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t8,1($i0) # Te3[s3]
+ lwl $t9,1($i1) # Te3[s0]
+ lwl $t10,1($i2) # Te3[s1]
+ lwl $t11,1($i3) # Te3[s2]
+ lwr $t8,0($i0) # Te3[s3]
+ _xtr $i0,$s0,24-2
+ lwr $t9,0($i1) # Te3[s0]
+ _xtr $i1,$s1,24-2
+ lwr $t10,0($i2) # Te3[s1]
+ _xtr $i2,$s2,24-2
+ lwr $t11,0($i3) # Te3[s2]
+ _xtr $i3,$s3,24-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+#endif
+ xor $t0,$t4
+ lw $t4,0($i0) # Te0[s0>>24]
+ xor $t1,$t5
+ lw $t5,0($i1) # Te0[s1>>24]
+ xor $t2,$t6
+ lw $t6,0($i2) # Te0[s2>>24]
+ xor $t3,$t7
+ lw $t7,0($i3) # Te0[s3>>24]
+
+ xor $t0,$t8
+ lw $s0,0($key0)
+ xor $t1,$t9
+ lw $s1,4($key0)
+ xor $t2,$t10
+ lw $s2,8($key0)
+ xor $t3,$t11
+ lw $s3,12($key0)
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ sub $cnt,1
+ $PTR_ADD $key0,16
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+ .set noreorder
+ bnez $cnt,.Loop_enc
+ _xtr $i0,$s1,16-2
+#endif
+
+ .set reorder
+ _xtr $i1,$s2,16-2
+ _xtr $i2,$s3,16-2
+ _xtr $i3,$s0,16-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t0,2($i0) # Te4[s1>>16]
+ _xtr $i0,$s2,8-2
+ lbu $t1,2($i1) # Te4[s2>>16]
+ _xtr $i1,$s3,8-2
+ lbu $t2,2($i2) # Te4[s3>>16]
+ _xtr $i2,$s0,8-2
+ lbu $t3,2($i3) # Te4[s0>>16]
+ _xtr $i3,$s1,8-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+# if defined(_MIPSEL)
+ lbu $t4,2($i0) # Te4[s2>>8]
+ $PTR_INS $i0,$s0,2,8
+ lbu $t5,2($i1) # Te4[s3>>8]
+ $PTR_INS $i1,$s1,2,8
+ lbu $t6,2($i2) # Te4[s0>>8]
+ $PTR_INS $i2,$s2,2,8
+ lbu $t7,2($i3) # Te4[s1>>8]
+ $PTR_INS $i3,$s3,2,8
+
+ lbu $t8,2($i0) # Te4[s0>>24]
+ _xtr $i0,$s3,0-2
+ lbu $t9,2($i1) # Te4[s1>>24]
+ _xtr $i1,$s0,0-2
+ lbu $t10,2($i2) # Te4[s2>>24]
+ _xtr $i2,$s1,0-2
+ lbu $t11,2($i3) # Te4[s3>>24]
+ _xtr $i3,$s2,0-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+# else
+ lbu $t4,2($i0) # Te4[s2>>8]
+ _xtr $i0,$s0,24-2
+ lbu $t5,2($i1) # Te4[s3>>8]
+ _xtr $i1,$s1,24-2
+ lbu $t6,2($i2) # Te4[s0>>8]
+ _xtr $i2,$s2,24-2
+ lbu $t7,2($i3) # Te4[s1>>8]
+ _xtr $i3,$s3,24-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t8,2($i0) # Te4[s0>>24]
+ $PTR_INS $i0,$s3,2,8
+ lbu $t9,2($i1) # Te4[s1>>24]
+ $PTR_INS $i1,$s0,2,8
+ lbu $t10,2($i2) # Te4[s2>>24]
+ $PTR_INS $i2,$s1,2,8
+ lbu $t11,2($i3) # Te4[s3>>24]
+ $PTR_INS $i3,$s2,2,8
+# endif
+ _ins $t0,16
+ _ins $t1,16
+ _ins $t2,16
+ _ins $t3,16
+
+ _ins2 $t0,$t4,8
+ lbu $t4,2($i0) # Te4[s3]
+ _ins2 $t1,$t5,8
+ lbu $t5,2($i1) # Te4[s0]
+ _ins2 $t2,$t6,8
+ lbu $t6,2($i2) # Te4[s1]
+ _ins2 $t3,$t7,8
+ lbu $t7,2($i3) # Te4[s2]
+
+ _ins2 $t0,$t8,24
+ lw $s0,0($key0)
+ _ins2 $t1,$t9,24
+ lw $s1,4($key0)
+ _ins2 $t2,$t10,24
+ lw $s2,8($key0)
+ _ins2 $t3,$t11,24
+ lw $s3,12($key0)
+
+ _ins2 $t0,$t4,0
+ _ins2 $t1,$t5,0
+ _ins2 $t2,$t6,0
+ _ins2 $t3,$t7,0
+#else
+ lbu $t4,2($i0) # Te4[s2>>8]
+ _xtr $i0,$s0,24-2
+ lbu $t5,2($i1) # Te4[s3>>8]
+ _xtr $i1,$s1,24-2
+ lbu $t6,2($i2) # Te4[s0>>8]
+ _xtr $i2,$s2,24-2
+ lbu $t7,2($i3) # Te4[s1>>8]
+ _xtr $i3,$s3,24-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t8,2($i0) # Te4[s0>>24]
+ _xtr $i0,$s3,0-2
+ lbu $t9,2($i1) # Te4[s1>>24]
+ _xtr $i1,$s0,0-2
+ lbu $t10,2($i2) # Te4[s2>>24]
+ _xtr $i2,$s1,0-2
+ lbu $t11,2($i3) # Te4[s3>>24]
+ _xtr $i3,$s2,0-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+
+ _ins $t0,16
+ _ins $t1,16
+ _ins $t2,16
+ _ins $t3,16
+
+ _ins $t4,8
+ _ins $t5,8
+ _ins $t6,8
+ _ins $t7,8
+
+ xor $t0,$t4
+ lbu $t4,2($i0) # Te4[s3]
+ xor $t1,$t5
+ lbu $t5,2($i1) # Te4[s0]
+ xor $t2,$t6
+ lbu $t6,2($i2) # Te4[s1]
+ xor $t3,$t7
+ lbu $t7,2($i3) # Te4[s2]
+
+ _ins $t8,24
+ lw $s0,0($key0)
+ _ins $t9,24
+ lw $s1,4($key0)
+ _ins $t10,24
+ lw $s2,8($key0)
+ _ins $t11,24
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ _ins $t4,0
+ _ins $t5,0
+ _ins $t6,0
+ _ins $t7,0
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+#endif
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ jr $ra
+.end _mips_AES_encrypt
+
+.align 5
+.globl AES_encrypt
+.ent AES_encrypt
+AES_encrypt:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,AES_encrypt
+___
+$code.=<<___;
+ .set reorder
+ $PTR_LA $Tbl,AES_Te # PIC-ified 'load address'
+
+ lwl $s0,0+$MSB($inp)
+ lwl $s1,4+$MSB($inp)
+ lwl $s2,8+$MSB($inp)
+ lwl $s3,12+$MSB($inp)
+ lwr $s0,0+$LSB($inp)
+ lwr $s1,4+$LSB($inp)
+ lwr $s2,8+$LSB($inp)
+ lwr $s3,12+$LSB($inp)
+
+ bal _mips_AES_encrypt
+
+ swr $s0,0+$LSB($out)
+ swr $s1,4+$LSB($out)
+ swr $s2,8+$LSB($out)
+ swr $s3,12+$LSB($out)
+ swl $s0,0+$MSB($out)
+ swl $s1,4+$MSB($out)
+ swl $s2,8+$MSB($out)
+ swl $s3,12+$MSB($out)
+
+ .set noreorder
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end AES_encrypt
+___
+
+$code.=<<___;
+.align 5
+.ent _mips_AES_decrypt
+_mips_AES_decrypt:
+ .frame $sp,0,$ra
+ .set reorder
+ lw $t0,0($key)
+ lw $t1,4($key)
+ lw $t2,8($key)
+ lw $t3,12($key)
+ lw $cnt,240($key)
+ $PTR_ADD $key0,$key,16
+
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ sub $cnt,1
+#if defined(__mips_smartmips)
+ ext $i0,$s3,16,8
+.Loop_dec:
+ ext $i1,$s0,16,8
+ ext $i2,$s1,16,8
+ ext $i3,$s2,16,8
+ lwxs $t0,$i0($Tbl) # Td1[s3>>16]
+ ext $i0,$s2,8,8
+ lwxs $t1,$i1($Tbl) # Td1[s0>>16]
+ ext $i1,$s3,8,8
+ lwxs $t2,$i2($Tbl) # Td1[s1>>16]
+ ext $i2,$s0,8,8
+ lwxs $t3,$i3($Tbl) # Td1[s2>>16]
+ ext $i3,$s1,8,8
+
+ lwxs $t4,$i0($Tbl) # Td2[s2>>8]
+ ext $i0,$s1,0,8
+ lwxs $t5,$i1($Tbl) # Td2[s3>>8]
+ ext $i1,$s2,0,8
+ lwxs $t6,$i2($Tbl) # Td2[s0>>8]
+ ext $i2,$s3,0,8
+ lwxs $t7,$i3($Tbl) # Td2[s1>>8]
+ ext $i3,$s0,0,8
+
+ lwxs $t8,$i0($Tbl) # Td3[s1]
+ ext $i0,$s0,24,8
+ lwxs $t9,$i1($Tbl) # Td3[s2]
+ ext $i1,$s1,24,8
+ lwxs $t10,$i2($Tbl) # Td3[s3]
+ ext $i2,$s2,24,8
+ lwxs $t11,$i3($Tbl) # Td3[s0]
+ ext $i3,$s3,24,8
+
+ rotr $t0,$t0,8
+ rotr $t1,$t1,8
+ rotr $t2,$t2,8
+ rotr $t3,$t3,8
+
+ rotr $t4,$t4,16
+ rotr $t5,$t5,16
+ rotr $t6,$t6,16
+ rotr $t7,$t7,16
+
+ xor $t0,$t4
+ lwxs $t4,$i0($Tbl) # Td0[s0>>24]
+ xor $t1,$t5
+ lwxs $t5,$i1($Tbl) # Td0[s1>>24]
+ xor $t2,$t6
+ lwxs $t6,$i2($Tbl) # Td0[s2>>24]
+ xor $t3,$t7
+ lwxs $t7,$i3($Tbl) # Td0[s3>>24]
+
+ rotr $t8,$t8,24
+ lw $s0,0($key0)
+ rotr $t9,$t9,24
+ lw $s1,4($key0)
+ rotr $t10,$t10,24
+ lw $s2,8($key0)
+ rotr $t11,$t11,24
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ sub $cnt,1
+ $PTR_ADD $key0,16
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+ .set noreorder
+ bnez $cnt,.Loop_dec
+ ext $i0,$s3,16,8
+
+ _xtr $i0,$s3,16-2
+#else
+ _xtr $i0,$s3,16-2
+.Loop_dec:
+ _xtr $i1,$s0,16-2
+ _xtr $i2,$s1,16-2
+ _xtr $i3,$s2,16-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ lw $t0,0($i0) # Td1[s3>>16]
+ _xtr $i0,$s2,8-2
+ lw $t1,0($i1) # Td1[s0>>16]
+ _xtr $i1,$s3,8-2
+ lw $t2,0($i2) # Td1[s1>>16]
+ _xtr $i2,$s0,8-2
+ lw $t3,0($i3) # Td1[s2>>16]
+ _xtr $i3,$s1,8-2
+#else
+ lwl $t0,3($i0) # Td1[s3>>16]
+ lwl $t1,3($i1) # Td1[s0>>16]
+ lwl $t2,3($i2) # Td1[s1>>16]
+ lwl $t3,3($i3) # Td1[s2>>16]
+ lwr $t0,2($i0) # Td1[s3>>16]
+ _xtr $i0,$s2,8-2
+ lwr $t1,2($i1) # Td1[s0>>16]
+ _xtr $i1,$s3,8-2
+ lwr $t2,2($i2) # Td1[s1>>16]
+ _xtr $i2,$s0,8-2
+ lwr $t3,2($i3) # Td1[s2>>16]
+ _xtr $i3,$s1,8-2
+#endif
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ rotr $t0,$t0,8
+ rotr $t1,$t1,8
+ rotr $t2,$t2,8
+ rotr $t3,$t3,8
+# if defined(_MIPSEL)
+ lw $t4,0($i0) # Td2[s2>>8]
+ _xtr $i0,$s1,0-2
+ lw $t5,0($i1) # Td2[s3>>8]
+ _xtr $i1,$s2,0-2
+ lw $t6,0($i2) # Td2[s0>>8]
+ _xtr $i2,$s3,0-2
+ lw $t7,0($i3) # Td2[s1>>8]
+ _xtr $i3,$s0,0-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lw $t8,0($i0) # Td3[s1]
+ $PTR_INS $i0,$s0,2,8
+ lw $t9,0($i1) # Td3[s2]
+ $PTR_INS $i1,$s1,2,8
+ lw $t10,0($i2) # Td3[s3]
+ $PTR_INS $i2,$s2,2,8
+ lw $t11,0($i3) # Td3[s0]
+ $PTR_INS $i3,$s3,2,8
+#else
+ lw $t4,0($i0) # Td2[s2>>8]
+ $PTR_INS $i0,$s1,2,8
+ lw $t5,0($i1) # Td2[s3>>8]
+ $PTR_INS $i1,$s2,2,8
+ lw $t6,0($i2) # Td2[s0>>8]
+ $PTR_INS $i2,$s3,2,8
+ lw $t7,0($i3) # Td2[s1>>8]
+ $PTR_INS $i3,$s0,2,8
+
+ lw $t8,0($i0) # Td3[s1]
+ _xtr $i0,$s0,24-2
+ lw $t9,0($i1) # Td3[s2]
+ _xtr $i1,$s1,24-2
+ lw $t10,0($i2) # Td3[s3]
+ _xtr $i2,$s2,24-2
+ lw $t11,0($i3) # Td3[s0]
+ _xtr $i3,$s3,24-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+#endif
+ rotr $t4,$t4,16
+ rotr $t5,$t5,16
+ rotr $t6,$t6,16
+ rotr $t7,$t7,16
+
+ rotr $t8,$t8,24
+ rotr $t9,$t9,24
+ rotr $t10,$t10,24
+ rotr $t11,$t11,24
+#else
+ lwl $t4,2($i0) # Td2[s2>>8]
+ lwl $t5,2($i1) # Td2[s3>>8]
+ lwl $t6,2($i2) # Td2[s0>>8]
+ lwl $t7,2($i3) # Td2[s1>>8]
+ lwr $t4,1($i0) # Td2[s2>>8]
+ _xtr $i0,$s1,0-2
+ lwr $t5,1($i1) # Td2[s3>>8]
+ _xtr $i1,$s2,0-2
+ lwr $t6,1($i2) # Td2[s0>>8]
+ _xtr $i2,$s3,0-2
+ lwr $t7,1($i3) # Td2[s1>>8]
+ _xtr $i3,$s0,0-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t8,1($i0) # Td3[s1]
+ lwl $t9,1($i1) # Td3[s2]
+ lwl $t10,1($i2) # Td3[s3]
+ lwl $t11,1($i3) # Td3[s0]
+ lwr $t8,0($i0) # Td3[s1]
+ _xtr $i0,$s0,24-2
+ lwr $t9,0($i1) # Td3[s2]
+ _xtr $i1,$s1,24-2
+ lwr $t10,0($i2) # Td3[s3]
+ _xtr $i2,$s2,24-2
+ lwr $t11,0($i3) # Td3[s0]
+ _xtr $i3,$s3,24-2
+
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+#endif
+
+ xor $t0,$t4
+ lw $t4,0($i0) # Td0[s0>>24]
+ xor $t1,$t5
+ lw $t5,0($i1) # Td0[s1>>24]
+ xor $t2,$t6
+ lw $t6,0($i2) # Td0[s2>>24]
+ xor $t3,$t7
+ lw $t7,0($i3) # Td0[s3>>24]
+
+ xor $t0,$t8
+ lw $s0,0($key0)
+ xor $t1,$t9
+ lw $s1,4($key0)
+ xor $t2,$t10
+ lw $s2,8($key0)
+ xor $t3,$t11
+ lw $s3,12($key0)
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ sub $cnt,1
+ $PTR_ADD $key0,16
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+ .set noreorder
+ bnez $cnt,.Loop_dec
+ _xtr $i0,$s3,16-2
+#endif
+
+ .set reorder
+ lw $t4,1024($Tbl) # prefetch Td4
+ _xtr $i0,$s3,16
+ lw $t5,1024+32($Tbl)
+ _xtr $i1,$s0,16
+ lw $t6,1024+64($Tbl)
+ _xtr $i2,$s1,16
+ lw $t7,1024+96($Tbl)
+ _xtr $i3,$s2,16
+ lw $t8,1024+128($Tbl)
+ and $i0,0xff
+ lw $t9,1024+160($Tbl)
+ and $i1,0xff
+ lw $t10,1024+192($Tbl)
+ and $i2,0xff
+ lw $t11,1024+224($Tbl)
+ and $i3,0xff
+
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t0,1024($i0) # Td4[s3>>16]
+ _xtr $i0,$s2,8
+ lbu $t1,1024($i1) # Td4[s0>>16]
+ _xtr $i1,$s3,8
+ lbu $t2,1024($i2) # Td4[s1>>16]
+ _xtr $i2,$s0,8
+ lbu $t3,1024($i3) # Td4[s2>>16]
+ _xtr $i3,$s1,8
+
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,0xff
+ and $i3,0xff
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+# if defined(_MIPSEL)
+ lbu $t4,1024($i0) # Td4[s2>>8]
+ $PTR_INS $i0,$s0,0,8
+ lbu $t5,1024($i1) # Td4[s3>>8]
+ $PTR_INS $i1,$s1,0,8
+ lbu $t6,1024($i2) # Td4[s0>>8]
+ $PTR_INS $i2,$s2,0,8
+ lbu $t7,1024($i3) # Td4[s1>>8]
+ $PTR_INS $i3,$s3,0,8
+
+ lbu $t8,1024($i0) # Td4[s0>>24]
+ _xtr $i0,$s1,0
+ lbu $t9,1024($i1) # Td4[s1>>24]
+ _xtr $i1,$s2,0
+ lbu $t10,1024($i2) # Td4[s2>>24]
+ _xtr $i2,$s3,0
+ lbu $t11,1024($i3) # Td4[s3>>24]
+ _xtr $i3,$s0,0
+
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+# else
+ lbu $t4,1024($i0) # Td4[s2>>8]
+ _xtr $i0,$s0,24
+ lbu $t5,1024($i1) # Td4[s3>>8]
+ _xtr $i1,$s1,24
+ lbu $t6,1024($i2) # Td4[s0>>8]
+ _xtr $i2,$s2,24
+ lbu $t7,1024($i3) # Td4[s1>>8]
+ _xtr $i3,$s3,24
+
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t8,1024($i0) # Td4[s0>>24]
+ $PTR_INS $i0,$s1,0,8
+ lbu $t9,1024($i1) # Td4[s1>>24]
+ $PTR_INS $i1,$s2,0,8
+ lbu $t10,1024($i2) # Td4[s2>>24]
+ $PTR_INS $i2,$s3,0,8
+ lbu $t11,1024($i3) # Td4[s3>>24]
+ $PTR_INS $i3,$s0,0,8
+# endif
+ _ins $t0,16
+ _ins $t1,16
+ _ins $t2,16
+ _ins $t3,16
+
+ _ins2 $t0,$t4,8
+ lbu $t4,1024($i0) # Td4[s1]
+ _ins2 $t1,$t5,8
+ lbu $t5,1024($i1) # Td4[s2]
+ _ins2 $t2,$t6,8
+ lbu $t6,1024($i2) # Td4[s3]
+ _ins2 $t3,$t7,8
+ lbu $t7,1024($i3) # Td4[s0]
+
+ _ins2 $t0,$t8,24
+ lw $s0,0($key0)
+ _ins2 $t1,$t9,24
+ lw $s1,4($key0)
+ _ins2 $t2,$t10,24
+ lw $s2,8($key0)
+ _ins2 $t3,$t11,24
+ lw $s3,12($key0)
+
+ _ins2 $t0,$t4,0
+ _ins2 $t1,$t5,0
+ _ins2 $t2,$t6,0
+ _ins2 $t3,$t7,0
+#else
+ lbu $t4,1024($i0) # Td4[s2>>8]
+ _xtr $i0,$s0,24
+ lbu $t5,1024($i1) # Td4[s3>>8]
+ _xtr $i1,$s1,24
+ lbu $t6,1024($i2) # Td4[s0>>8]
+ _xtr $i2,$s2,24
+ lbu $t7,1024($i3) # Td4[s1>>8]
+ _xtr $i3,$s3,24
+
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t8,1024($i0) # Td4[s0>>24]
+ _xtr $i0,$s1,0
+ lbu $t9,1024($i1) # Td4[s1>>24]
+ _xtr $i1,$s2,0
+ lbu $t10,1024($i2) # Td4[s2>>24]
+ _xtr $i2,$s3,0
+ lbu $t11,1024($i3) # Td4[s3>>24]
+ _xtr $i3,$s0,0
+
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+
+ _ins $t0,16
+ _ins $t1,16
+ _ins $t2,16
+ _ins $t3,16
+
+ _ins $t4,8
+ _ins $t5,8
+ _ins $t6,8
+ _ins $t7,8
+
+ xor $t0,$t4
+ lbu $t4,1024($i0) # Td4[s1]
+ xor $t1,$t5
+ lbu $t5,1024($i1) # Td4[s2]
+ xor $t2,$t6
+ lbu $t6,1024($i2) # Td4[s3]
+ xor $t3,$t7
+ lbu $t7,1024($i3) # Td4[s0]
+
+ _ins $t8,24
+ lw $s0,0($key0)
+ _ins $t9,24
+ lw $s1,4($key0)
+ _ins $t10,24
+ lw $s2,8($key0)
+ _ins $t11,24
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ _ins $t4,0
+ _ins $t5,0
+ _ins $t6,0
+ _ins $t7,0
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+#endif
+
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ jr $ra
+.end _mips_AES_decrypt
+
+.align 5
+.globl AES_decrypt
+.ent AES_decrypt
+AES_decrypt:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,AES_decrypt
+___
+$code.=<<___;
+ .set reorder
+ $PTR_LA $Tbl,AES_Td # PIC-ified 'load address'
+
+ lwl $s0,0+$MSB($inp)
+ lwl $s1,4+$MSB($inp)
+ lwl $s2,8+$MSB($inp)
+ lwl $s3,12+$MSB($inp)
+ lwr $s0,0+$LSB($inp)
+ lwr $s1,4+$LSB($inp)
+ lwr $s2,8+$LSB($inp)
+ lwr $s3,12+$LSB($inp)
+
+ bal _mips_AES_decrypt
+
+ swr $s0,0+$LSB($out)
+ swr $s1,4+$LSB($out)
+ swr $s2,8+$LSB($out)
+ swr $s3,12+$LSB($out)
+ swl $s0,0+$MSB($out)
+ swl $s1,4+$MSB($out)
+ swl $s2,8+$MSB($out)
+ swl $s3,12+$MSB($out)
+
+ .set noreorder
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end AES_decrypt
+___
+}}}
+
+{{{
+my $FRAMESIZE=8*$SZREG;
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc000f008" : "0xc0000000";
+
+my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
+my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
+my ($rcon,$cnt)=($gp,$fp);
+
+$code.=<<___;
+.align 5
+.ent _mips_AES_set_encrypt_key
+_mips_AES_set_encrypt_key:
+ .frame $sp,0,$ra
+ .set noreorder
+ beqz $inp,.Lekey_done
+ li $t0,-1
+ beqz $key,.Lekey_done
+ $PTR_ADD $rcon,$Tbl,256
+
+ .set reorder
+ lwl $rk0,0+$MSB($inp) # load 128 bits
+ lwl $rk1,4+$MSB($inp)
+ lwl $rk2,8+$MSB($inp)
+ lwl $rk3,12+$MSB($inp)
+ li $at,128
+ lwr $rk0,0+$LSB($inp)
+ lwr $rk1,4+$LSB($inp)
+ lwr $rk2,8+$LSB($inp)
+ lwr $rk3,12+$LSB($inp)
+ .set noreorder
+ beq $bits,$at,.L128bits
+ li $cnt,10
+
+ .set reorder
+ lwl $rk4,16+$MSB($inp) # load 192 bits
+ lwl $rk5,20+$MSB($inp)
+ li $at,192
+ lwr $rk4,16+$LSB($inp)
+ lwr $rk5,20+$LSB($inp)
+ .set noreorder
+ beq $bits,$at,.L192bits
+ li $cnt,8
+
+ .set reorder
+ lwl $rk6,24+$MSB($inp) # load 256 bits
+ lwl $rk7,28+$MSB($inp)
+ li $at,256
+ lwr $rk6,24+$LSB($inp)
+ lwr $rk7,28+$LSB($inp)
+ .set noreorder
+ beq $bits,$at,.L256bits
+ li $cnt,7
+
+ b .Lekey_done
+ li $t0,-2
+
+.align 4
+.L128bits:
+ .set reorder
+ srl $i0,$rk3,16
+ srl $i1,$rk3,8
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,$rk3,0xff
+ srl $i3,$rk3,24
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,0($i0)
+ lbu $i1,0($i1)
+ lbu $i2,0($i2)
+ lbu $i3,0($i3)
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ sw $rk3,12($key)
+ sub $cnt,1
+ $PTR_ADD $key,16
+
+ _bias $i0,24
+ _bias $i1,16
+ _bias $i2,8
+ _bias $i3,0
+
+ xor $rk0,$i0
+ lw $i0,0($rcon)
+ xor $rk0,$i1
+ xor $rk0,$i2
+ xor $rk0,$i3
+ xor $rk0,$i0
+
+ xor $rk1,$rk0
+ xor $rk2,$rk1
+ xor $rk3,$rk2
+
+ .set noreorder
+ bnez $cnt,.L128bits
+ $PTR_ADD $rcon,4
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ li $cnt,10
+ sw $rk3,12($key)
+ li $t0,0
+ sw $cnt,80($key)
+ b .Lekey_done
+ $PTR_SUB $key,10*16
+
+.align 4
+.L192bits:
+ .set reorder
+ srl $i0,$rk5,16
+ srl $i1,$rk5,8
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,$rk5,0xff
+ srl $i3,$rk5,24
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,0($i0)
+ lbu $i1,0($i1)
+ lbu $i2,0($i2)
+ lbu $i3,0($i3)
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ sw $rk3,12($key)
+ sw $rk4,16($key)
+ sw $rk5,20($key)
+ sub $cnt,1
+ $PTR_ADD $key,24
+
+ _bias $i0,24
+ _bias $i1,16
+ _bias $i2,8
+ _bias $i3,0
+
+ xor $rk0,$i0
+ lw $i0,0($rcon)
+ xor $rk0,$i1
+ xor $rk0,$i2
+ xor $rk0,$i3
+ xor $rk0,$i0
+
+ xor $rk1,$rk0
+ xor $rk2,$rk1
+ xor $rk3,$rk2
+ xor $rk4,$rk3
+ xor $rk5,$rk4
+
+ .set noreorder
+ bnez $cnt,.L192bits
+ $PTR_ADD $rcon,4
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ li $cnt,12
+ sw $rk3,12($key)
+ li $t0,0
+ sw $cnt,48($key)
+ b .Lekey_done
+ $PTR_SUB $key,12*16
+
+.align 4
+.L256bits:
+ .set reorder
+ srl $i0,$rk7,16
+ srl $i1,$rk7,8
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,$rk7,0xff
+ srl $i3,$rk7,24
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,0($i0)
+ lbu $i1,0($i1)
+ lbu $i2,0($i2)
+ lbu $i3,0($i3)
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ sw $rk3,12($key)
+ sw $rk4,16($key)
+ sw $rk5,20($key)
+ sw $rk6,24($key)
+ sw $rk7,28($key)
+ sub $cnt,1
+
+ _bias $i0,24
+ _bias $i1,16
+ _bias $i2,8
+ _bias $i3,0
+
+ xor $rk0,$i0
+ lw $i0,0($rcon)
+ xor $rk0,$i1
+ xor $rk0,$i2
+ xor $rk0,$i3
+ xor $rk0,$i0
+
+ xor $rk1,$rk0
+ xor $rk2,$rk1
+ xor $rk3,$rk2
+ beqz $cnt,.L256bits_done
+
+ srl $i0,$rk3,24
+ srl $i1,$rk3,16
+ srl $i2,$rk3,8
+ and $i3,$rk3,0xff
+ and $i1,0xff
+ and $i2,0xff
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,0($i0)
+ lbu $i1,0($i1)
+ lbu $i2,0($i2)
+ lbu $i3,0($i3)
+ sll $i0,24
+ sll $i1,16
+ sll $i2,8
+
+ xor $rk4,$i0
+ xor $rk4,$i1
+ xor $rk4,$i2
+ xor $rk4,$i3
+
+ xor $rk5,$rk4
+ xor $rk6,$rk5
+ xor $rk7,$rk6
+
+ $PTR_ADD $key,32
+ .set noreorder
+ b .L256bits
+ $PTR_ADD $rcon,4
+
+.L256bits_done:
+ sw $rk0,32($key)
+ sw $rk1,36($key)
+ sw $rk2,40($key)
+ li $cnt,14
+ sw $rk3,44($key)
+ li $t0,0
+ sw $cnt,48($key)
+ $PTR_SUB $key,12*16
+
+.Lekey_done:
+ jr $ra
+ nop
+.end _mips_AES_set_encrypt_key
+
+.globl AES_set_encrypt_key
+.ent AES_set_encrypt_key
+AES_set_encrypt_key:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,AES_set_encrypt_key
+___
+$code.=<<___;
+ .set reorder
+ $PTR_LA $Tbl,AES_Te4 # PIC-ified 'load address'
+
+ bal _mips_AES_set_encrypt_key
+
+ .set noreorder
+ move $a0,$t0
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
+ $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
+ $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
+ $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
+ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end AES_set_encrypt_key
+___
+
+my ($head,$tail)=($inp,$bits);
+my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
+my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
+$code.=<<___;
+.align 5
+.globl AES_set_decrypt_key
+.ent AES_set_decrypt_key
+AES_set_decrypt_key:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,AES_set_decrypt_key
+___
+$code.=<<___;
+ .set reorder
+ $PTR_LA $Tbl,AES_Te4 # PIC-ified 'load address'
+
+ bal _mips_AES_set_encrypt_key
+
+ bltz $t0,.Ldkey_done
+
+ sll $at,$cnt,4
+ $PTR_ADD $head,$key,0
+ $PTR_ADD $tail,$key,$at
+.align 4
+.Lswap:
+ lw $rk0,0($head)
+ lw $rk1,4($head)
+ lw $rk2,8($head)
+ lw $rk3,12($head)
+ lw $rk4,0($tail)
+ lw $rk5,4($tail)
+ lw $rk6,8($tail)
+ lw $rk7,12($tail)
+ sw $rk0,0($tail)
+ sw $rk1,4($tail)
+ sw $rk2,8($tail)
+ sw $rk3,12($tail)
+ $PTR_ADD $head,16
+ $PTR_SUB $tail,16
+ sw $rk4,-16($head)
+ sw $rk5,-12($head)
+ sw $rk6,-8($head)
+ sw $rk7,-4($head)
+ bne $head,$tail,.Lswap
+
+ lw $tp1,16($key) # modulo-scheduled
+ lui $x80808080,0x8080
+ sub $cnt,1
+ or $x80808080,0x8080
+ sll $cnt,2
+ $PTR_ADD $key,16
+ lui $x1b1b1b1b,0x1b1b
+ nor $x7f7f7f7f,$zero,$x80808080
+ or $x1b1b1b1b,0x1b1b
+.align 4
+.Lmix:
+ and $m,$tp1,$x80808080
+ and $tp2,$tp1,$x7f7f7f7f
+ srl $tp4,$m,7
+ addu $tp2,$tp2 # tp2<<1
+ subu $m,$tp4
+ and $m,$x1b1b1b1b
+ xor $tp2,$m
+
+ and $m,$tp2,$x80808080
+ and $tp4,$tp2,$x7f7f7f7f
+ srl $tp8,$m,7
+ addu $tp4,$tp4 # tp4<<1
+ subu $m,$tp8
+ and $m,$x1b1b1b1b
+ xor $tp4,$m
+
+ and $m,$tp4,$x80808080
+ and $tp8,$tp4,$x7f7f7f7f
+ srl $tp9,$m,7
+ addu $tp8,$tp8 # tp8<<1
+ subu $m,$tp9
+ and $m,$x1b1b1b1b
+ xor $tp8,$m
+
+ xor $tp9,$tp8,$tp1
+ xor $tpe,$tp8,$tp4
+ xor $tpb,$tp9,$tp2
+ xor $tpd,$tp9,$tp4
+
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+ rotr $tp1,$tpd,16
+ xor $tpe,$tp2
+ rotr $tp2,$tp9,8
+ xor $tpe,$tp1
+ rotr $tp4,$tpb,24
+ xor $tpe,$tp2
+ lw $tp1,4($key) # modulo-scheduled
+ xor $tpe,$tp4
+#else
+ _ror $tp1,$tpd,16
+ xor $tpe,$tp2
+ _ror $tp2,$tpd,-16
+ xor $tpe,$tp1
+ _ror $tp1,$tp9,8
+ xor $tpe,$tp2
+ _ror $tp2,$tp9,-24
+ xor $tpe,$tp1
+ _ror $tp1,$tpb,24
+ xor $tpe,$tp2
+ _ror $tp2,$tpb,-8
+ xor $tpe,$tp1
+ lw $tp1,4($key) # modulo-scheduled
+ xor $tpe,$tp2
+#endif
+ sub $cnt,1
+ sw $tpe,0($key)
+ $PTR_ADD $key,4
+ bnez $cnt,.Lmix
+
+ li $t0,0
+.Ldkey_done:
+ .set noreorder
+ move $a0,$t0
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
+ $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
+ $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
+ $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
+ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end AES_set_decrypt_key
+___
+}}}
+
+######################################################################
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+.rdata
+.align 10
+AES_Te:
+.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0
+.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
+.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
+.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
+.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
+.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
+.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
+.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
+.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
+.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
+.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
+.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
+.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
+.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
+.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
+.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
+.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
+.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
+.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
+.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
+.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
+.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
+.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
+.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
+.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
+.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
+.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
+.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
+.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
+.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
+.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
+.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
+.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
+.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
+.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
+.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
+.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
+.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
+.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
+.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
+.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
+.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
+.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
+.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
+.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
+.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
+.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
+.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
+.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
+.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
+.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
+.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
+.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
+.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
+.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
+.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
+.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
+.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
+.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
+.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
+.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
+.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
+.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
+.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
+.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
+.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
+.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
+.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
+.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
+.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
+.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
+.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
+.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
+.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
+.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
+.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
+.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
+.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
+.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
+.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
+.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
+.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
+.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
+.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
+.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
+.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
+.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
+.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
+.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
+.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
+.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
+.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
+.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
+.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
+.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
+.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
+.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
+.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
+.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
+.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
+.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
+.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
+.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
+.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
+.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
+.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
+.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
+.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
+.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
+.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
+.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
+.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
+.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
+.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
+.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
+.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
+.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
+.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
+.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
+.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
+.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
+.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
+.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
+.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
+.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
+.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
+.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
+.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
+
+AES_Td:
+.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0
+.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
+.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
+.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
+.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
+.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
+.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
+.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
+.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
+.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
+.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
+.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
+.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
+.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
+.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
+.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
+.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
+.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
+.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
+.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
+.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
+.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
+.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
+.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
+.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
+.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
+.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
+.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
+.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
+.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
+.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
+.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
+.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
+.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
+.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
+.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
+.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
+.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
+.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
+.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
+.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
+.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
+.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
+.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
+.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
+.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
+.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
+.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
+.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
+.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
+.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
+.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
+.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
+.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
+.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
+.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
+.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
+.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
+.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
+.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
+.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
+.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
+.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
+.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
+.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
+.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
+.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
+.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
+.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
+.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
+.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
+.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
+.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
+.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
+.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
+.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
+.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
+.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
+.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
+.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
+.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
+.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
+.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
+.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
+.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
+.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
+.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
+.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
+.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
+.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
+.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
+.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
+.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
+.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
+.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
+.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
+.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
+.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
+.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
+.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
+.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
+.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
+.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
+.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
+.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
+.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
+.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
+.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
+.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
+.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
+.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
+.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
+.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
+.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
+.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
+.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
+.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
+.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
+.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
+.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
+.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
+.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
+.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
+.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
+.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
+.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
+.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
+.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
+
+.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4
+.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+AES_Te4:
+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
+.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
+.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
+.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
+.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ # made-up _instructions, _xtr, _ins, _ror and _bias, cope
+ # with byte order dependencies...
+ if (/^\s+_/) {
+ s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
+
+ s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
+ sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("24-$3"))/e or
+ s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+ sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("24-$3"))/e or
+ s/_ins2\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+ sprintf("ins\t$1,$2,%d,8",$big_endian ? eval($3)
+ : eval("24-$3"))/e or
+ s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
+ sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("$3*-1"))/e or
+ s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+ sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("($3-16)&31"))/e;
+
+ s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
+ sprintf("sll\t$1,$2,$3")/e or
+ s/srl\s+(\$[0-9]+),(\$[0-9]+),0/
+ sprintf("and\t$1,$2,0xff")/e or
+ s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/;
+ }
+
+ # convert lwl/lwr and swr/swl to little-endian order
+ if (!$big_endian && /^\s+[sl]w[lr]\s+/) {
+ s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/
+ sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or
+ s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/
+ sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
+ }
+
+ if (!$big_endian) {
+ s/(rotr\s+\$[0-9]+,\$[0-9]+),([0-9]+)/sprintf("$1,%d",32-$2)/e;
+ s/(ext\s+\$[0-9]+,\$[0-9]+),([0-9]+),8/sprintf("$1,%d,8",24-$2)/e;
+ }
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aes-parisc.pl b/openssl-1.1.0h/crypto/aes/asm/aes-parisc.pl
new file mode 100644
index 0000000..2c785bc
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aes-parisc.pl
@@ -0,0 +1,1029 @@
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for PA-RISC.
+#
+# June 2009.
+#
+# The module is mechanical transliteration of aes-sparcv9.pl, but with
+# a twist: S-boxes are compressed even further down to 1K+256B. On
+# PA-7100LC performance is ~40% better than gcc 3.2 generated code and
+# is about 33 cycles per byte processed with 128-bit key. Newer CPUs
+# perform at 16 cycles per byte. It's not faster than code generated
+# by vendor compiler, but recall that it has compressed S-boxes, which
+# requires extra processing.
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+} else {
+ $LEVEL ="1.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+}
+
+$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
+ # [+ argument transfer]
+$inp="%r26"; # arg0
+$out="%r25"; # arg1
+$key="%r24"; # arg2
+
+($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
+($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
+
+($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
+ $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
+("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
+"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
+
+$tbl="%r28";
+$rounds="%r29";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 64
+AES_encrypt
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ blr %r0,$tbl
+ ldi 3,$t0
+L\$enc_pic
+ andcm $tbl,$t0,$tbl
+ ldo L\$AES_Te-L\$enc_pic($tbl),$tbl
+
+ and $inp,$t0,$t0
+ sub $inp,$t0,$inp
+ ldw 0($inp),$s0
+ ldw 4($inp),$s1
+ ldw 8($inp),$s2
+ comib,= 0,$t0,L\$enc_inp_aligned
+ ldw 12($inp),$s3
+
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11
+ ldw 16($inp),$t1
+ vshd $s0,$s1,$s0
+ vshd $s1,$s2,$s1
+ vshd $s2,$s3,$s2
+ vshd $s3,$t1,$s3
+
+L\$enc_inp_aligned
+ bl _parisc_AES_encrypt,%r31
+ nop
+
+ extru,<> $out,31,2,%r0
+ b L\$enc_out_aligned
+ nop
+
+ _srm $s0,24,$acc0
+ _srm $s0,16,$acc1
+ stb $acc0,0($out)
+ _srm $s0,8,$acc2
+ stb $acc1,1($out)
+ _srm $s1,24,$acc4
+ stb $acc2,2($out)
+ _srm $s1,16,$acc5
+ stb $s0,3($out)
+ _srm $s1,8,$acc6
+ stb $acc4,4($out)
+ _srm $s2,24,$acc0
+ stb $acc5,5($out)
+ _srm $s2,16,$acc1
+ stb $acc6,6($out)
+ _srm $s2,8,$acc2
+ stb $s1,7($out)
+ _srm $s3,24,$acc4
+ stb $acc0,8($out)
+ _srm $s3,16,$acc5
+ stb $acc1,9($out)
+ _srm $s3,8,$acc6
+ stb $acc2,10($out)
+ stb $s2,11($out)
+ stb $acc4,12($out)
+ stb $acc5,13($out)
+ stb $acc6,14($out)
+ b L\$enc_done
+ stb $s3,15($out)
+
+L\$enc_out_aligned
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+
+L\$enc_done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 16
+_parisc_AES_encrypt
+ .PROC
+ .CALLINFO MILLICODE
+ .ENTRY
+ ldw 240($key),$rounds
+ ldw 0($key),$t0
+ ldw 4($key),$t1
+ ldw 8($key),$t2
+ _srm $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ldw 12($key),$t3
+ _srm $s0,24,$acc0
+ xor $t1,$s1,$s1
+ ldw 16($key),$t0
+ _srm $s1,16,$acc1
+ xor $t2,$s2,$s2
+ ldw 20($key),$t1
+ xor $t3,$s3,$s3
+ ldw 24($key),$t2
+ ldw 28($key),$t3
+L\$enc_loop
+ _srm $s2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $s3,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $s1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $s2,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $s3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $s0,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $s2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $s3,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $s0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $s1,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $s3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $s0,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $s1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $s2,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ ldwx,s $acc14($tbl),$acc14
+ ldwx,s $acc15($tbl),$acc15
+ addib,= -1,$rounds,L\$enc_last
+ ldo 32($key),$key
+
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ _srm $t1,16,$acc1
+ xor $acc15,$t3,$t3
+
+ _srm $t2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $t3,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $t1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $t2,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $t3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $t0,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $t2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $t3,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $t0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $t1,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $t3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $t0,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $t1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $t2,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ _ror $acc1,8,$acc1
+ ldwx,s $acc14($tbl),$acc14
+
+ _ror $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldwx,s $acc15($tbl),$acc15
+ _ror $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ldw 16($key),$t0
+ _ror $acc5,8,$acc5
+ xor $acc2,$s0,$s0
+ ldw 20($key),$t1
+ _ror $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ldw 24($key),$t2
+ _ror $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ldw 28($key),$t3
+ _ror $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldw 1024+0($tbl),%r0 ; prefetch te4
+ _ror $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldw 1024+32($tbl),%r0 ; prefetch te4
+ _ror $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldw 1024+64($tbl),%r0 ; prefetch te4
+ _ror $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldw 1024+96($tbl),%r0 ; prefetch te4
+ _ror $acc14,16,$acc14
+ xor $acc9,$s2,$s2
+ ldw 1024+128($tbl),%r0 ; prefetch te4
+ _ror $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldw 1024+160($tbl),%r0 ; prefetch te4
+ _srm $s0,24,$acc0
+ xor $acc11,$s2,$s2
+ ldw 1024+192($tbl),%r0 ; prefetch te4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldw 1024+224($tbl),%r0 ; prefetch te4
+ _srm $s1,16,$acc1
+ xor $acc14,$s3,$s3
+ b L\$enc_loop
+ xor $acc15,$s3,$s3
+
+ .ALIGN 16
+L\$enc_last
+ ldo 1024($tbl),$rounds
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ _srm $t1,16,$acc1
+ xor $acc15,$t3,$t3
+
+ _srm $t2,8,$acc2
+ ldbx $acc0($rounds),$acc0
+ _srm $t1,24,$acc4
+ ldbx $acc1($rounds),$acc1
+ _srm $t2,16,$acc5
+ _srm $t3,0,$acc3
+ ldbx $acc2($rounds),$acc2
+ ldbx $acc3($rounds),$acc3
+ _srm $t3,8,$acc6
+ ldbx $acc4($rounds),$acc4
+ _srm $t2,24,$acc8
+ ldbx $acc5($rounds),$acc5
+ _srm $t3,16,$acc9
+ _srm $t0,0,$acc7
+ ldbx $acc6($rounds),$acc6
+ ldbx $acc7($rounds),$acc7
+ _srm $t0,8,$acc10
+ ldbx $acc8($rounds),$acc8
+ _srm $t3,24,$acc12
+ ldbx $acc9($rounds),$acc9
+ _srm $t0,16,$acc13
+ _srm $t1,0,$acc11
+ ldbx $acc10($rounds),$acc10
+ _srm $t1,8,$acc14
+ ldbx $acc11($rounds),$acc11
+ ldbx $acc12($rounds),$acc12
+ ldbx $acc13($rounds),$acc13
+ _srm $t2,0,$acc15
+ ldbx $acc14($rounds),$acc14
+
+ dep $acc0,7,8,$acc3
+ ldbx $acc15($rounds),$acc15
+ dep $acc4,7,8,$acc7
+ dep $acc1,15,8,$acc3
+ dep $acc5,15,8,$acc7
+ dep $acc2,23,8,$acc3
+ dep $acc6,23,8,$acc7
+ xor $acc3,$s0,$s0
+ xor $acc7,$s1,$s1
+ dep $acc8,7,8,$acc11
+ dep $acc12,7,8,$acc15
+ dep $acc9,15,8,$acc11
+ dep $acc13,15,8,$acc15
+ dep $acc10,23,8,$acc11
+ dep $acc14,23,8,$acc15
+ xor $acc11,$s2,$s2
+
+ bv (%r31)
+ .EXIT
+ xor $acc15,$s3,$s3
+ .PROCEND
+
+ .ALIGN 64
+L\$AES_Te
+ .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
+ .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
+ .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
+ .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
+ .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
+ .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
+ .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
+ .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
+ .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
+ .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
+ .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
+ .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
+ .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
+ .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
+ .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
+ .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
+ .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
+ .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
+ .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
+ .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
+ .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
+ .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
+ .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
+ .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
+ .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
+ .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
+ .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
+ .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
+ .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
+ .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
+ .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
+ .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
+ .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
+ .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
+ .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
+ .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
+ .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
+ .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
+ .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
+ .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
+ .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
+ .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
+ .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
+ .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
+ .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
+ .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
+ .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
+ .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
+ .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
+ .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
+ .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
+ .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
+ .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
+ .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
+ .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
+ .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
+ .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
+ .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
+ .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
+ .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
+ .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
+ .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
+ .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
+ .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
+ .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+ .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+ .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+ .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+ .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+ .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+ .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+ .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+ .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+ .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+ .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+ .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+ .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+ .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+ .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+ .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+ .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+ .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+ .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+ .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+ .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+ .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+ .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+ .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+ .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+ .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+ .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+ .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+ .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+ .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+ .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+ .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+___
+
+$code.=<<___;
+ .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 16
+AES_decrypt
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ blr %r0,$tbl
+ ldi 3,$t0
+L\$dec_pic
+ andcm $tbl,$t0,$tbl
+ ldo L\$AES_Td-L\$dec_pic($tbl),$tbl
+
+ and $inp,$t0,$t0
+ sub $inp,$t0,$inp
+ ldw 0($inp),$s0
+ ldw 4($inp),$s1
+ ldw 8($inp),$s2
+ comib,= 0,$t0,L\$dec_inp_aligned
+ ldw 12($inp),$s3
+
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11
+ ldw 16($inp),$t1
+ vshd $s0,$s1,$s0
+ vshd $s1,$s2,$s1
+ vshd $s2,$s3,$s2
+ vshd $s3,$t1,$s3
+
+L\$dec_inp_aligned
+ bl _parisc_AES_decrypt,%r31
+ nop
+
+ extru,<> $out,31,2,%r0
+ b L\$dec_out_aligned
+ nop
+
+ _srm $s0,24,$acc0
+ _srm $s0,16,$acc1
+ stb $acc0,0($out)
+ _srm $s0,8,$acc2
+ stb $acc1,1($out)
+ _srm $s1,24,$acc4
+ stb $acc2,2($out)
+ _srm $s1,16,$acc5
+ stb $s0,3($out)
+ _srm $s1,8,$acc6
+ stb $acc4,4($out)
+ _srm $s2,24,$acc0
+ stb $acc5,5($out)
+ _srm $s2,16,$acc1
+ stb $acc6,6($out)
+ _srm $s2,8,$acc2
+ stb $s1,7($out)
+ _srm $s3,24,$acc4
+ stb $acc0,8($out)
+ _srm $s3,16,$acc5
+ stb $acc1,9($out)
+ _srm $s3,8,$acc6
+ stb $acc2,10($out)
+ stb $s2,11($out)
+ stb $acc4,12($out)
+ stb $acc5,13($out)
+ stb $acc6,14($out)
+ b L\$dec_done
+ stb $s3,15($out)
+
+L\$dec_out_aligned
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+
+L\$dec_done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 16
+_parisc_AES_decrypt
+ .PROC
+ .CALLINFO MILLICODE
+ .ENTRY
+ ldw 240($key),$rounds
+ ldw 0($key),$t0
+ ldw 4($key),$t1
+ ldw 8($key),$t2
+ ldw 12($key),$t3
+ _srm $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ldw 16($key),$t0
+ xor $t1,$s1,$s1
+ ldw 20($key),$t1
+ _srm $s0,24,$acc0
+ xor $t2,$s2,$s2
+ ldw 24($key),$t2
+ xor $t3,$s3,$s3
+ ldw 28($key),$t3
+ _srm $s3,16,$acc1
+L\$dec_loop
+ _srm $s2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $s1,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $s1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $s0,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $s3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $s2,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $s2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $s1,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $s0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $s3,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $s3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $s2,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $s1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $s0,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ ldwx,s $acc14($tbl),$acc14
+ ldwx,s $acc15($tbl),$acc15
+ addib,= -1,$rounds,L\$dec_last
+ ldo 32($key),$key
+
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3
+ _srm $t3,16,$acc1
+
+ _srm $t2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $t1,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $t1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $t0,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $t3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $t2,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $t2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $t1,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $t0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $t3,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $t3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $t2,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $t1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $t0,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ _ror $acc1,8,$acc1
+ ldwx,s $acc14($tbl),$acc14
+
+ _ror $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldwx,s $acc15($tbl),$acc15
+ _ror $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ldw 16($key),$t0
+ _ror $acc5,8,$acc5
+ xor $acc2,$s0,$s0
+ ldw 20($key),$t1
+ _ror $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ldw 24($key),$t2
+ _ror $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ldw 28($key),$t3
+ _ror $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldw 1024+0($tbl),%r0 ; prefetch td4
+ _ror $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldw 1024+32($tbl),%r0 ; prefetch td4
+ _ror $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldw 1024+64($tbl),%r0 ; prefetch td4
+ _ror $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldw 1024+96($tbl),%r0 ; prefetch td4
+ _ror $acc14,16,$acc14
+ xor $acc9,$s2,$s2
+ ldw 1024+128($tbl),%r0 ; prefetch td4
+ _ror $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldw 1024+160($tbl),%r0 ; prefetch td4
+ _srm $s0,24,$acc0
+ xor $acc11,$s2,$s2
+ ldw 1024+192($tbl),%r0 ; prefetch td4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldw 1024+224($tbl),%r0 ; prefetch td4
+ xor $acc14,$s3,$s3
+ xor $acc15,$s3,$s3
+ b L\$dec_loop
+ _srm $s3,16,$acc1
+
+ .ALIGN 16
+L\$dec_last
+ ldo 1024($tbl),$rounds
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3
+ _srm $t3,16,$acc1
+
+ _srm $t2,8,$acc2
+ ldbx $acc0($rounds),$acc0
+ _srm $t1,24,$acc4
+ ldbx $acc1($rounds),$acc1
+ _srm $t0,16,$acc5
+ _srm $t1,0,$acc3
+ ldbx $acc2($rounds),$acc2
+ ldbx $acc3($rounds),$acc3
+ _srm $t3,8,$acc6
+ ldbx $acc4($rounds),$acc4
+ _srm $t2,24,$acc8
+ ldbx $acc5($rounds),$acc5
+ _srm $t1,16,$acc9
+ _srm $t2,0,$acc7
+ ldbx $acc6($rounds),$acc6
+ ldbx $acc7($rounds),$acc7
+ _srm $t0,8,$acc10
+ ldbx $acc8($rounds),$acc8
+ _srm $t3,24,$acc12
+ ldbx $acc9($rounds),$acc9
+ _srm $t2,16,$acc13
+ _srm $t3,0,$acc11
+ ldbx $acc10($rounds),$acc10
+ _srm $t1,8,$acc14
+ ldbx $acc11($rounds),$acc11
+ ldbx $acc12($rounds),$acc12
+ ldbx $acc13($rounds),$acc13
+ _srm $t0,0,$acc15
+ ldbx $acc14($rounds),$acc14
+
+ dep $acc0,7,8,$acc3
+ ldbx $acc15($rounds),$acc15
+ dep $acc4,7,8,$acc7
+ dep $acc1,15,8,$acc3
+ dep $acc5,15,8,$acc7
+ dep $acc2,23,8,$acc3
+ dep $acc6,23,8,$acc7
+ xor $acc3,$s0,$s0
+ xor $acc7,$s1,$s1
+ dep $acc8,7,8,$acc11
+ dep $acc12,7,8,$acc15
+ dep $acc9,15,8,$acc11
+ dep $acc13,15,8,$acc15
+ dep $acc10,23,8,$acc11
+ dep $acc14,23,8,$acc15
+ xor $acc11,$s2,$s2
+
+ bv (%r31)
+ .EXIT
+ xor $acc15,$s3,$s3
+ .PROCEND
+
+ .ALIGN 64
+L\$AES_Td
+ .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
+ .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
+ .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
+ .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
+ .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
+ .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
+ .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
+ .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
+ .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
+ .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
+ .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
+ .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
+ .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
+ .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
+ .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
+ .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
+ .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
+ .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
+ .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
+ .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
+ .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
+ .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
+ .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
+ .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
+ .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
+ .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
+ .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
+ .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
+ .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
+ .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
+ .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
+ .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
+ .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
+ .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
+ .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
+ .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
+ .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
+ .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
+ .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
+ .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
+ .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
+ .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
+ .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
+ .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
+ .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
+ .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
+ .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
+ .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
+ .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
+ .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
+ .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
+ .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
+ .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
+ .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
+ .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
+ .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
+ .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
+ .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
+ .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
+ .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
+ .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
+ .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
+ .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
+ .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
+ .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+ .STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ # translate made up instructons: _ror, _srm
+ s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
+
+ s/_srm(\s+%r[0-9]+),([0-9]+),/
+ $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
+ : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
+
+ s/,\*/,/ if ($SIZE_T==4);
+ s/\bbv\b(.*\(%r2\))/bve$1/ if ($SIZE_T==8);
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aes-ppc.pl b/openssl-1.1.0h/crypto/aes/asm/aes-ppc.pl
new file mode 100644
index 0000000..1558d8e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aes-ppc.pl
@@ -0,0 +1,1459 @@
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# Needs more work: key setup, CBC routine...
+#
+# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
+# 128-bit key, which is ~40% better than 64-bit code generated by gcc
+# 4.0. But these are not the ones currently used! Their "compact"
+# counterparts are, for security reason. ppc_AES_encrypt_compact runs
+# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact -
+# at 1/3 of ppc_AES_decrypt.
+
+# February 2010
+#
+# Rescheduling instructions to favour Power6 pipeline gave 10%
+# performance improvement on the platform in question (and marginal
+# improvement even on others). It should be noted that Power6 fails
+# to process byte in 18 cycles, only in 23, because it fails to issue
+# 4 load instructions in two cycles, only in 3. As result non-compact
+# block subroutines are 25% slower than one would expect. Compact
+# functions scale better, because they have pure computational part,
+# which scales perfectly with clock frequency. To be specific
+# ppc_AES_encrypt_compact operates at 42 cycles per byte, while
+# ppc_AES_decrypt_compact - at 55 (in 64-bit build).
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T =4;
+ $LRSAVE =$SIZE_T;
+ $STU ="stwu";
+ $POP ="lwz";
+ $PUSH ="stw";
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=32*$SIZE_T;
+
+sub _data_word()
+{ my $i;
+ while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
+}
+
+$sp="r1";
+$toc="r2";
+$inp="r3";
+$out="r4";
+$key="r5";
+
+$Tbl0="r3";
+$Tbl1="r6";
+$Tbl2="r7";
+$Tbl3=$out; # stay away from "r2"; $out is offloaded to stack
+
+$s0="r8";
+$s1="r9";
+$s2="r10";
+$s3="r11";
+
+$t0="r12";
+$t1="r0"; # stay away from "r13";
+$t2="r14";
+$t3="r15";
+
+$acc00="r16";
+$acc01="r17";
+$acc02="r18";
+$acc03="r19";
+
+$acc04="r20";
+$acc05="r21";
+$acc06="r22";
+$acc07="r23";
+
+$acc08="r24";
+$acc09="r25";
+$acc10="r26";
+$acc11="r27";
+
+$acc12="r28";
+$acc13="r29";
+$acc14="r30";
+$acc15="r31";
+
+$mask80=$Tbl2;
+$mask1b=$Tbl3;
+
+$code.=<<___;
+.machine "any"
+.text
+
+.align 7
+LAES_Te:
+ mflr r0
+ bcl 20,31,\$+4
+ mflr $Tbl0 ; vvvvv "distance" between . and 1st data entry
+ addi $Tbl0,$Tbl0,`128-8`
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
+LAES_Td:
+ mflr r0
+ bcl 20,31,\$+4
+ mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
+ addi $Tbl0,$Tbl0,`128-64-8+2048+256`
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `128-64-9*4`
+___
+&_data_word(
+ 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
+ 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
+ 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
+ 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
+ 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
+ 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
+ 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
+ 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
+ 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
+ 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
+ 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
+ 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
+ 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
+ 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
+ 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
+ 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
+ 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
+ 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
+ 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
+ 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
+ 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
+ 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
+ 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
+ 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
+ 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
+ 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
+ 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
+ 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
+ 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
+ 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
+ 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
+ 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
+ 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
+ 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
+ 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
+ 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
+ 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
+ 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
+ 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
+ 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
+ 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
+ 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
+ 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
+ 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
+ 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
+ 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
+ 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
+ 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
+ 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
+ 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
+ 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
+ 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
+ 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
+ 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
+ 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
+ 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
+ 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
+ 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
+ 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
+ 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
+ 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
+ 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
+ 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
+ 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
+$code.=<<___;
+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+___
+&_data_word(
+ 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
+ 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
+ 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
+ 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
+ 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
+ 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
+ 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
+ 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
+ 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
+ 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
+ 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
+ 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
+ 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
+ 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
+ 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
+ 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
+ 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
+ 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
+ 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
+ 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
+ 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
+ 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
+ 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
+ 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
+ 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
+ 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
+ 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
+ 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
+ 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
+ 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
+ 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
+ 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
+ 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
+ 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
+ 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
+ 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
+ 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
+ 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
+ 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
+ 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
+ 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
+ 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
+ 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
+ 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
+ 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
+ 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
+ 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
+ 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
+ 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
+ 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
+ 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
+ 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
+ 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
+ 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
+ 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
+ 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
+ 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
+ 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
+ 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
+ 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
+ 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
+ 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
+ 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
+ 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
+$code.=<<___;
+.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+
+.globl .AES_encrypt
+.align 7
+.AES_encrypt:
+ $STU $sp,-$FRAME($sp)
+ mflr r0
+
+ $PUSH $out,`$FRAME-$SIZE_T*19`($sp)
+ $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
+ $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
+ $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
+ $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
+ $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
+ $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
+ $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
+ $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
+ $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
+ $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
+ $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
+ $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
+ $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
+ $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
+ $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
+ $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
+ $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
+ $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ andi. $t0,$inp,3
+ andi. $t1,$out,3
+ or. $t0,$t0,$t1
+ bne Lenc_unaligned
+
+Lenc_unaligned_ok:
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ lwz $s0,0($inp)
+ lwz $s1,4($inp)
+ lwz $s2,8($inp)
+ lwz $s3,12($inp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ lwz $t0,0($inp)
+ lwz $t1,4($inp)
+ lwz $t2,8($inp)
+ lwz $t3,12($inp)
+ rotlwi $s0,$t0,8
+ rotlwi $s1,$t1,8
+ rotlwi $s2,$t2,8
+ rotlwi $s3,$t3,8
+ rlwimi $s0,$t0,24,0,7
+ rlwimi $s1,$t1,24,0,7
+ rlwimi $s2,$t2,24,0,7
+ rlwimi $s3,$t3,24,0,7
+ rlwimi $s0,$t0,24,16,23
+ rlwimi $s1,$t1,24,16,23
+ rlwimi $s2,$t2,24,16,23
+ rlwimi $s3,$t3,24,16,23
+___
+$code.=<<___;
+ bl LAES_Te
+ bl Lppc_AES_encrypt_compact
+ $POP $out,`$FRAME-$SIZE_T*19`($sp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ rotlwi $t0,$s0,8
+ rotlwi $t1,$s1,8
+ rotlwi $t2,$s2,8
+ rotlwi $t3,$s3,8
+ rlwimi $t0,$s0,24,0,7
+ rlwimi $t1,$s1,24,0,7
+ rlwimi $t2,$s2,24,0,7
+ rlwimi $t3,$s3,24,0,7
+ rlwimi $t0,$s0,24,16,23
+ rlwimi $t1,$s1,24,16,23
+ rlwimi $t2,$s2,24,16,23
+ rlwimi $t3,$s3,24,16,23
+ stw $t0,0($out)
+ stw $t1,4($out)
+ stw $t2,8($out)
+ stw $t3,12($out)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+___
+$code.=<<___;
+ b Lenc_done
+
+Lenc_unaligned:
+ subfic $t0,$inp,4096
+ subfic $t1,$out,4096
+ andi. $t0,$t0,4096-16
+ beq Lenc_xpage
+ andi. $t1,$t1,4096-16
+ bne Lenc_unaligned_ok
+
+Lenc_xpage:
+ lbz $acc00,0($inp)
+ lbz $acc01,1($inp)
+ lbz $acc02,2($inp)
+ lbz $s0,3($inp)
+ lbz $acc04,4($inp)
+ lbz $acc05,5($inp)
+ lbz $acc06,6($inp)
+ lbz $s1,7($inp)
+ lbz $acc08,8($inp)
+ lbz $acc09,9($inp)
+ lbz $acc10,10($inp)
+ insrwi $s0,$acc00,8,0
+ lbz $s2,11($inp)
+ insrwi $s1,$acc04,8,0
+ lbz $acc12,12($inp)
+ insrwi $s0,$acc01,8,8
+ lbz $acc13,13($inp)
+ insrwi $s1,$acc05,8,8
+ lbz $acc14,14($inp)
+ insrwi $s0,$acc02,8,16
+ lbz $s3,15($inp)
+ insrwi $s1,$acc06,8,16
+ insrwi $s2,$acc08,8,0
+ insrwi $s3,$acc12,8,0
+ insrwi $s2,$acc09,8,8
+ insrwi $s3,$acc13,8,8
+ insrwi $s2,$acc10,8,16
+ insrwi $s3,$acc14,8,16
+
+ bl LAES_Te
+ bl Lppc_AES_encrypt_compact
+ $POP $out,`$FRAME-$SIZE_T*19`($sp)
+
+ extrwi $acc00,$s0,8,0
+ extrwi $acc01,$s0,8,8
+ stb $acc00,0($out)
+ extrwi $acc02,$s0,8,16
+ stb $acc01,1($out)
+ stb $acc02,2($out)
+ extrwi $acc04,$s1,8,0
+ stb $s0,3($out)
+ extrwi $acc05,$s1,8,8
+ stb $acc04,4($out)
+ extrwi $acc06,$s1,8,16
+ stb $acc05,5($out)
+ stb $acc06,6($out)
+ extrwi $acc08,$s2,8,0
+ stb $s1,7($out)
+ extrwi $acc09,$s2,8,8
+ stb $acc08,8($out)
+ extrwi $acc10,$s2,8,16
+ stb $acc09,9($out)
+ stb $acc10,10($out)
+ extrwi $acc12,$s3,8,0
+ stb $s2,11($out)
+ extrwi $acc13,$s3,8,8
+ stb $acc12,12($out)
+ extrwi $acc14,$s3,8,16
+ stb $acc13,13($out)
+ stb $acc14,14($out)
+ stb $s3,15($out)
+
+Lenc_done:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
+ $POP r14,`$FRAME-$SIZE_T*18`($sp)
+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ mtlr r0
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
+
+.align 5
+Lppc_AES_encrypt:
+ lwz $acc00,240($key)
+ addi $Tbl1,$Tbl0,3
+ lwz $t0,0($key)
+ addi $Tbl2,$Tbl0,2
+ lwz $t1,4($key)
+ addi $Tbl3,$Tbl0,1
+ lwz $t2,8($key)
+ addi $acc00,$acc00,-1
+ lwz $t3,12($key)
+ addi $key,$key,16
+ xor $s0,$s0,$t0
+ xor $s1,$s1,$t1
+ xor $s2,$s2,$t2
+ xor $s3,$s3,$t3
+ mtctr $acc00
+.align 4
+Lenc_loop:
+ rlwinm $acc00,$s0,`32-24+3`,21,28
+ rlwinm $acc01,$s1,`32-24+3`,21,28
+ rlwinm $acc02,$s2,`32-24+3`,21,28
+ rlwinm $acc03,$s3,`32-24+3`,21,28
+ lwz $t0,0($key)
+ rlwinm $acc04,$s1,`32-16+3`,21,28
+ lwz $t1,4($key)
+ rlwinm $acc05,$s2,`32-16+3`,21,28
+ lwz $t2,8($key)
+ rlwinm $acc06,$s3,`32-16+3`,21,28
+ lwz $t3,12($key)
+ rlwinm $acc07,$s0,`32-16+3`,21,28
+ lwzx $acc00,$Tbl0,$acc00
+ rlwinm $acc08,$s2,`32-8+3`,21,28
+ lwzx $acc01,$Tbl0,$acc01
+ rlwinm $acc09,$s3,`32-8+3`,21,28
+ lwzx $acc02,$Tbl0,$acc02
+ rlwinm $acc10,$s0,`32-8+3`,21,28
+ lwzx $acc03,$Tbl0,$acc03
+ rlwinm $acc11,$s1,`32-8+3`,21,28
+ lwzx $acc04,$Tbl1,$acc04
+ rlwinm $acc12,$s3,`0+3`,21,28
+ lwzx $acc05,$Tbl1,$acc05
+ rlwinm $acc13,$s0,`0+3`,21,28
+ lwzx $acc06,$Tbl1,$acc06
+ rlwinm $acc14,$s1,`0+3`,21,28
+ lwzx $acc07,$Tbl1,$acc07
+ rlwinm $acc15,$s2,`0+3`,21,28
+ lwzx $acc08,$Tbl2,$acc08
+ xor $t0,$t0,$acc00
+ lwzx $acc09,$Tbl2,$acc09
+ xor $t1,$t1,$acc01
+ lwzx $acc10,$Tbl2,$acc10
+ xor $t2,$t2,$acc02
+ lwzx $acc11,$Tbl2,$acc11
+ xor $t3,$t3,$acc03
+ lwzx $acc12,$Tbl3,$acc12
+ xor $t0,$t0,$acc04
+ lwzx $acc13,$Tbl3,$acc13
+ xor $t1,$t1,$acc05
+ lwzx $acc14,$Tbl3,$acc14
+ xor $t2,$t2,$acc06
+ lwzx $acc15,$Tbl3,$acc15
+ xor $t3,$t3,$acc07
+ xor $t0,$t0,$acc08
+ xor $t1,$t1,$acc09
+ xor $t2,$t2,$acc10
+ xor $t3,$t3,$acc11
+ xor $s0,$t0,$acc12
+ xor $s1,$t1,$acc13
+ xor $s2,$t2,$acc14
+ xor $s3,$t3,$acc15
+ addi $key,$key,16
+ bdnz Lenc_loop
+
+ addi $Tbl2,$Tbl0,2048
+ nop
+ lwz $t0,0($key)
+ rlwinm $acc00,$s0,`32-24`,24,31
+ lwz $t1,4($key)
+ rlwinm $acc01,$s1,`32-24`,24,31
+ lwz $t2,8($key)
+ rlwinm $acc02,$s2,`32-24`,24,31
+ lwz $t3,12($key)
+ rlwinm $acc03,$s3,`32-24`,24,31
+ lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
+ rlwinm $acc04,$s1,`32-16`,24,31
+ lwz $acc09,`2048+32`($Tbl0)
+ rlwinm $acc05,$s2,`32-16`,24,31
+ lwz $acc10,`2048+64`($Tbl0)
+ rlwinm $acc06,$s3,`32-16`,24,31
+ lwz $acc11,`2048+96`($Tbl0)
+ rlwinm $acc07,$s0,`32-16`,24,31
+ lwz $acc12,`2048+128`($Tbl0)
+ rlwinm $acc08,$s2,`32-8`,24,31
+ lwz $acc13,`2048+160`($Tbl0)
+ rlwinm $acc09,$s3,`32-8`,24,31
+ lwz $acc14,`2048+192`($Tbl0)
+ rlwinm $acc10,$s0,`32-8`,24,31
+ lwz $acc15,`2048+224`($Tbl0)
+ rlwinm $acc11,$s1,`32-8`,24,31
+ lbzx $acc00,$Tbl2,$acc00
+ rlwinm $acc12,$s3,`0`,24,31
+ lbzx $acc01,$Tbl2,$acc01
+ rlwinm $acc13,$s0,`0`,24,31
+ lbzx $acc02,$Tbl2,$acc02
+ rlwinm $acc14,$s1,`0`,24,31
+ lbzx $acc03,$Tbl2,$acc03
+ rlwinm $acc15,$s2,`0`,24,31
+ lbzx $acc04,$Tbl2,$acc04
+ rlwinm $s0,$acc00,24,0,7
+ lbzx $acc05,$Tbl2,$acc05
+ rlwinm $s1,$acc01,24,0,7
+ lbzx $acc06,$Tbl2,$acc06
+ rlwinm $s2,$acc02,24,0,7
+ lbzx $acc07,$Tbl2,$acc07
+ rlwinm $s3,$acc03,24,0,7
+ lbzx $acc08,$Tbl2,$acc08
+ rlwimi $s0,$acc04,16,8,15
+ lbzx $acc09,$Tbl2,$acc09
+ rlwimi $s1,$acc05,16,8,15
+ lbzx $acc10,$Tbl2,$acc10
+ rlwimi $s2,$acc06,16,8,15
+ lbzx $acc11,$Tbl2,$acc11
+ rlwimi $s3,$acc07,16,8,15
+ lbzx $acc12,$Tbl2,$acc12
+ rlwimi $s0,$acc08,8,16,23
+ lbzx $acc13,$Tbl2,$acc13
+ rlwimi $s1,$acc09,8,16,23
+ lbzx $acc14,$Tbl2,$acc14
+ rlwimi $s2,$acc10,8,16,23
+ lbzx $acc15,$Tbl2,$acc15
+ rlwimi $s3,$acc11,8,16,23
+ or $s0,$s0,$acc12
+ or $s1,$s1,$acc13
+ or $s2,$s2,$acc14
+ or $s3,$s3,$acc15
+ xor $s0,$s0,$t0
+ xor $s1,$s1,$t1
+ xor $s2,$s2,$t2
+ xor $s3,$s3,$t3
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.align 4
+Lppc_AES_encrypt_compact:
+ lwz $acc00,240($key)
+ addi $Tbl1,$Tbl0,2048
+ lwz $t0,0($key)
+ lis $mask80,0x8080
+ lwz $t1,4($key)
+ lis $mask1b,0x1b1b
+ lwz $t2,8($key)
+ ori $mask80,$mask80,0x8080
+ lwz $t3,12($key)
+ ori $mask1b,$mask1b,0x1b1b
+ addi $key,$key,16
+ mtctr $acc00
+.align 4
+Lenc_compact_loop:
+ xor $s0,$s0,$t0
+ xor $s1,$s1,$t1
+ rlwinm $acc00,$s0,`32-24`,24,31
+ xor $s2,$s2,$t2
+ rlwinm $acc01,$s1,`32-24`,24,31
+ xor $s3,$s3,$t3
+ rlwinm $acc02,$s2,`32-24`,24,31
+ rlwinm $acc03,$s3,`32-24`,24,31
+ rlwinm $acc04,$s1,`32-16`,24,31
+ rlwinm $acc05,$s2,`32-16`,24,31
+ rlwinm $acc06,$s3,`32-16`,24,31
+ rlwinm $acc07,$s0,`32-16`,24,31
+ lbzx $acc00,$Tbl1,$acc00
+ rlwinm $acc08,$s2,`32-8`,24,31
+ lbzx $acc01,$Tbl1,$acc01
+ rlwinm $acc09,$s3,`32-8`,24,31
+ lbzx $acc02,$Tbl1,$acc02
+ rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl1,$acc03
+ rlwinm $acc11,$s1,`32-8`,24,31
+ lbzx $acc04,$Tbl1,$acc04
+ rlwinm $acc12,$s3,`0`,24,31
+ lbzx $acc05,$Tbl1,$acc05
+ rlwinm $acc13,$s0,`0`,24,31
+ lbzx $acc06,$Tbl1,$acc06
+ rlwinm $acc14,$s1,`0`,24,31
+ lbzx $acc07,$Tbl1,$acc07
+ rlwinm $acc15,$s2,`0`,24,31
+ lbzx $acc08,$Tbl1,$acc08
+ rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl1,$acc09
+ rlwinm $s1,$acc01,24,0,7
+ lbzx $acc10,$Tbl1,$acc10
+ rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl1,$acc11
+ rlwinm $s3,$acc03,24,0,7
+ lbzx $acc12,$Tbl1,$acc12
+ rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl1,$acc13
+ rlwimi $s1,$acc05,16,8,15
+ lbzx $acc14,$Tbl1,$acc14
+ rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl1,$acc15
+ rlwimi $s3,$acc07,16,8,15
+ rlwimi $s0,$acc08,8,16,23
+ rlwimi $s1,$acc09,8,16,23
+ rlwimi $s2,$acc10,8,16,23
+ rlwimi $s3,$acc11,8,16,23
+ lwz $t0,0($key)
+ or $s0,$s0,$acc12
+ lwz $t1,4($key)
+ or $s1,$s1,$acc13
+ lwz $t2,8($key)
+ or $s2,$s2,$acc14
+ lwz $t3,12($key)
+ or $s3,$s3,$acc15
+
+ addi $key,$key,16
+ bdz Lenc_compact_done
+
+ and $acc00,$s0,$mask80 # r1=r0&0x80808080
+ and $acc01,$s1,$mask80
+ and $acc02,$s2,$mask80
+ and $acc03,$s3,$mask80
+ srwi $acc04,$acc00,7 # r1>>7
+ andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ srwi $acc05,$acc01,7
+ andc $acc09,$s1,$mask80
+ srwi $acc06,$acc02,7
+ andc $acc10,$s2,$mask80
+ srwi $acc07,$acc03,7
+ andc $acc11,$s3,$mask80
+ sub $acc00,$acc00,$acc04 # r1-(r1>>7)
+ sub $acc01,$acc01,$acc05
+ sub $acc02,$acc02,$acc06
+ sub $acc03,$acc03,$acc07
+ add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
+ add $acc09,$acc09,$acc09
+ add $acc10,$acc10,$acc10
+ add $acc11,$acc11,$acc11
+ and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
+ and $acc01,$acc01,$mask1b
+ and $acc02,$acc02,$mask1b
+ and $acc03,$acc03,$mask1b
+ xor $acc00,$acc00,$acc08 # r2
+ xor $acc01,$acc01,$acc09
+ rotlwi $acc12,$s0,16 # ROTATE(r0,16)
+ xor $acc02,$acc02,$acc10
+ rotlwi $acc13,$s1,16
+ xor $acc03,$acc03,$acc11
+ rotlwi $acc14,$s2,16
+
+ xor $s0,$s0,$acc00 # r0^r2
+ rotlwi $acc15,$s3,16
+ xor $s1,$s1,$acc01
+ rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
+ xor $s2,$s2,$acc02
+ rotrwi $s1,$s1,24
+ xor $s3,$s3,$acc03
+ rotrwi $s2,$s2,24
+ xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
+ rotrwi $s3,$s3,24
+ xor $s1,$s1,$acc01
+ xor $s2,$s2,$acc02
+ xor $s3,$s3,$acc03
+ rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
+ xor $s0,$s0,$acc12 #
+ rotlwi $acc09,$acc13,8
+ xor $s1,$s1,$acc13
+ rotlwi $acc10,$acc14,8
+ xor $s2,$s2,$acc14
+ rotlwi $acc11,$acc15,8
+ xor $s3,$s3,$acc15
+ xor $s0,$s0,$acc08 #
+ xor $s1,$s1,$acc09
+ xor $s2,$s2,$acc10
+ xor $s3,$s3,$acc11
+
+ b Lenc_compact_loop
+.align 4
+Lenc_compact_done:
+ xor $s0,$s0,$t0
+ xor $s1,$s1,$t1
+ xor $s2,$s2,$t2
+ xor $s3,$s3,$t3
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.size .AES_encrypt,.-.AES_encrypt
+
+.globl .AES_decrypt
+.align 7
+.AES_decrypt:
+ $STU $sp,-$FRAME($sp)
+ mflr r0
+
+ $PUSH $out,`$FRAME-$SIZE_T*19`($sp)
+ $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
+ $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
+ $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
+ $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
+ $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
+ $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
+ $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
+ $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
+ $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
+ $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
+ $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
+ $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
+ $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
+ $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
+ $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
+ $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
+ $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
+ $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ andi. $t0,$inp,3
+ andi. $t1,$out,3
+ or. $t0,$t0,$t1
+ bne Ldec_unaligned
+
+Ldec_unaligned_ok:
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ lwz $s0,0($inp)
+ lwz $s1,4($inp)
+ lwz $s2,8($inp)
+ lwz $s3,12($inp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ lwz $t0,0($inp)
+ lwz $t1,4($inp)
+ lwz $t2,8($inp)
+ lwz $t3,12($inp)
+ rotlwi $s0,$t0,8
+ rotlwi $s1,$t1,8
+ rotlwi $s2,$t2,8
+ rotlwi $s3,$t3,8
+ rlwimi $s0,$t0,24,0,7
+ rlwimi $s1,$t1,24,0,7
+ rlwimi $s2,$t2,24,0,7
+ rlwimi $s3,$t3,24,0,7
+ rlwimi $s0,$t0,24,16,23
+ rlwimi $s1,$t1,24,16,23
+ rlwimi $s2,$t2,24,16,23
+ rlwimi $s3,$t3,24,16,23
+___
+$code.=<<___;
+ bl LAES_Td
+ bl Lppc_AES_decrypt_compact
+ $POP $out,`$FRAME-$SIZE_T*19`($sp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+ rotlwi $t0,$s0,8
+ rotlwi $t1,$s1,8
+ rotlwi $t2,$s2,8
+ rotlwi $t3,$s3,8
+ rlwimi $t0,$s0,24,0,7
+ rlwimi $t1,$s1,24,0,7
+ rlwimi $t2,$s2,24,0,7
+ rlwimi $t3,$s3,24,0,7
+ rlwimi $t0,$s0,24,16,23
+ rlwimi $t1,$s1,24,16,23
+ rlwimi $t2,$s2,24,16,23
+ rlwimi $t3,$s3,24,16,23
+ stw $t0,0($out)
+ stw $t1,4($out)
+ stw $t2,8($out)
+ stw $t3,12($out)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+___
+$code.=<<___;
+ b Ldec_done
+
+Ldec_unaligned:
+ subfic $t0,$inp,4096
+ subfic $t1,$out,4096
+ andi. $t0,$t0,4096-16
+ beq Ldec_xpage
+ andi. $t1,$t1,4096-16
+ bne Ldec_unaligned_ok
+
+Ldec_xpage:
+ lbz $acc00,0($inp)
+ lbz $acc01,1($inp)
+ lbz $acc02,2($inp)
+ lbz $s0,3($inp)
+ lbz $acc04,4($inp)
+ lbz $acc05,5($inp)
+ lbz $acc06,6($inp)
+ lbz $s1,7($inp)
+ lbz $acc08,8($inp)
+ lbz $acc09,9($inp)
+ lbz $acc10,10($inp)
+ insrwi $s0,$acc00,8,0
+ lbz $s2,11($inp)
+ insrwi $s1,$acc04,8,0
+ lbz $acc12,12($inp)
+ insrwi $s0,$acc01,8,8
+ lbz $acc13,13($inp)
+ insrwi $s1,$acc05,8,8
+ lbz $acc14,14($inp)
+ insrwi $s0,$acc02,8,16
+ lbz $s3,15($inp)
+ insrwi $s1,$acc06,8,16
+ insrwi $s2,$acc08,8,0
+ insrwi $s3,$acc12,8,0
+ insrwi $s2,$acc09,8,8
+ insrwi $s3,$acc13,8,8
+ insrwi $s2,$acc10,8,16
+ insrwi $s3,$acc14,8,16
+
+ bl LAES_Td
+ bl Lppc_AES_decrypt_compact
+ $POP $out,`$FRAME-$SIZE_T*19`($sp)
+
+ extrwi $acc00,$s0,8,0
+ extrwi $acc01,$s0,8,8
+ stb $acc00,0($out)
+ extrwi $acc02,$s0,8,16
+ stb $acc01,1($out)
+ stb $acc02,2($out)
+ extrwi $acc04,$s1,8,0
+ stb $s0,3($out)
+ extrwi $acc05,$s1,8,8
+ stb $acc04,4($out)
+ extrwi $acc06,$s1,8,16
+ stb $acc05,5($out)
+ stb $acc06,6($out)
+ extrwi $acc08,$s2,8,0
+ stb $s1,7($out)
+ extrwi $acc09,$s2,8,8
+ stb $acc08,8($out)
+ extrwi $acc10,$s2,8,16
+ stb $acc09,9($out)
+ stb $acc10,10($out)
+ extrwi $acc12,$s3,8,0
+ stb $s2,11($out)
+ extrwi $acc13,$s3,8,8
+ stb $acc12,12($out)
+ extrwi $acc14,$s3,8,16
+ stb $acc13,13($out)
+ stb $acc14,14($out)
+ stb $s3,15($out)
+
+Ldec_done:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
+ $POP r14,`$FRAME-$SIZE_T*18`($sp)
+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ mtlr r0
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
+
+.align 5
+Lppc_AES_decrypt:
+ lwz $acc00,240($key)
+ addi $Tbl1,$Tbl0,3
+ lwz $t0,0($key)
+ addi $Tbl2,$Tbl0,2
+ lwz $t1,4($key)
+ addi $Tbl3,$Tbl0,1
+ lwz $t2,8($key)
+ addi $acc00,$acc00,-1
+ lwz $t3,12($key)
+ addi $key,$key,16
+ xor $s0,$s0,$t0
+ xor $s1,$s1,$t1
+ xor $s2,$s2,$t2
+ xor $s3,$s3,$t3
+ mtctr $acc00
+.align 4
+Ldec_loop:
+ rlwinm $acc00,$s0,`32-24+3`,21,28
+ rlwinm $acc01,$s1,`32-24+3`,21,28
+ rlwinm $acc02,$s2,`32-24+3`,21,28
+ rlwinm $acc03,$s3,`32-24+3`,21,28
+ lwz $t0,0($key)
+ rlwinm $acc04,$s3,`32-16+3`,21,28
+ lwz $t1,4($key)
+ rlwinm $acc05,$s0,`32-16+3`,21,28
+ lwz $t2,8($key)
+ rlwinm $acc06,$s1,`32-16+3`,21,28
+ lwz $t3,12($key)
+ rlwinm $acc07,$s2,`32-16+3`,21,28
+ lwzx $acc00,$Tbl0,$acc00
+ rlwinm $acc08,$s2,`32-8+3`,21,28
+ lwzx $acc01,$Tbl0,$acc01
+ rlwinm $acc09,$s3,`32-8+3`,21,28
+ lwzx $acc02,$Tbl0,$acc02
+ rlwinm $acc10,$s0,`32-8+3`,21,28
+ lwzx $acc03,$Tbl0,$acc03
+ rlwinm $acc11,$s1,`32-8+3`,21,28
+ lwzx $acc04,$Tbl1,$acc04
+ rlwinm $acc12,$s1,`0+3`,21,28
+ lwzx $acc05,$Tbl1,$acc05
+ rlwinm $acc13,$s2,`0+3`,21,28
+ lwzx $acc06,$Tbl1,$acc06
+ rlwinm $acc14,$s3,`0+3`,21,28
+ lwzx $acc07,$Tbl1,$acc07
+ rlwinm $acc15,$s0,`0+3`,21,28
+ lwzx $acc08,$Tbl2,$acc08
+ xor $t0,$t0,$acc00
+ lwzx $acc09,$Tbl2,$acc09
+ xor $t1,$t1,$acc01
+ lwzx $acc10,$Tbl2,$acc10
+ xor $t2,$t2,$acc02
+ lwzx $acc11,$Tbl2,$acc11
+ xor $t3,$t3,$acc03
+ lwzx $acc12,$Tbl3,$acc12
+ xor $t0,$t0,$acc04
+ lwzx $acc13,$Tbl3,$acc13
+ xor $t1,$t1,$acc05
+ lwzx $acc14,$Tbl3,$acc14
+ xor $t2,$t2,$acc06
+ lwzx $acc15,$Tbl3,$acc15
+ xor $t3,$t3,$acc07
+ xor $t0,$t0,$acc08
+ xor $t1,$t1,$acc09
+ xor $t2,$t2,$acc10
+ xor $t3,$t3,$acc11
+ xor $s0,$t0,$acc12
+ xor $s1,$t1,$acc13
+ xor $s2,$t2,$acc14
+ xor $s3,$t3,$acc15
+ addi $key,$key,16
+ bdnz Ldec_loop
+
+ addi $Tbl2,$Tbl0,2048
+ nop
+ lwz $t0,0($key)
+ rlwinm $acc00,$s0,`32-24`,24,31
+ lwz $t1,4($key)
+ rlwinm $acc01,$s1,`32-24`,24,31
+ lwz $t2,8($key)
+ rlwinm $acc02,$s2,`32-24`,24,31
+ lwz $t3,12($key)
+ rlwinm $acc03,$s3,`32-24`,24,31
+ lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
+ rlwinm $acc04,$s3,`32-16`,24,31
+ lwz $acc09,`2048+32`($Tbl0)
+ rlwinm $acc05,$s0,`32-16`,24,31
+ lwz $acc10,`2048+64`($Tbl0)
+ lbzx $acc00,$Tbl2,$acc00
+ lwz $acc11,`2048+96`($Tbl0)
+ lbzx $acc01,$Tbl2,$acc01
+ lwz $acc12,`2048+128`($Tbl0)
+ rlwinm $acc06,$s1,`32-16`,24,31
+ lwz $acc13,`2048+160`($Tbl0)
+ rlwinm $acc07,$s2,`32-16`,24,31
+ lwz $acc14,`2048+192`($Tbl0)
+ rlwinm $acc08,$s2,`32-8`,24,31
+ lwz $acc15,`2048+224`($Tbl0)
+ rlwinm $acc09,$s3,`32-8`,24,31
+ lbzx $acc02,$Tbl2,$acc02
+ rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl2,$acc03
+ rlwinm $acc11,$s1,`32-8`,24,31
+ lbzx $acc04,$Tbl2,$acc04
+ rlwinm $acc12,$s1,`0`,24,31
+ lbzx $acc05,$Tbl2,$acc05
+ rlwinm $acc13,$s2,`0`,24,31
+ lbzx $acc06,$Tbl2,$acc06
+ rlwinm $acc14,$s3,`0`,24,31
+ lbzx $acc07,$Tbl2,$acc07
+ rlwinm $acc15,$s0,`0`,24,31
+ lbzx $acc08,$Tbl2,$acc08
+ rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl2,$acc09
+ rlwinm $s1,$acc01,24,0,7
+ lbzx $acc10,$Tbl2,$acc10
+ rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl2,$acc11
+ rlwinm $s3,$acc03,24,0,7
+ lbzx $acc12,$Tbl2,$acc12
+ rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl2,$acc13
+ rlwimi $s1,$acc05,16,8,15
+ lbzx $acc14,$Tbl2,$acc14
+ rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl2,$acc15
+ rlwimi $s3,$acc07,16,8,15
+ rlwimi $s0,$acc08,8,16,23
+ rlwimi $s1,$acc09,8,16,23
+ rlwimi $s2,$acc10,8,16,23
+ rlwimi $s3,$acc11,8,16,23
+ or $s0,$s0,$acc12
+ or $s1,$s1,$acc13
+ or $s2,$s2,$acc14
+ or $s3,$s3,$acc15
+ xor $s0,$s0,$t0
+ xor $s1,$s1,$t1
+ xor $s2,$s2,$t2
+ xor $s3,$s3,$t3
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.align 4
+Lppc_AES_decrypt_compact:
+ lwz $acc00,240($key)
+ addi $Tbl1,$Tbl0,2048
+ lwz $t0,0($key)
+ lis $mask80,0x8080
+ lwz $t1,4($key)
+ lis $mask1b,0x1b1b
+ lwz $t2,8($key)
+ ori $mask80,$mask80,0x8080
+ lwz $t3,12($key)
+ ori $mask1b,$mask1b,0x1b1b
+ addi $key,$key,16
+___
+$code.=<<___ if ($SIZE_T==8);
+ insrdi $mask80,$mask80,32,0
+ insrdi $mask1b,$mask1b,32,0
+___
+$code.=<<___;
+ mtctr $acc00
+.align 4
+Ldec_compact_loop:
+ xor $s0,$s0,$t0
+ xor $s1,$s1,$t1
+ rlwinm $acc00,$s0,`32-24`,24,31
+ xor $s2,$s2,$t2
+ rlwinm $acc01,$s1,`32-24`,24,31
+ xor $s3,$s3,$t3
+ rlwinm $acc02,$s2,`32-24`,24,31
+ rlwinm $acc03,$s3,`32-24`,24,31
+ rlwinm $acc04,$s3,`32-16`,24,31
+ rlwinm $acc05,$s0,`32-16`,24,31
+ rlwinm $acc06,$s1,`32-16`,24,31
+ rlwinm $acc07,$s2,`32-16`,24,31
+ lbzx $acc00,$Tbl1,$acc00
+ rlwinm $acc08,$s2,`32-8`,24,31
+ lbzx $acc01,$Tbl1,$acc01
+ rlwinm $acc09,$s3,`32-8`,24,31
+ lbzx $acc02,$Tbl1,$acc02
+ rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl1,$acc03
+ rlwinm $acc11,$s1,`32-8`,24,31
+ lbzx $acc04,$Tbl1,$acc04
+ rlwinm $acc12,$s1,`0`,24,31
+ lbzx $acc05,$Tbl1,$acc05
+ rlwinm $acc13,$s2,`0`,24,31
+ lbzx $acc06,$Tbl1,$acc06
+ rlwinm $acc14,$s3,`0`,24,31
+ lbzx $acc07,$Tbl1,$acc07
+ rlwinm $acc15,$s0,`0`,24,31
+ lbzx $acc08,$Tbl1,$acc08
+ rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl1,$acc09
+ rlwinm $s1,$acc01,24,0,7
+ lbzx $acc10,$Tbl1,$acc10
+ rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl1,$acc11
+ rlwinm $s3,$acc03,24,0,7
+ lbzx $acc12,$Tbl1,$acc12
+ rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl1,$acc13
+ rlwimi $s1,$acc05,16,8,15
+ lbzx $acc14,$Tbl1,$acc14
+ rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl1,$acc15
+ rlwimi $s3,$acc07,16,8,15
+ rlwimi $s0,$acc08,8,16,23
+ rlwimi $s1,$acc09,8,16,23
+ rlwimi $s2,$acc10,8,16,23
+ rlwimi $s3,$acc11,8,16,23
+ lwz $t0,0($key)
+ or $s0,$s0,$acc12
+ lwz $t1,4($key)
+ or $s1,$s1,$acc13
+ lwz $t2,8($key)
+ or $s2,$s2,$acc14
+ lwz $t3,12($key)
+ or $s3,$s3,$acc15
+
+ addi $key,$key,16
+ bdz Ldec_compact_done
+___
+$code.=<<___ if ($SIZE_T==8);
+ # vectorized permutation improves decrypt performance by 10%
+ insrdi $s0,$s1,32,0
+ insrdi $s2,$s3,32,0
+
+ and $acc00,$s0,$mask80 # r1=r0&0x80808080
+ and $acc02,$s2,$mask80
+ srdi $acc04,$acc00,7 # r1>>7
+ srdi $acc06,$acc02,7
+ andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ andc $acc10,$s2,$mask80
+ sub $acc00,$acc00,$acc04 # r1-(r1>>7)
+ sub $acc02,$acc02,$acc06
+ add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
+ add $acc10,$acc10,$acc10
+ and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
+ and $acc02,$acc02,$mask1b
+ xor $acc00,$acc00,$acc08 # r2
+ xor $acc02,$acc02,$acc10
+
+ and $acc04,$acc00,$mask80 # r1=r2&0x80808080
+ and $acc06,$acc02,$mask80
+ srdi $acc08,$acc04,7 # r1>>7
+ srdi $acc10,$acc06,7
+ andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
+ andc $acc14,$acc02,$mask80
+ sub $acc04,$acc04,$acc08 # r1-(r1>>7)
+ sub $acc06,$acc06,$acc10
+ add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
+ add $acc14,$acc14,$acc14
+ and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
+ and $acc06,$acc06,$mask1b
+ xor $acc04,$acc04,$acc12 # r4
+ xor $acc06,$acc06,$acc14
+
+ and $acc08,$acc04,$mask80 # r1=r4&0x80808080
+ and $acc10,$acc06,$mask80
+ srdi $acc12,$acc08,7 # r1>>7
+ srdi $acc14,$acc10,7
+ sub $acc08,$acc08,$acc12 # r1-(r1>>7)
+ sub $acc10,$acc10,$acc14
+ andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
+ andc $acc14,$acc06,$mask80
+ add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
+ add $acc14,$acc14,$acc14
+ and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
+ and $acc10,$acc10,$mask1b
+ xor $acc08,$acc08,$acc12 # r8
+ xor $acc10,$acc10,$acc14
+
+ xor $acc00,$acc00,$s0 # r2^r0
+ xor $acc02,$acc02,$s2
+ xor $acc04,$acc04,$s0 # r4^r0
+ xor $acc06,$acc06,$s2
+
+ extrdi $acc01,$acc00,32,0
+ extrdi $acc03,$acc02,32,0
+ extrdi $acc05,$acc04,32,0
+ extrdi $acc07,$acc06,32,0
+ extrdi $acc09,$acc08,32,0
+ extrdi $acc11,$acc10,32,0
+___
+$code.=<<___ if ($SIZE_T==4);
+ and $acc00,$s0,$mask80 # r1=r0&0x80808080
+ and $acc01,$s1,$mask80
+ and $acc02,$s2,$mask80
+ and $acc03,$s3,$mask80
+ srwi $acc04,$acc00,7 # r1>>7
+ andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ srwi $acc05,$acc01,7
+ andc $acc09,$s1,$mask80
+ srwi $acc06,$acc02,7
+ andc $acc10,$s2,$mask80
+ srwi $acc07,$acc03,7
+ andc $acc11,$s3,$mask80
+ sub $acc00,$acc00,$acc04 # r1-(r1>>7)
+ sub $acc01,$acc01,$acc05
+ sub $acc02,$acc02,$acc06
+ sub $acc03,$acc03,$acc07
+ add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
+ add $acc09,$acc09,$acc09
+ add $acc10,$acc10,$acc10
+ add $acc11,$acc11,$acc11
+ and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
+ and $acc01,$acc01,$mask1b
+ and $acc02,$acc02,$mask1b
+ and $acc03,$acc03,$mask1b
+ xor $acc00,$acc00,$acc08 # r2
+ xor $acc01,$acc01,$acc09
+ xor $acc02,$acc02,$acc10
+ xor $acc03,$acc03,$acc11
+
+ and $acc04,$acc00,$mask80 # r1=r2&0x80808080
+ and $acc05,$acc01,$mask80
+ and $acc06,$acc02,$mask80
+ and $acc07,$acc03,$mask80
+ srwi $acc08,$acc04,7 # r1>>7
+ andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
+ srwi $acc09,$acc05,7
+ andc $acc13,$acc01,$mask80
+ srwi $acc10,$acc06,7
+ andc $acc14,$acc02,$mask80
+ srwi $acc11,$acc07,7
+ andc $acc15,$acc03,$mask80
+ sub $acc04,$acc04,$acc08 # r1-(r1>>7)
+ sub $acc05,$acc05,$acc09
+ sub $acc06,$acc06,$acc10
+ sub $acc07,$acc07,$acc11
+ add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
+ add $acc13,$acc13,$acc13
+ add $acc14,$acc14,$acc14
+ add $acc15,$acc15,$acc15
+ and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
+ and $acc05,$acc05,$mask1b
+ and $acc06,$acc06,$mask1b
+ and $acc07,$acc07,$mask1b
+ xor $acc04,$acc04,$acc12 # r4
+ xor $acc05,$acc05,$acc13
+ xor $acc06,$acc06,$acc14
+ xor $acc07,$acc07,$acc15
+
+ and $acc08,$acc04,$mask80 # r1=r4&0x80808080
+ and $acc09,$acc05,$mask80
+ srwi $acc12,$acc08,7 # r1>>7
+ and $acc10,$acc06,$mask80
+ srwi $acc13,$acc09,7
+ and $acc11,$acc07,$mask80
+ srwi $acc14,$acc10,7
+ sub $acc08,$acc08,$acc12 # r1-(r1>>7)
+ srwi $acc15,$acc11,7
+ sub $acc09,$acc09,$acc13
+ sub $acc10,$acc10,$acc14
+ sub $acc11,$acc11,$acc15
+ andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
+ andc $acc13,$acc05,$mask80
+ andc $acc14,$acc06,$mask80
+ andc $acc15,$acc07,$mask80
+ add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
+ add $acc13,$acc13,$acc13
+ add $acc14,$acc14,$acc14
+ add $acc15,$acc15,$acc15
+ and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
+ and $acc09,$acc09,$mask1b
+ and $acc10,$acc10,$mask1b
+ and $acc11,$acc11,$mask1b
+ xor $acc08,$acc08,$acc12 # r8
+ xor $acc09,$acc09,$acc13
+ xor $acc10,$acc10,$acc14
+ xor $acc11,$acc11,$acc15
+
+ xor $acc00,$acc00,$s0 # r2^r0
+ xor $acc01,$acc01,$s1
+ xor $acc02,$acc02,$s2
+ xor $acc03,$acc03,$s3
+ xor $acc04,$acc04,$s0 # r4^r0
+ xor $acc05,$acc05,$s1
+ xor $acc06,$acc06,$s2
+ xor $acc07,$acc07,$s3
+___
+$code.=<<___;
+ rotrwi $s0,$s0,8 # = ROTATE(r0,8)
+ rotrwi $s1,$s1,8
+ xor $s0,$s0,$acc00 # ^= r2^r0
+ rotrwi $s2,$s2,8
+ xor $s1,$s1,$acc01
+ rotrwi $s3,$s3,8
+ xor $s2,$s2,$acc02
+ xor $s3,$s3,$acc03
+ xor $acc00,$acc00,$acc08
+ xor $acc01,$acc01,$acc09
+ xor $acc02,$acc02,$acc10
+ xor $acc03,$acc03,$acc11
+ xor $s0,$s0,$acc04 # ^= r4^r0
+ rotrwi $acc00,$acc00,24
+ xor $s1,$s1,$acc05
+ rotrwi $acc01,$acc01,24
+ xor $s2,$s2,$acc06
+ rotrwi $acc02,$acc02,24
+ xor $s3,$s3,$acc07
+ rotrwi $acc03,$acc03,24
+ xor $acc04,$acc04,$acc08
+ xor $acc05,$acc05,$acc09
+ xor $acc06,$acc06,$acc10
+ xor $acc07,$acc07,$acc11
+ xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
+ rotrwi $acc04,$acc04,16
+ xor $s1,$s1,$acc09
+ rotrwi $acc05,$acc05,16
+ xor $s2,$s2,$acc10
+ rotrwi $acc06,$acc06,16
+ xor $s3,$s3,$acc11
+ rotrwi $acc07,$acc07,16
+ xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
+ rotrwi $acc08,$acc08,8
+ xor $s1,$s1,$acc01
+ rotrwi $acc09,$acc09,8
+ xor $s2,$s2,$acc02
+ rotrwi $acc10,$acc10,8
+ xor $s3,$s3,$acc03
+ rotrwi $acc11,$acc11,8
+ xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
+ xor $s1,$s1,$acc05
+ xor $s2,$s2,$acc06
+ xor $s3,$s3,$acc07
+ xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
+ xor $s1,$s1,$acc09
+ xor $s2,$s2,$acc10
+ xor $s3,$s3,$acc11
+
+ b Ldec_compact_loop
+.align 4
+Ldec_compact_done:
+ xor $s0,$s0,$t0
+ xor $s1,$s1,$t1
+ xor $s2,$s2,$t2
+ xor $s3,$s3,$t3
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.size .AES_decrypt,.-.AES_decrypt
+
+.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
+.align 7
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aes-s390x.pl b/openssl-1.1.0h/crypto/aes/asm/aes-s390x.pl
new file mode 100644
index 0000000..fd8a737
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aes-s390x.pl
@@ -0,0 +1,2226 @@
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for s390x.
+
+# April 2007.
+#
+# Software performance improvement over gcc-generated code is ~70% and
+# in absolute terms is ~73 cycles per byte processed with 128-bit key.
+# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
+# *strictly* in-order execution and issued instruction [in this case
+# load value from memory is critical] has to complete before execution
+# flow proceeds. S-boxes are compressed to 2KB[+256B].
+#
+# As for hardware acceleration support. It's basically a "teaser," as
+# it can and should be improved in several ways. Most notably support
+# for CBC is not utilized, nor multiple blocks are ever processed.
+# Then software key schedule can be postponed till hardware support
+# detection... Performance improvement over assembler is reportedly
+# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
+# support is implemented.
+
+# May 2007.
+#
+# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
+# for 128-bit keys, if hardware support is detected.
+
+# Januray 2009.
+#
+# Add support for hardware AES192/256 and reschedule instructions to
+# minimize/avoid Address Generation Interlock hazard and to favour
+# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
+# almost 50% on z9. The gain is smaller on z10, because being dual-
+# issue z10 makes it improssible to eliminate the interlock condition:
+# critial path is not long enough. Yet it spends ~24 cycles per byte
+# processed with 128-bit key.
+#
+# Unlike previous version hardware support detection takes place only
+# at the moment of key schedule setup, which is denoted in key->rounds.
+# This is done, because deferred key setup can't be made MT-safe, not
+# for keys longer than 128 bits.
+#
+# Add AES_cbc_encrypt, which gives incredible performance improvement,
+# it was measured to be ~6.6x. It's less than previously mentioned 8x,
+# because software implementation was optimized.
+
+# May 2010.
+#
+# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
+# performance improvement over "generic" counter mode routine relying
+# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
+# to the fact that exact throughput value depends on current stack
+# frame alignment within 4KB page. In worst case you get ~75% of the
+# maximum, but *on average* it would be as much as ~98%. Meaning that
+# worst case is unlike, it's like hitting ravine on plateau.
+
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2x better than code generated by gcc 4.3.
+
+# December 2010.
+#
+# Add support for z196 "cipher message with counter" instruction.
+# Note however that it's disengaged, because it was measured to
+# perform ~12% worse than vanilla km-based code...
+
+# February 2011.
+#
+# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
+# instructions, which deliver ~70% improvement at 8KB block size over
+# vanilla km-based code, 37% - at most like 512-bytes block size.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$softonly=0; # allow hardware support
+
+$t0="%r0"; $mask="%r0";
+$t1="%r1";
+$t2="%r2"; $inp="%r2";
+$t3="%r3"; $out="%r3"; $bits="%r3";
+$key="%r4";
+$i1="%r5";
+$i2="%r6";
+$i3="%r7";
+$s0="%r8";
+$s1="%r9";
+$s2="%r10";
+$s3="%r11";
+$tbl="%r12";
+$rounds="%r13";
+$ra="%r14";
+$sp="%r15";
+
+$stdframe=16*$SIZE_T+4*8;
+
+sub _data_word()
+{ my $i;
+ while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
+}
+
+$code=<<___;
+.text
+
+.type AES_Te,\@object
+.align 256
+AES_Te:
+___
+&_data_word(
+ 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
+ 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
+ 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
+ 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
+ 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
+ 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
+ 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
+ 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
+ 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
+ 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
+ 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
+ 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
+ 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
+ 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
+ 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
+ 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
+ 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
+ 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
+ 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
+ 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
+ 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
+ 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
+ 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
+ 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
+ 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
+ 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
+ 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
+ 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
+ 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
+ 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
+ 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
+ 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
+ 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
+ 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
+ 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
+ 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
+ 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
+ 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
+ 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
+ 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
+ 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
+ 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
+ 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
+ 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
+ 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
+ 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
+ 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
+ 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
+ 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
+ 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
+ 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
+ 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
+ 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
+ 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
+ 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
+ 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
+ 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
+ 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
+ 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
+ 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
+ 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
+ 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
+ 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
+ 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
+$code.=<<___;
+# Te4[256]
+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+# rcon[]
+.long 0x01000000, 0x02000000, 0x04000000, 0x08000000
+.long 0x10000000, 0x20000000, 0x40000000, 0x80000000
+.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
+.align 256
+.size AES_Te,.-AES_Te
+
+# void AES_encrypt(const unsigned char *inp, unsigned char *out,
+# const AES_KEY *key) {
+.globl AES_encrypt
+.type AES_encrypt,\@function
+AES_encrypt:
+___
+$code.=<<___ if (!$softonly);
+ l %r0,240($key)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lesoft
+
+ la %r1,0($key)
+ #la %r2,0($inp)
+ la %r4,0($out)
+ lghi %r3,16 # single block length
+ .long 0xb92e0042 # km %r4,%r2
+ brc 1,.-4 # can this happen?
+ br %r14
+.align 64
+.Lesoft:
+___
+$code.=<<___;
+ stm${g} %r3,$ra,3*$SIZE_T($sp)
+
+ llgf $s0,0($inp)
+ llgf $s1,4($inp)
+ llgf $s2,8($inp)
+ llgf $s3,12($inp)
+
+ larl $tbl,AES_Te
+ bras $ra,_s390x_AES_encrypt
+
+ l${g} $out,3*$SIZE_T($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_encrypt,.-AES_encrypt
+
+.type _s390x_AES_encrypt,\@function
+.align 16
+_s390x_AES_encrypt:
+ st${g} $ra,15*$SIZE_T($sp)
+ x $s0,0($key)
+ x $s1,4($key)
+ x $s2,8($key)
+ x $s3,12($key)
+ l $rounds,240($key)
+ llill $mask,`0xff<<3`
+ aghi $rounds,-1
+ j .Lenc_loop
+.align 16
+.Lenc_loop:
+ sllg $t1,$s0,`0+3`
+ srlg $t2,$s0,`8-3`
+ srlg $t3,$s0,`16-3`
+ srl $s0,`24-3`
+ nr $s0,$mask
+ ngr $t1,$mask
+ nr $t2,$mask
+ nr $t3,$mask
+
+ srlg $i1,$s1,`16-3` # i0
+ sllg $i2,$s1,`0+3`
+ srlg $i3,$s1,`8-3`
+ srl $s1,`24-3`
+ nr $i1,$mask
+ nr $s1,$mask
+ ngr $i2,$mask
+ nr $i3,$mask
+
+ l $s0,0($s0,$tbl) # Te0[s0>>24]
+ l $t1,1($t1,$tbl) # Te3[s0>>0]
+ l $t2,2($t2,$tbl) # Te2[s0>>8]
+ l $t3,3($t3,$tbl) # Te1[s0>>16]
+
+ x $s0,3($i1,$tbl) # Te1[s1>>16]
+ l $s1,0($s1,$tbl) # Te0[s1>>24]
+ x $t2,1($i2,$tbl) # Te3[s1>>0]
+ x $t3,2($i3,$tbl) # Te2[s1>>8]
+
+ srlg $i1,$s2,`8-3` # i0
+ srlg $i2,$s2,`16-3` # i1
+ nr $i1,$mask
+ nr $i2,$mask
+ sllg $i3,$s2,`0+3`
+ srl $s2,`24-3`
+ nr $s2,$mask
+ ngr $i3,$mask
+
+ xr $s1,$t1
+ srlg $ra,$s3,`8-3` # i1
+ sllg $t1,$s3,`0+3` # i0
+ nr $ra,$mask
+ la $key,16($key)
+ ngr $t1,$mask
+
+ x $s0,2($i1,$tbl) # Te2[s2>>8]
+ x $s1,3($i2,$tbl) # Te1[s2>>16]
+ l $s2,0($s2,$tbl) # Te0[s2>>24]
+ x $t3,1($i3,$tbl) # Te3[s2>>0]
+
+ srlg $i3,$s3,`16-3` # i2
+ xr $s2,$t2
+ srl $s3,`24-3`
+ nr $i3,$mask
+ nr $s3,$mask
+
+ x $s0,0($key)
+ x $s1,4($key)
+ x $s2,8($key)
+ x $t3,12($key)
+
+ x $s0,1($t1,$tbl) # Te3[s3>>0]
+ x $s1,2($ra,$tbl) # Te2[s3>>8]
+ x $s2,3($i3,$tbl) # Te1[s3>>16]
+ l $s3,0($s3,$tbl) # Te0[s3>>24]
+ xr $s3,$t3
+
+ brct $rounds,.Lenc_loop
+ .align 16
+
+ sllg $t1,$s0,`0+3`
+ srlg $t2,$s0,`8-3`
+ ngr $t1,$mask
+ srlg $t3,$s0,`16-3`
+ srl $s0,`24-3`
+ nr $s0,$mask
+ nr $t2,$mask
+ nr $t3,$mask
+
+ srlg $i1,$s1,`16-3` # i0
+ sllg $i2,$s1,`0+3`
+ ngr $i2,$mask
+ srlg $i3,$s1,`8-3`
+ srl $s1,`24-3`
+ nr $i1,$mask
+ nr $s1,$mask
+ nr $i3,$mask
+
+ llgc $s0,2($s0,$tbl) # Te4[s0>>24]
+ llgc $t1,2($t1,$tbl) # Te4[s0>>0]
+ sll $s0,24
+ llgc $t2,2($t2,$tbl) # Te4[s0>>8]
+ llgc $t3,2($t3,$tbl) # Te4[s0>>16]
+ sll $t2,8
+ sll $t3,16
+
+ llgc $i1,2($i1,$tbl) # Te4[s1>>16]
+ llgc $s1,2($s1,$tbl) # Te4[s1>>24]
+ llgc $i2,2($i2,$tbl) # Te4[s1>>0]
+ llgc $i3,2($i3,$tbl) # Te4[s1>>8]
+ sll $i1,16
+ sll $s1,24
+ sll $i3,8
+ or $s0,$i1
+ or $s1,$t1
+ or $t2,$i2
+ or $t3,$i3
+
+ srlg $i1,$s2,`8-3` # i0
+ srlg $i2,$s2,`16-3` # i1
+ nr $i1,$mask
+ nr $i2,$mask
+ sllg $i3,$s2,`0+3`
+ srl $s2,`24-3`
+ ngr $i3,$mask
+ nr $s2,$mask
+
+ sllg $t1,$s3,`0+3` # i0
+ srlg $ra,$s3,`8-3` # i1
+ ngr $t1,$mask
+
+ llgc $i1,2($i1,$tbl) # Te4[s2>>8]
+ llgc $i2,2($i2,$tbl) # Te4[s2>>16]
+ sll $i1,8
+ llgc $s2,2($s2,$tbl) # Te4[s2>>24]
+ llgc $i3,2($i3,$tbl) # Te4[s2>>0]
+ sll $i2,16
+ nr $ra,$mask
+ sll $s2,24
+ or $s0,$i1
+ or $s1,$i2
+ or $s2,$t2
+ or $t3,$i3
+
+ srlg $i3,$s3,`16-3` # i2
+ srl $s3,`24-3`
+ nr $i3,$mask
+ nr $s3,$mask
+
+ l $t0,16($key)
+ l $t2,20($key)
+
+ llgc $i1,2($t1,$tbl) # Te4[s3>>0]
+ llgc $i2,2($ra,$tbl) # Te4[s3>>8]
+ llgc $i3,2($i3,$tbl) # Te4[s3>>16]
+ llgc $s3,2($s3,$tbl) # Te4[s3>>24]
+ sll $i2,8
+ sll $i3,16
+ sll $s3,24
+ or $s0,$i1
+ or $s1,$i2
+ or $s2,$i3
+ or $s3,$t3
+
+ l${g} $ra,15*$SIZE_T($sp)
+ xr $s0,$t0
+ xr $s1,$t2
+ x $s2,24($key)
+ x $s3,28($key)
+
+ br $ra
+.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
+___
+
+$code.=<<___;
+.type AES_Td,\@object
+.align 256
+AES_Td:
+___
+&_data_word(
+ 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
+ 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
+ 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
+ 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
+ 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
+ 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
+ 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
+ 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
+ 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
+ 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
+ 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
+ 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
+ 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
+ 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
+ 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
+ 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
+ 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
+ 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
+ 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
+ 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
+ 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
+ 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
+ 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
+ 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
+ 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
+ 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
+ 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
+ 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
+ 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
+ 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
+ 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
+ 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
+ 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
+ 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
+ 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
+ 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
+ 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
+ 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
+ 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
+ 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
+ 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
+ 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
+ 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
+ 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
+ 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
+ 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
+ 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
+ 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
+ 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
+ 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
+ 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
+ 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
+ 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
+ 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
+ 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
+ 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
+ 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
+ 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
+ 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
+ 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
+ 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
+ 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
+ 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
+ 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
+$code.=<<___;
+# Td4[256]
+.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+.size AES_Td,.-AES_Td
+
+# void AES_decrypt(const unsigned char *inp, unsigned char *out,
+# const AES_KEY *key) {
+.globl AES_decrypt
+.type AES_decrypt,\@function
+AES_decrypt:
+___
+$code.=<<___ if (!$softonly);
+ l %r0,240($key)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Ldsoft
+
+ la %r1,0($key)
+ #la %r2,0($inp)
+ la %r4,0($out)
+ lghi %r3,16 # single block length
+ .long 0xb92e0042 # km %r4,%r2
+ brc 1,.-4 # can this happen?
+ br %r14
+.align 64
+.Ldsoft:
+___
+$code.=<<___;
+ stm${g} %r3,$ra,3*$SIZE_T($sp)
+
+ llgf $s0,0($inp)
+ llgf $s1,4($inp)
+ llgf $s2,8($inp)
+ llgf $s3,12($inp)
+
+ larl $tbl,AES_Td
+ bras $ra,_s390x_AES_decrypt
+
+ l${g} $out,3*$SIZE_T($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_decrypt,.-AES_decrypt
+
+.type _s390x_AES_decrypt,\@function
+.align 16
+_s390x_AES_decrypt:
+ st${g} $ra,15*$SIZE_T($sp)
+ x $s0,0($key)
+ x $s1,4($key)
+ x $s2,8($key)
+ x $s3,12($key)
+ l $rounds,240($key)
+ llill $mask,`0xff<<3`
+ aghi $rounds,-1
+ j .Ldec_loop
+.align 16
+.Ldec_loop:
+ srlg $t1,$s0,`16-3`
+ srlg $t2,$s0,`8-3`
+ sllg $t3,$s0,`0+3`
+ srl $s0,`24-3`
+ nr $s0,$mask
+ nr $t1,$mask
+ nr $t2,$mask
+ ngr $t3,$mask
+
+ sllg $i1,$s1,`0+3` # i0
+ srlg $i2,$s1,`16-3`
+ srlg $i3,$s1,`8-3`
+ srl $s1,`24-3`
+ ngr $i1,$mask
+ nr $s1,$mask
+ nr $i2,$mask
+ nr $i3,$mask
+
+ l $s0,0($s0,$tbl) # Td0[s0>>24]
+ l $t1,3($t1,$tbl) # Td1[s0>>16]
+ l $t2,2($t2,$tbl) # Td2[s0>>8]
+ l $t3,1($t3,$tbl) # Td3[s0>>0]
+
+ x $s0,1($i1,$tbl) # Td3[s1>>0]
+ l $s1,0($s1,$tbl) # Td0[s1>>24]
+ x $t2,3($i2,$tbl) # Td1[s1>>16]
+ x $t3,2($i3,$tbl) # Td2[s1>>8]
+
+ srlg $i1,$s2,`8-3` # i0
+ sllg $i2,$s2,`0+3` # i1
+ srlg $i3,$s2,`16-3`
+ srl $s2,`24-3`
+ nr $i1,$mask
+ ngr $i2,$mask
+ nr $s2,$mask
+ nr $i3,$mask
+
+ xr $s1,$t1
+ srlg $ra,$s3,`8-3` # i1
+ srlg $t1,$s3,`16-3` # i0
+ nr $ra,$mask
+ la $key,16($key)
+ nr $t1,$mask
+
+ x $s0,2($i1,$tbl) # Td2[s2>>8]
+ x $s1,1($i2,$tbl) # Td3[s2>>0]
+ l $s2,0($s2,$tbl) # Td0[s2>>24]
+ x $t3,3($i3,$tbl) # Td1[s2>>16]
+
+ sllg $i3,$s3,`0+3` # i2
+ srl $s3,`24-3`
+ ngr $i3,$mask
+ nr $s3,$mask
+
+ xr $s2,$t2
+ x $s0,0($key)
+ x $s1,4($key)
+ x $s2,8($key)
+ x $t3,12($key)
+
+ x $s0,3($t1,$tbl) # Td1[s3>>16]
+ x $s1,2($ra,$tbl) # Td2[s3>>8]
+ x $s2,1($i3,$tbl) # Td3[s3>>0]
+ l $s3,0($s3,$tbl) # Td0[s3>>24]
+ xr $s3,$t3
+
+ brct $rounds,.Ldec_loop
+ .align 16
+
+ l $t1,`2048+0`($tbl) # prefetch Td4
+ l $t2,`2048+64`($tbl)
+ l $t3,`2048+128`($tbl)
+ l $i1,`2048+192`($tbl)
+ llill $mask,0xff
+
+ srlg $i3,$s0,24 # i0
+ srlg $t1,$s0,16
+ srlg $t2,$s0,8
+ nr $s0,$mask # i3
+ nr $t1,$mask
+
+ srlg $i1,$s1,24
+ nr $t2,$mask
+ srlg $i2,$s1,16
+ srlg $ra,$s1,8
+ nr $s1,$mask # i0
+ nr $i2,$mask
+ nr $ra,$mask
+
+ llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
+ llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
+ llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
+ sll $t1,16
+ llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
+ sllg $s0,$i3,24
+ sll $t2,8
+
+ llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
+ llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
+ llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
+ sll $i1,24
+ llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
+ sll $i2,16
+ sll $i3,8
+ or $s0,$s1
+ or $t1,$i1
+ or $t2,$i2
+ or $t3,$i3
+
+ srlg $i1,$s2,8 # i0
+ srlg $i2,$s2,24
+ srlg $i3,$s2,16
+ nr $s2,$mask # i1
+ nr $i1,$mask
+ nr $i3,$mask
+ llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
+ llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
+ llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
+ llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
+ sll $i1,8
+ sll $i2,24
+ or $s0,$i1
+ sll $i3,16
+ or $t2,$i2
+ or $t3,$i3
+
+ srlg $i1,$s3,16 # i0
+ srlg $i2,$s3,8 # i1
+ srlg $i3,$s3,24
+ nr $s3,$mask # i2
+ nr $i1,$mask
+ nr $i2,$mask
+
+ l${g} $ra,15*$SIZE_T($sp)
+ or $s1,$t1
+ l $t0,16($key)
+ l $t1,20($key)
+
+ llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
+ llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
+ sll $i1,16
+ llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
+ llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
+ sll $i2,8
+ sll $s3,24
+ or $s0,$i1
+ or $s1,$i2
+ or $s2,$t2
+ or $s3,$t3
+
+ xr $s0,$t0
+ xr $s1,$t1
+ x $s2,24($key)
+ x $s3,28($key)
+
+ br $ra
+.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
+___
+
+$code.=<<___;
+# void AES_set_encrypt_key(const unsigned char *in, int bits,
+# AES_KEY *key) {
+.globl AES_set_encrypt_key
+.type AES_set_encrypt_key,\@function
+.align 16
+AES_set_encrypt_key:
+_s390x_AES_set_encrypt_key:
+ lghi $t0,0
+ cl${g}r $inp,$t0
+ je .Lminus1
+ cl${g}r $key,$t0
+ je .Lminus1
+
+ lghi $t0,128
+ clr $bits,$t0
+ je .Lproceed
+ lghi $t0,192
+ clr $bits,$t0
+ je .Lproceed
+ lghi $t0,256
+ clr $bits,$t0
+ je .Lproceed
+ lghi %r2,-2
+ br %r14
+
+.align 16
+.Lproceed:
+___
+$code.=<<___ if (!$softonly);
+ # convert bits to km(c) code, [128,192,256]->[18,19,20]
+ lhi %r5,-128
+ lhi %r0,18
+ ar %r5,$bits
+ srl %r5,6
+ ar %r5,%r0
+
+ larl %r1,OPENSSL_s390xcap_P
+ llihh %r0,0x8000
+ srlg %r0,%r0,0(%r5)
+ ng %r0,32(%r1) # check availability of both km...
+ ng %r0,48(%r1) # ...and kmc support for given key length
+ jz .Lekey_internal
+
+ lmg %r0,%r1,0($inp) # just copy 128 bits...
+ stmg %r0,%r1,0($key)
+ lhi %r0,192
+ cr $bits,%r0
+ jl 1f
+ lg %r1,16($inp)
+ stg %r1,16($key)
+ je 1f
+ lg %r1,24($inp)
+ stg %r1,24($key)
+1: st $bits,236($key) # save bits [for debugging purposes]
+ lgr $t0,%r5
+ st %r5,240($key) # save km(c) code
+ lghi %r2,0
+ br %r14
+___
+$code.=<<___;
+.align 16
+.Lekey_internal:
+ stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
+
+ larl $tbl,AES_Te+2048
+
+ llgf $s0,0($inp)
+ llgf $s1,4($inp)
+ llgf $s2,8($inp)
+ llgf $s3,12($inp)
+ st $s0,0($key)
+ st $s1,4($key)
+ st $s2,8($key)
+ st $s3,12($key)
+ lghi $t0,128
+ cr $bits,$t0
+ jne .Lnot128
+
+ llill $mask,0xff
+ lghi $t3,0 # i=0
+ lghi $rounds,10
+ st $rounds,240($key)
+
+ llgfr $t2,$s3 # temp=rk[3]
+ srlg $i1,$s3,8
+ srlg $i2,$s3,16
+ srlg $i3,$s3,24
+ nr $t2,$mask
+ nr $i1,$mask
+ nr $i2,$mask
+
+.align 16
+.L128_loop:
+ la $t2,0($t2,$tbl)
+ la $i1,0($i1,$tbl)
+ la $i2,0($i2,$tbl)
+ la $i3,0($i3,$tbl)
+ icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
+ icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
+ icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
+ icm $t2,1,0($i3) # Te4[rk[3]>>24]
+ x $t2,256($t3,$tbl) # rcon[i]
+ xr $s0,$t2 # rk[4]=rk[0]^...
+ xr $s1,$s0 # rk[5]=rk[1]^rk[4]
+ xr $s2,$s1 # rk[6]=rk[2]^rk[5]
+ xr $s3,$s2 # rk[7]=rk[3]^rk[6]
+
+ llgfr $t2,$s3 # temp=rk[3]
+ srlg $i1,$s3,8
+ srlg $i2,$s3,16
+ nr $t2,$mask
+ nr $i1,$mask
+ srlg $i3,$s3,24
+ nr $i2,$mask
+
+ st $s0,16($key)
+ st $s1,20($key)
+ st $s2,24($key)
+ st $s3,28($key)
+ la $key,16($key) # key+=4
+ la $t3,4($t3) # i++
+ brct $rounds,.L128_loop
+ lghi $t0,10
+ lghi %r2,0
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
+ br $ra
+
+.align 16
+.Lnot128:
+ llgf $t0,16($inp)
+ llgf $t1,20($inp)
+ st $t0,16($key)
+ st $t1,20($key)
+ lghi $t0,192
+ cr $bits,$t0
+ jne .Lnot192
+
+ llill $mask,0xff
+ lghi $t3,0 # i=0
+ lghi $rounds,12
+ st $rounds,240($key)
+ lghi $rounds,8
+
+ srlg $i1,$t1,8
+ srlg $i2,$t1,16
+ srlg $i3,$t1,24
+ nr $t1,$mask
+ nr $i1,$mask
+ nr $i2,$mask
+
+.align 16
+.L192_loop:
+ la $t1,0($t1,$tbl)
+ la $i1,0($i1,$tbl)
+ la $i2,0($i2,$tbl)
+ la $i3,0($i3,$tbl)
+ icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
+ icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
+ icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
+ icm $t1,1,0($i3) # Te4[rk[5]>>24]
+ x $t1,256($t3,$tbl) # rcon[i]
+ xr $s0,$t1 # rk[6]=rk[0]^...
+ xr $s1,$s0 # rk[7]=rk[1]^rk[6]
+ xr $s2,$s1 # rk[8]=rk[2]^rk[7]
+ xr $s3,$s2 # rk[9]=rk[3]^rk[8]
+
+ st $s0,24($key)
+ st $s1,28($key)
+ st $s2,32($key)
+ st $s3,36($key)
+ brct $rounds,.L192_continue
+ lghi $t0,12
+ lghi %r2,0
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
+ br $ra
+
+.align 16
+.L192_continue:
+ lgr $t1,$s3
+ x $t1,16($key) # rk[10]=rk[4]^rk[9]
+ st $t1,40($key)
+ x $t1,20($key) # rk[11]=rk[5]^rk[10]
+ st $t1,44($key)
+
+ srlg $i1,$t1,8
+ srlg $i2,$t1,16
+ srlg $i3,$t1,24
+ nr $t1,$mask
+ nr $i1,$mask
+ nr $i2,$mask
+
+ la $key,24($key) # key+=6
+ la $t3,4($t3) # i++
+ j .L192_loop
+
+.align 16
+.Lnot192:
+ llgf $t0,24($inp)
+ llgf $t1,28($inp)
+ st $t0,24($key)
+ st $t1,28($key)
+ llill $mask,0xff
+ lghi $t3,0 # i=0
+ lghi $rounds,14
+ st $rounds,240($key)
+ lghi $rounds,7
+
+ srlg $i1,$t1,8
+ srlg $i2,$t1,16
+ srlg $i3,$t1,24
+ nr $t1,$mask
+ nr $i1,$mask
+ nr $i2,$mask
+
+.align 16
+.L256_loop:
+ la $t1,0($t1,$tbl)
+ la $i1,0($i1,$tbl)
+ la $i2,0($i2,$tbl)
+ la $i3,0($i3,$tbl)
+ icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
+ icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
+ icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
+ icm $t1,1,0($i3) # Te4[rk[7]>>24]
+ x $t1,256($t3,$tbl) # rcon[i]
+ xr $s0,$t1 # rk[8]=rk[0]^...
+ xr $s1,$s0 # rk[9]=rk[1]^rk[8]
+ xr $s2,$s1 # rk[10]=rk[2]^rk[9]
+ xr $s3,$s2 # rk[11]=rk[3]^rk[10]
+ st $s0,32($key)
+ st $s1,36($key)
+ st $s2,40($key)
+ st $s3,44($key)
+ brct $rounds,.L256_continue
+ lghi $t0,14
+ lghi %r2,0
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
+ br $ra
+
+.align 16
+.L256_continue:
+ lgr $t1,$s3 # temp=rk[11]
+ srlg $i1,$s3,8
+ srlg $i2,$s3,16
+ srlg $i3,$s3,24
+ nr $t1,$mask
+ nr $i1,$mask
+ nr $i2,$mask
+ la $t1,0($t1,$tbl)
+ la $i1,0($i1,$tbl)
+ la $i2,0($i2,$tbl)
+ la $i3,0($i3,$tbl)
+ llgc $t1,0($t1) # Te4[rk[11]>>0]
+ icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
+ icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
+ icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
+ x $t1,16($key) # rk[12]=rk[4]^...
+ st $t1,48($key)
+ x $t1,20($key) # rk[13]=rk[5]^rk[12]
+ st $t1,52($key)
+ x $t1,24($key) # rk[14]=rk[6]^rk[13]
+ st $t1,56($key)
+ x $t1,28($key) # rk[15]=rk[7]^rk[14]
+ st $t1,60($key)
+
+ srlg $i1,$t1,8
+ srlg $i2,$t1,16
+ srlg $i3,$t1,24
+ nr $t1,$mask
+ nr $i1,$mask
+ nr $i2,$mask
+
+ la $key,32($key) # key+=8
+ la $t3,4($t3) # i++
+ j .L256_loop
+
+.Lminus1:
+ lghi %r2,-1
+ br $ra
+.size AES_set_encrypt_key,.-AES_set_encrypt_key
+
+# void AES_set_decrypt_key(const unsigned char *in, int bits,
+# AES_KEY *key) {
+.globl AES_set_decrypt_key
+.type AES_set_decrypt_key,\@function
+.align 16
+AES_set_decrypt_key:
+ #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
+ st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
+ bras $ra,_s390x_AES_set_encrypt_key
+ #l${g} $key,4*$SIZE_T($sp)
+ l${g} $ra,14*$SIZE_T($sp)
+ ltgr %r2,%r2
+ bnzr $ra
+___
+$code.=<<___ if (!$softonly);
+ #l $t0,240($key)
+ lhi $t1,16
+ cr $t0,$t1
+ jl .Lgo
+ oill $t0,0x80 # set "decrypt" bit
+ st $t0,240($key)
+ br $ra
+___
+$code.=<<___;
+.align 16
+.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
+ la $i1,0($key)
+ sllg $i2,$rounds,4
+ la $i2,0($i2,$key)
+ srl $rounds,1
+ lghi $t1,-16
+
+.align 16
+.Linv: lmg $s0,$s1,0($i1)
+ lmg $s2,$s3,0($i2)
+ stmg $s0,$s1,0($i2)
+ stmg $s2,$s3,0($i1)
+ la $i1,16($i1)
+ la $i2,0($t1,$i2)
+ brct $rounds,.Linv
+___
+$mask80=$i1;
+$mask1b=$i2;
+$maskfe=$i3;
+$code.=<<___;
+ llgf $rounds,240($key)
+ aghi $rounds,-1
+ sll $rounds,2 # (rounds-1)*4
+ llilh $mask80,0x8080
+ llilh $mask1b,0x1b1b
+ llilh $maskfe,0xfefe
+ oill $mask80,0x8080
+ oill $mask1b,0x1b1b
+ oill $maskfe,0xfefe
+
+.align 16
+.Lmix: l $s0,16($key) # tp1
+ lr $s1,$s0
+ ngr $s1,$mask80
+ srlg $t1,$s1,7
+ slr $s1,$t1
+ nr $s1,$mask1b
+ sllg $t1,$s0,1
+ nr $t1,$maskfe
+ xr $s1,$t1 # tp2
+
+ lr $s2,$s1
+ ngr $s2,$mask80
+ srlg $t1,$s2,7
+ slr $s2,$t1
+ nr $s2,$mask1b
+ sllg $t1,$s1,1
+ nr $t1,$maskfe
+ xr $s2,$t1 # tp4
+
+ lr $s3,$s2
+ ngr $s3,$mask80
+ srlg $t1,$s3,7
+ slr $s3,$t1
+ nr $s3,$mask1b
+ sllg $t1,$s2,1
+ nr $t1,$maskfe
+ xr $s3,$t1 # tp8
+
+ xr $s1,$s0 # tp2^tp1
+ xr $s2,$s0 # tp4^tp1
+ rll $s0,$s0,24 # = ROTATE(tp1,8)
+ xr $s2,$s3 # ^=tp8
+ xr $s0,$s1 # ^=tp2^tp1
+ xr $s1,$s3 # tp2^tp1^tp8
+ xr $s0,$s2 # ^=tp4^tp1^tp8
+ rll $s1,$s1,8
+ rll $s2,$s2,16
+ xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
+ rll $s3,$s3,24
+ xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
+ xr $s0,$s3 # ^= ROTATE(tp8,8)
+
+ st $s0,16($key)
+ la $key,4($key)
+ brct $rounds,.Lmix
+
+ lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
+ lghi %r2,0
+ br $ra
+.size AES_set_decrypt_key,.-AES_set_decrypt_key
+___
+
+########################################################################
+# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivec, const int enc)
+{
+my $inp="%r2";
+my $out="%r4"; # length and out are swapped
+my $len="%r3";
+my $key="%r5";
+my $ivp="%r6";
+
+$code.=<<___;
+.globl AES_cbc_encrypt
+.type AES_cbc_encrypt,\@function
+.align 16
+AES_cbc_encrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, out and len
+ xgr %r4,%r3
+ xgr %r3,%r4
+___
+$code.=<<___ if (!$softonly);
+ lhi %r0,16
+ cl %r0,240($key)
+ jh .Lcbc_software
+
+ lg %r0,0($ivp) # copy ivec
+ lg %r1,8($ivp)
+ stmg %r0,%r1,16($sp)
+ lmg %r0,%r1,0($key) # copy key, cover 256 bit
+ stmg %r0,%r1,32($sp)
+ lmg %r0,%r1,16($key)
+ stmg %r0,%r1,48($sp)
+ l %r0,240($key) # load kmc code
+ lghi $key,15 # res=len%16, len-=res;
+ ngr $key,$len
+ sl${g}r $len,$key
+ la %r1,16($sp) # parameter block - ivec || key
+ jz .Lkmc_truncated
+ .long 0xb92f0042 # kmc %r4,%r2
+ brc 1,.-4 # pay attention to "partial completion"
+ ltr $key,$key
+ jnz .Lkmc_truncated
+.Lkmc_done:
+ lmg %r0,%r1,16($sp) # copy ivec to caller
+ stg %r0,0($ivp)
+ stg %r1,8($ivp)
+ br $ra
+.align 16
+.Lkmc_truncated:
+ ahi $key,-1 # it's the way it's encoded in mvc
+ tmll %r0,0x80
+ jnz .Lkmc_truncated_dec
+ lghi %r1,0
+ stg %r1,16*$SIZE_T($sp)
+ stg %r1,16*$SIZE_T+8($sp)
+ bras %r1,1f
+ mvc 16*$SIZE_T(1,$sp),0($inp)
+1: ex $key,0(%r1)
+ la %r1,16($sp) # restore parameter block
+ la $inp,16*$SIZE_T($sp)
+ lghi $len,16
+ .long 0xb92f0042 # kmc %r4,%r2
+ j .Lkmc_done
+.align 16
+.Lkmc_truncated_dec:
+ st${g} $out,4*$SIZE_T($sp)
+ la $out,16*$SIZE_T($sp)
+ lghi $len,16
+ .long 0xb92f0042 # kmc %r4,%r2
+ l${g} $out,4*$SIZE_T($sp)
+ bras %r1,2f
+ mvc 0(1,$out),16*$SIZE_T($sp)
+2: ex $key,0(%r1)
+ j .Lkmc_done
+.align 16
+.Lcbc_software:
+___
+$code.=<<___;
+ stm${g} $key,$ra,5*$SIZE_T($sp)
+ lhi %r0,0
+ cl %r0,`$stdframe+$SIZE_T-4`($sp)
+ je .Lcbc_decrypt
+
+ larl $tbl,AES_Te
+
+ llgf $s0,0($ivp)
+ llgf $s1,4($ivp)
+ llgf $s2,8($ivp)
+ llgf $s3,12($ivp)
+
+ lghi $t0,16
+ sl${g}r $len,$t0
+ brc 4,.Lcbc_enc_tail # if borrow
+.Lcbc_enc_loop:
+ stm${g} $inp,$out,2*$SIZE_T($sp)
+ x $s0,0($inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ lgr %r4,$key
+
+ bras $ra,_s390x_AES_encrypt
+
+ lm${g} $inp,$key,2*$SIZE_T($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+
+ la $inp,16($inp)
+ la $out,16($out)
+ lghi $t0,16
+ lt${g}r $len,$len
+ jz .Lcbc_enc_done
+ sl${g}r $len,$t0
+ brc 4,.Lcbc_enc_tail # if borrow
+ j .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_done:
+ l${g} $ivp,6*$SIZE_T($sp)
+ st $s0,0($ivp)
+ st $s1,4($ivp)
+ st $s2,8($ivp)
+ st $s3,12($ivp)
+
+ lm${g} %r7,$ra,7*$SIZE_T($sp)
+ br $ra
+
+.align 16
+.Lcbc_enc_tail:
+ aghi $len,15
+ lghi $t0,0
+ stg $t0,16*$SIZE_T($sp)
+ stg $t0,16*$SIZE_T+8($sp)
+ bras $t1,3f
+ mvc 16*$SIZE_T(1,$sp),0($inp)
+3: ex $len,0($t1)
+ lghi $len,0
+ la $inp,16*$SIZE_T($sp)
+ j .Lcbc_enc_loop
+
+.align 16
+.Lcbc_decrypt:
+ larl $tbl,AES_Td
+
+ lg $t0,0($ivp)
+ lg $t1,8($ivp)
+ stmg $t0,$t1,16*$SIZE_T($sp)
+
+.Lcbc_dec_loop:
+ stm${g} $inp,$out,2*$SIZE_T($sp)
+ llgf $s0,0($inp)
+ llgf $s1,4($inp)
+ llgf $s2,8($inp)
+ llgf $s3,12($inp)
+ lgr %r4,$key
+
+ bras $ra,_s390x_AES_decrypt
+
+ lm${g} $inp,$key,2*$SIZE_T($sp)
+ sllg $s0,$s0,32
+ sllg $s2,$s2,32
+ lr $s0,$s1
+ lr $s2,$s3
+
+ lg $t0,0($inp)
+ lg $t1,8($inp)
+ xg $s0,16*$SIZE_T($sp)
+ xg $s2,16*$SIZE_T+8($sp)
+ lghi $s1,16
+ sl${g}r $len,$s1
+ brc 4,.Lcbc_dec_tail # if borrow
+ brc 2,.Lcbc_dec_done # if zero
+ stg $s0,0($out)
+ stg $s2,8($out)
+ stmg $t0,$t1,16*$SIZE_T($sp)
+
+ la $inp,16($inp)
+ la $out,16($out)
+ j .Lcbc_dec_loop
+
+.Lcbc_dec_done:
+ stg $s0,0($out)
+ stg $s2,8($out)
+.Lcbc_dec_exit:
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ stmg $t0,$t1,0($ivp)
+
+ br $ra
+
+.align 16
+.Lcbc_dec_tail:
+ aghi $len,15
+ stg $s0,16*$SIZE_T($sp)
+ stg $s2,16*$SIZE_T+8($sp)
+ bras $s1,4f
+ mvc 0(1,$out),16*$SIZE_T($sp)
+4: ex $len,0($s1)
+ j .Lcbc_dec_exit
+.size AES_cbc_encrypt,.-AES_cbc_encrypt
+___
+}
+########################################################################
+# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+# size_t blocks, const AES_KEY *key,
+# const unsigned char *ivec)
+{
+my $inp="%r2";
+my $out="%r4"; # blocks and out are swapped
+my $len="%r3";
+my $key="%r5"; my $iv0="%r5";
+my $ivp="%r6";
+my $fp ="%r7";
+
+$code.=<<___;
+.globl AES_ctr32_encrypt
+.type AES_ctr32_encrypt,\@function
+.align 16
+AES_ctr32_encrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+ llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
+___
+$code.=<<___ if (!$softonly);
+ l %r0,240($key)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lctr32_software
+
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ slgr $out,$inp
+ la %r1,0($key) # %r1 is permanent copy of $key
+ lg $iv0,0($ivp) # load ivec
+ lg $ivp,8($ivp)
+
+ # prepare and allocate stack frame at the top of 4K page
+ # with 1K reserved for eventual signal handling
+ lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
+ lghi $s1,-4096
+ algr $s0,$sp
+ lgr $fp,$sp
+ ngr $s0,$s1 # align at page boundary
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+ lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+ # buffer size is at lest 256 and at most 3072+256-16
+
+ la $sp,1024($s0) # alloca
+ srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
+ st${g} $s2,0($sp) # back-chain
+ st${g} $fp,$SIZE_T($sp)
+
+ slgr $len,$fp
+ brc 1,.Lctr32_hw_switch # not zero, no borrow
+ algr $fp,$len # input is shorter than allocated buffer
+ lghi $len,0
+ st${g} $fp,$SIZE_T($sp)
+
+.Lctr32_hw_switch:
+___
+$code.=<<___ if (!$softonly && 0);# kmctr code was measured to be ~12% slower
+ llgfr $s0,%r0
+ lgr $s1,%r1
+ larl %r1,OPENSSL_s390xcap_P
+ llihh %r0,0x8000 # check if kmctr supports the function code
+ srlg %r0,%r0,0($s0)
+ ng %r0,64(%r1) # check kmctr capability vector
+ lgr %r0,$s0
+ lgr %r1,$s1
+ jz .Lctr32_km_loop
+
+####### kmctr code
+ algr $out,$inp # restore $out
+ lgr $s1,$len # $s1 undertakes $len
+ j .Lctr32_kmctr_loop
+.align 16
+.Lctr32_kmctr_loop:
+ la $s2,16($sp)
+ lgr $s3,$fp
+.Lctr32_kmctr_prepare:
+ stg $iv0,0($s2)
+ stg $ivp,8($s2)
+ la $s2,16($s2)
+ ahi $ivp,1 # 32-bit increment, preserves upper half
+ brct $s3,.Lctr32_kmctr_prepare
+
+ #la $inp,0($inp) # inp
+ sllg $len,$fp,4 # len
+ #la $out,0($out) # out
+ la $s2,16($sp) # iv
+ .long 0xb92da042 # kmctr $out,$s2,$inp
+ brc 1,.-4 # pay attention to "partial completion"
+
+ slgr $s1,$fp
+ brc 1,.Lctr32_kmctr_loop # not zero, no borrow
+ algr $fp,$s1
+ lghi $s1,0
+ brc 4+1,.Lctr32_kmctr_loop # not zero
+
+ l${g} $sp,0($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+___
+$code.=<<___ if (!$softonly);
+.Lctr32_km_loop:
+ la $s2,16($sp)
+ lgr $s3,$fp
+.Lctr32_km_prepare:
+ stg $iv0,0($s2)
+ stg $ivp,8($s2)
+ la $s2,16($s2)
+ ahi $ivp,1 # 32-bit increment, preserves upper half
+ brct $s3,.Lctr32_km_prepare
+
+ la $s0,16($sp) # inp
+ sllg $s1,$fp,4 # len
+ la $s2,16($sp) # out
+ .long 0xb92e00a8 # km %r10,%r8
+ brc 1,.-4 # pay attention to "partial completion"
+
+ la $s2,16($sp)
+ lgr $s3,$fp
+ slgr $s2,$inp
+.Lctr32_km_xor:
+ lg $s0,0($inp)
+ lg $s1,8($inp)
+ xg $s0,0($s2,$inp)
+ xg $s1,8($s2,$inp)
+ stg $s0,0($out,$inp)
+ stg $s1,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lctr32_km_xor
+
+ slgr $len,$fp
+ brc 1,.Lctr32_km_loop # not zero, no borrow
+ algr $fp,$len
+ lghi $len,0
+ brc 4+1,.Lctr32_km_loop # not zero
+
+ l${g} $s0,0($sp)
+ l${g} $s1,$SIZE_T($sp)
+ la $s2,16($sp)
+.Lctr32_km_zap:
+ stg $s0,0($s2)
+ stg $s0,8($s2)
+ la $s2,16($s2)
+ brct $s1,.Lctr32_km_zap
+
+ la $sp,0($s0)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lctr32_software:
+___
+$code.=<<___;
+ stm${g} $key,$ra,5*$SIZE_T($sp)
+ sl${g}r $inp,$out
+ larl $tbl,AES_Te
+ llgf $t1,12($ivp)
+
+.Lctr32_loop:
+ stm${g} $inp,$out,2*$SIZE_T($sp)
+ llgf $s0,0($ivp)
+ llgf $s1,4($ivp)
+ llgf $s2,8($ivp)
+ lgr $s3,$t1
+ st $t1,16*$SIZE_T($sp)
+ lgr %r4,$key
+
+ bras $ra,_s390x_AES_encrypt
+
+ lm${g} $inp,$ivp,2*$SIZE_T($sp)
+ llgf $t1,16*$SIZE_T($sp)
+ x $s0,0($inp,$out)
+ x $s1,4($inp,$out)
+ x $s2,8($inp,$out)
+ x $s3,12($inp,$out)
+ stm $s0,$s3,0($out)
+
+ la $out,16($out)
+ ahi $t1,1 # 32-bit increment
+ brct $len,.Lctr32_loop
+
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
+___
+}
+
+########################################################################
+# void AES_xts_encrypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+{
+my $inp="%r2";
+my $out="%r4"; # len and out are swapped
+my $len="%r3";
+my $key1="%r5"; # $i1
+my $key2="%r6"; # $i2
+my $fp="%r7"; # $i3
+my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
+
+$code.=<<___;
+.type _s390x_xts_km,\@function
+.align 16
+_s390x_xts_km:
+___
+$code.=<<___ if(1);
+ llgfr $s0,%r0 # put aside the function code
+ lghi $s1,0x7f
+ nr $s1,%r0
+ larl %r1,OPENSSL_s390xcap_P
+ llihh %r0,0x8000
+ srlg %r0,%r0,32($s1) # check for 32+function code
+ ng %r0,32(%r1) # check km capability vector
+ lgr %r0,$s0 # restore the function code
+ la %r1,0($key1) # restore $key1
+ jz .Lxts_km_vanilla
+
+ lmg $i2,$i3,$tweak($sp) # put aside the tweak value
+ algr $out,$inp
+
+ oill %r0,32 # switch to xts function code
+ aghi $s1,-18 #
+ sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
+ la %r1,$tweak-16($sp)
+ slgr %r1,$s1 # parameter block position
+ lmg $s0,$s3,0($key1) # load 256 bits of key material,
+ stmg $s0,$s3,0(%r1) # and copy it to parameter block.
+ # yes, it contains junk and overlaps
+ # with the tweak in 128-bit case.
+ # it's done to avoid conditional
+ # branch.
+ stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
+
+ .long 0xb92e0042 # km %r4,%r2
+ brc 1,.-4 # pay attention to "partial completion"
+
+ lrvg $s0,$tweak+0($sp) # load the last tweak
+ lrvg $s1,$tweak+8($sp)
+ stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
+
+ nill %r0,0xffdf # switch back to original function code
+ la %r1,0($key1) # restore pointer to $key1
+ slgr $out,$inp
+
+ llgc $len,2*$SIZE_T-1($sp)
+ nill $len,0x0f # $len%=16
+ br $ra
+
+.align 16
+.Lxts_km_vanilla:
+___
+$code.=<<___;
+ # prepare and allocate stack frame at the top of 4K page
+ # with 1K reserved for eventual signal handling
+ lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
+ lghi $s1,-4096
+ algr $s0,$sp
+ lgr $fp,$sp
+ ngr $s0,$s1 # align at page boundary
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+ lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+ # buffer size is at lest 256 and at most 3072+256-16
+
+ la $sp,1024($s0) # alloca
+ nill $fp,0xfff0 # round to 16*n
+ st${g} $s2,0($sp) # back-chain
+ nill $len,0xfff0 # redundant
+ st${g} $fp,$SIZE_T($sp)
+
+ slgr $len,$fp
+ brc 1,.Lxts_km_go # not zero, no borrow
+ algr $fp,$len # input is shorter than allocated buffer
+ lghi $len,0
+ st${g} $fp,$SIZE_T($sp)
+
+.Lxts_km_go:
+ lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
+ lrvg $s1,$tweak+8($s2)
+
+ la $s2,16($sp) # vector of ascending tweak values
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+ j .Lxts_km_start
+
+.Lxts_km_loop:
+ la $s2,16($sp)
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+.Lxts_km_prepare:
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+.Lxts_km_start:
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ stg $i1,0($s2,$inp)
+ stg $i2,8($s2,$inp)
+ xg $i1,0($inp)
+ xg $i2,8($inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lxts_km_prepare
+
+ slgr $inp,$fp # rewind $inp
+ la $s2,0($out,$inp)
+ lgr $s3,$fp
+ .long 0xb92e00aa # km $s2,$s2
+ brc 1,.-4 # pay attention to "partial completion"
+
+ la $s2,16($sp)
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+.Lxts_km_xor:
+ lg $i1,0($out,$inp)
+ lg $i2,8($out,$inp)
+ xg $i1,0($s2,$inp)
+ xg $i2,8($s2,$inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lxts_km_xor
+
+ slgr $len,$fp
+ brc 1,.Lxts_km_loop # not zero, no borrow
+ algr $fp,$len
+ lghi $len,0
+ brc 4+1,.Lxts_km_loop # not zero
+
+ l${g} $i1,0($sp) # back-chain
+ llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
+ la $i2,16($sp)
+ srlg $fp,$fp,4
+.Lxts_km_zap:
+ stg $i1,0($i2)
+ stg $i1,8($i2)
+ la $i2,16($i2)
+ brct $fp,.Lxts_km_zap
+
+ la $sp,0($i1)
+ llgc $len,2*$SIZE_T-1($i1)
+ nill $len,0x0f # $len%=16
+ bzr $ra
+
+ # generate one more tweak...
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+
+ ltr $len,$len # clear zero flag
+ br $ra
+.size _s390x_xts_km,.-_s390x_xts_km
+
+.globl AES_xts_encrypt
+.type AES_xts_encrypt,\@function
+.align 16
+AES_xts_encrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+ llgfr $len,$len
+___
+$code.=<<___;
+ st${g} $len,1*$SIZE_T($sp) # save copy of $len
+ srag $len,$len,4 # formally wrong, because it expands
+ # sign byte, but who can afford asking
+ # to process more than 2^63-1 bytes?
+ # I use it, because it sets condition
+ # code...
+ bcr 8,$ra # abort if zero (i.e. less than 16)
+___
+$code.=<<___ if (!$softonly);
+ llgf %r0,240($key2)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lxts_enc_software
+
+ st${g} $ra,5*$SIZE_T($sp)
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ sllg $len,$len,4 # $len&=~15
+ slgr $out,$inp
+
+ # generate the tweak value
+ l${g} $s3,$stdframe($sp) # pointer to iv
+ la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
+ lghi $s3,16
+ stmg $s0,$s1,0($s2)
+ la %r1,0($key2) # $key2 is not needed anymore
+ .long 0xb92e00aa # km $s2,$s2, generate the tweak
+ brc 1,.-4 # can this happen?
+
+ l %r0,240($key1)
+ la %r1,0($key1) # $key1 is not needed anymore
+ bras $ra,_s390x_xts_km
+ jz .Lxts_enc_km_done
+
+ aghi $inp,-16 # take one step back
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_enc_km_steal:
+ llgc $i1,16($inp)
+ llgc $i2,0($out,$inp)
+ stc $i1,0($out,$inp)
+ stc $i2,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_enc_km_steal
+
+ la $s2,0($i3)
+ lghi $s3,16
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ xg $i1,0($s2)
+ xg $i2,8($s2)
+ stg $i1,0($s2)
+ stg $i2,8($s2)
+ .long 0xb92e00aa # km $s2,$s2
+ brc 1,.-4 # can this happen?
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ xg $i1,0($i3)
+ xg $i2,8($i3)
+ stg $i1,0($i3)
+ stg $i2,8($i3)
+
+.Lxts_enc_km_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lxts_enc_software:
+___
+$code.=<<___;
+ stm${g} %r6,$ra,6*$SIZE_T($sp)
+
+ slgr $out,$inp
+
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
+ stm${g} %r2,%r5,2*$SIZE_T($sp)
+ la $key,0($key2)
+ larl $tbl,AES_Te
+ bras $ra,_s390x_AES_encrypt # generate the tweak
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ stm $s0,$s3,$tweak($sp) # save the tweak
+ j .Lxts_enc_enter
+
+.align 16
+.Lxts_enc_loop:
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+ la $inp,16($inp) # $inp+=16
+.Lxts_enc_enter:
+ x $s0,0($inp) # ^=*($inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
+ la $key,0($key1)
+ bras $ra,_s390x_AES_encrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+ brct${g} $len,.Lxts_enc_loop
+
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ jz .Lxts_enc_done
+
+ la $i3,0($inp,$out) # put aside real $out
+.Lxts_enc_steal:
+ llgc %r0,16($inp)
+ llgc %r1,0($out,$inp)
+ stc %r0,0($out,$inp)
+ stc %r1,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_enc_steal
+ la $out,0($i3) # restore real $out
+
+ # generate last tweak...
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+
+ x $s0,0($out) # ^=*(inp)|stolen cipther-text
+ x $s1,4($out)
+ x $s2,8($out)
+ x $s3,12($out)
+ st${g} $out,4*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_encrypt
+ l${g} $out,4*$SIZE_T($sp)
+ x $s0,`$tweak+0`($sp) # ^=tweak
+ x $s1,`$tweak+4`($sp)
+ x $s2,`$tweak+8`($sp)
+ x $s3,`$tweak+12`($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+
+.Lxts_enc_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$twesk+8($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_xts_encrypt,.-AES_xts_encrypt
+___
+# void AES_xts_decrypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+$code.=<<___;
+.globl AES_xts_decrypt
+.type AES_xts_decrypt,\@function
+.align 16
+AES_xts_decrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+ llgfr $len,$len
+___
+$code.=<<___;
+ st${g} $len,1*$SIZE_T($sp) # save copy of $len
+ aghi $len,-16
+ bcr 4,$ra # abort if less than zero. formally
+ # wrong, because $len is unsigned,
+ # but who can afford asking to
+ # process more than 2^63-1 bytes?
+ tmll $len,0x0f
+ jnz .Lxts_dec_proceed
+ aghi $len,16
+.Lxts_dec_proceed:
+___
+$code.=<<___ if (!$softonly);
+ llgf %r0,240($key2)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lxts_dec_software
+
+ st${g} $ra,5*$SIZE_T($sp)
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ nill $len,0xfff0 # $len&=~15
+ slgr $out,$inp
+
+ # generate the tweak value
+ l${g} $s3,$stdframe($sp) # pointer to iv
+ la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
+ lghi $s3,16
+ stmg $s0,$s1,0($s2)
+ la %r1,0($key2) # $key2 is not needed past this point
+ .long 0xb92e00aa # km $s2,$s2, generate the tweak
+ brc 1,.-4 # can this happen?
+
+ l %r0,240($key1)
+ la %r1,0($key1) # $key1 is not needed anymore
+
+ ltgr $len,$len
+ jz .Lxts_dec_km_short
+ bras $ra,_s390x_xts_km
+ jz .Lxts_dec_km_done
+
+ lrvgr $s2,$s0 # make copy in reverse byte order
+ lrvgr $s3,$s1
+ j .Lxts_dec_km_2ndtweak
+
+.Lxts_dec_km_short:
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%=16
+ lrvg $s0,$tweak+0($sp) # load the tweak
+ lrvg $s1,$tweak+8($sp)
+ lrvgr $s2,$s0 # make copy in reverse byte order
+ lrvgr $s3,$s1
+
+.Lxts_dec_km_2ndtweak:
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+
+ xg $i1,0($inp)
+ xg $i2,8($inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $i2,0($out,$inp)
+ lghi $i3,16
+ .long 0xb92e0066 # km $i2,$i2
+ brc 1,.-4 # can this happen?
+ lrvgr $i1,$s0
+ lrvgr $i2,$s1
+ xg $i1,0($out,$inp)
+ xg $i2,8($out,$inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_dec_km_steal:
+ llgc $i1,16($inp)
+ llgc $i2,0($out,$inp)
+ stc $i1,0($out,$inp)
+ stc $i2,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_dec_km_steal
+
+ lgr $s0,$s2
+ lgr $s1,$s3
+ xg $s0,0($i3)
+ xg $s1,8($i3)
+ stg $s0,0($i3)
+ stg $s1,8($i3)
+ la $s0,0($i3)
+ lghi $s1,16
+ .long 0xb92e0088 # km $s0,$s0
+ brc 1,.-4 # can this happen?
+ xg $s2,0($i3)
+ xg $s3,8($i3)
+ stg $s2,0($i3)
+ stg $s3,8($i3)
+.Lxts_dec_km_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lxts_dec_software:
+___
+$code.=<<___;
+ stm${g} %r6,$ra,6*$SIZE_T($sp)
+
+ srlg $len,$len,4
+ slgr $out,$inp
+
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
+ stm${g} %r2,%r5,2*$SIZE_T($sp)
+ la $key,0($key2)
+ larl $tbl,AES_Te
+ bras $ra,_s390x_AES_encrypt # generate the tweak
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ larl $tbl,AES_Td
+ lt${g}r $len,$len
+ stm $s0,$s3,$tweak($sp) # save the tweak
+ jz .Lxts_dec_short
+ j .Lxts_dec_enter
+
+.align 16
+.Lxts_dec_loop:
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+.Lxts_dec_enter:
+ x $s0,0($inp) # tweak^=*(inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+ la $inp,16($inp)
+ brct${g} $len,.Lxts_dec_loop
+
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ jz .Lxts_dec_done
+
+ # generate pair of tweaks...
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $i2,$s1 # flip byte order
+ lrvgr $i3,$s3
+ stmg $i2,$i3,$tweak($sp) # save the 1st tweak
+ j .Lxts_dec_2ndtweak
+
+.align 16
+.Lxts_dec_short:
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+.Lxts_dec_2ndtweak:
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak-16+0($sp) # save the 2nd tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak-16+8($sp)
+ llgfr $s3,$s3
+
+ x $s0,0($inp) # tweak_the_2nd^=*(inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
+ x $s1,$tweak-16+4($sp)
+ x $s2,$tweak-16+8($sp)
+ x $s3,$tweak-16+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_dec_steal:
+ llgc %r0,16($inp)
+ llgc %r1,0($out,$inp)
+ stc %r0,0($out,$inp)
+ stc %r1,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_dec_steal
+ la $out,0($i3) # restore real $out
+
+ lm $s0,$s3,$tweak($sp) # load the 1st tweak
+ x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
+ x $s1,4($out)
+ x $s2,8($out)
+ x $s3,12($out)
+ st${g} $out,4*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ l${g} $out,4*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+ stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
+ stg $sp,$tweak-16+8($sp)
+.Lxts_dec_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$twesk+8($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_xts_decrypt,.-AES_xts_decrypt
+___
+}
+$code.=<<___;
+.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT; # force flush
diff --git a/openssl-1.1.0h/crypto/aes/asm/aes-sparcv9.pl b/openssl-1.1.0h/crypto/aes/asm/aes-sparcv9.pl
new file mode 100755
index 0000000..883fae8
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aes-sparcv9.pl
@@ -0,0 +1,1192 @@
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+#
+# Version 1.1
+#
+# The major reason for undertaken effort was to mitigate the hazard of
+# cache-timing attack. This is [currently and initially!] addressed in
+# two ways. 1. S-boxes are compressed from 5KB to 2KB+256B size each.
+# 2. References to them are scheduled for L2 cache latency, meaning
+# that the tables don't have to reside in L1 cache. Once again, this
+# is an initial draft and one should expect more countermeasures to
+# be implemented...
+#
+# Version 1.1 prefetches T[ed]4 in order to mitigate attack on last
+# round.
+#
+# Even though performance was not the primary goal [on the contrary,
+# extra shifts "induced" by compressed S-box and longer loop epilogue
+# "induced" by scheduling for L2 have negative effect on performance],
+# the code turned out to run in ~23 cycles per processed byte en-/
+# decrypted with 128-bit key. This is pretty good result for code
+# with mentioned qualities and UltraSPARC core. Compared to Sun C
+# generated code my encrypt procedure runs just few percents faster,
+# while decrypt one - whole 50% faster [yes, Sun C failed to generate
+# optimal decrypt procedure]. Compared to GNU C generated code both
+# procedures are more than 60% faster:-)
+
+$output = pop;
+open STDOUT,">$output";
+
+$frame="STACK_FRAME";
+$bias="STACK_BIAS";
+$locals=16;
+
+$acc0="%l0";
+$acc1="%o0";
+$acc2="%o1";
+$acc3="%o2";
+
+$acc4="%l1";
+$acc5="%o3";
+$acc6="%o4";
+$acc7="%o5";
+
+$acc8="%l2";
+$acc9="%o7";
+$acc10="%g1";
+$acc11="%g2";
+
+$acc12="%l3";
+$acc13="%g3";
+$acc14="%g4";
+$acc15="%g5";
+
+$t0="%l4";
+$t1="%l5";
+$t2="%l6";
+$t3="%l7";
+
+$s0="%i0";
+$s1="%i1";
+$s2="%i2";
+$s3="%i3";
+$tbl="%i4";
+$key="%i5";
+$rounds="%i7"; # aliases with return address, which is off-loaded to stack
+
+sub _data_word()
+{ my $i;
+ while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
+}
+
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef __arch64__
+.register %g2,#scratch
+.register %g3,#scratch
+#endif
+.section ".text",#alloc,#execinstr
+
+.align 256
+AES_Te:
+___
+&_data_word(
+ 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
+ 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
+ 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
+ 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
+ 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
+ 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
+ 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
+ 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
+ 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
+ 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
+ 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
+ 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
+ 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
+ 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
+ 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
+ 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
+ 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
+ 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
+ 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
+ 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
+ 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
+ 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
+ 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
+ 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
+ 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
+ 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
+ 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
+ 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
+ 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
+ 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
+ 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
+ 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
+ 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
+ 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
+ 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
+ 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
+ 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
+ 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
+ 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
+ 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
+ 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
+ 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
+ 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
+ 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
+ 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
+ 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
+ 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
+ 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
+ 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
+ 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
+ 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
+ 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
+ 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
+ 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
+ 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
+ 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
+ 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
+ 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
+ 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
+ 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
+ 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
+ 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
+ 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
+ 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
+$code.=<<___;
+ .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+ .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+ .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+ .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+ .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+ .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+ .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+ .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+ .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+ .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+ .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+ .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+ .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+ .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+ .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+ .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+ .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+ .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+ .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+ .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+ .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+ .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+ .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+ .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+ .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+ .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+ .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+ .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+ .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+ .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+ .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+ .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+.type AES_Te,#object
+.size AES_Te,(.-AES_Te)
+
+.align 64
+.skip 16
+_sparcv9_AES_encrypt:
+ save %sp,-$frame-$locals,%sp
+ stx %i7,[%sp+$bias+$frame+0] ! off-load return address
+ ld [$key+240],$rounds
+ ld [$key+0],$t0
+ ld [$key+4],$t1 !
+ ld [$key+8],$t2
+ srl $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ld [$key+12],$t3
+ srl $s0,21,$acc0
+ xor $t1,$s1,$s1
+ ld [$key+16],$t0
+ srl $s1,13,$acc1 !
+ xor $t2,$s2,$s2
+ ld [$key+20],$t1
+ xor $t3,$s3,$s3
+ ld [$key+24],$t2
+ and $acc0,2040,$acc0
+ ld [$key+28],$t3
+ nop
+.Lenc_loop:
+ srl $s2,5,$acc2 !
+ and $acc1,2040,$acc1
+ ldx [$tbl+$acc0],$acc0
+ sll $s3,3,$acc3
+ and $acc2,2040,$acc2
+ ldx [$tbl+$acc1],$acc1
+ srl $s1,21,$acc4
+ and $acc3,2040,$acc3
+ ldx [$tbl+$acc2],$acc2 !
+ srl $s2,13,$acc5
+ and $acc4,2040,$acc4
+ ldx [$tbl+$acc3],$acc3
+ srl $s3,5,$acc6
+ and $acc5,2040,$acc5
+ ldx [$tbl+$acc4],$acc4
+ fmovs %f0,%f0
+ sll $s0,3,$acc7 !
+ and $acc6,2040,$acc6
+ ldx [$tbl+$acc5],$acc5
+ srl $s2,21,$acc8
+ and $acc7,2040,$acc7
+ ldx [$tbl+$acc6],$acc6
+ srl $s3,13,$acc9
+ and $acc8,2040,$acc8
+ ldx [$tbl+$acc7],$acc7 !
+ srl $s0,5,$acc10
+ and $acc9,2040,$acc9
+ ldx [$tbl+$acc8],$acc8
+ sll $s1,3,$acc11
+ and $acc10,2040,$acc10
+ ldx [$tbl+$acc9],$acc9
+ fmovs %f0,%f0
+ srl $s3,21,$acc12 !
+ and $acc11,2040,$acc11
+ ldx [$tbl+$acc10],$acc10
+ srl $s0,13,$acc13
+ and $acc12,2040,$acc12
+ ldx [$tbl+$acc11],$acc11
+ srl $s1,5,$acc14
+ and $acc13,2040,$acc13
+ ldx [$tbl+$acc12],$acc12 !
+ sll $s2,3,$acc15
+ and $acc14,2040,$acc14
+ ldx [$tbl+$acc13],$acc13
+ and $acc15,2040,$acc15
+ add $key,32,$key
+ ldx [$tbl+$acc14],$acc14
+ fmovs %f0,%f0
+ subcc $rounds,1,$rounds !
+ ldx [$tbl+$acc15],$acc15
+ bz,a,pn %icc,.Lenc_last
+ add $tbl,2048,$rounds
+
+ srlx $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ld [$key+0],$s0
+ fmovs %f0,%f0
+ srlx $acc2,16,$acc2 !
+ xor $acc1,$t0,$t0
+ ld [$key+4],$s1
+ srlx $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ld [$key+8],$s2
+ srlx $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ld [$key+12],$s3 !
+ srlx $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ fmovs %f0,%f0
+ srlx $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ srlx $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ srlx $acc10,16,$acc10 !
+ xor $acc7,$t1,$t1
+ srlx $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ srlx $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ srlx $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ srlx $acc15,24,$acc15 !
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ srl $t0,21,$acc0
+ xor $acc14,$t3,$t3
+ srl $t1,13,$acc1
+ xor $acc15,$t3,$t3
+
+ and $acc0,2040,$acc0 !
+ srl $t2,5,$acc2
+ and $acc1,2040,$acc1
+ ldx [$tbl+$acc0],$acc0
+ sll $t3,3,$acc3
+ and $acc2,2040,$acc2
+ ldx [$tbl+$acc1],$acc1
+ fmovs %f0,%f0
+ srl $t1,21,$acc4 !
+ and $acc3,2040,$acc3
+ ldx [$tbl+$acc2],$acc2
+ srl $t2,13,$acc5
+ and $acc4,2040,$acc4
+ ldx [$tbl+$acc3],$acc3
+ srl $t3,5,$acc6
+ and $acc5,2040,$acc5
+ ldx [$tbl+$acc4],$acc4 !
+ sll $t0,3,$acc7
+ and $acc6,2040,$acc6
+ ldx [$tbl+$acc5],$acc5
+ srl $t2,21,$acc8
+ and $acc7,2040,$acc7
+ ldx [$tbl+$acc6],$acc6
+ fmovs %f0,%f0
+ srl $t3,13,$acc9 !
+ and $acc8,2040,$acc8
+ ldx [$tbl+$acc7],$acc7
+ srl $t0,5,$acc10
+ and $acc9,2040,$acc9
+ ldx [$tbl+$acc8],$acc8
+ sll $t1,3,$acc11
+ and $acc10,2040,$acc10
+ ldx [$tbl+$acc9],$acc9 !
+ srl $t3,21,$acc12
+ and $acc11,2040,$acc11
+ ldx [$tbl+$acc10],$acc10
+ srl $t0,13,$acc13
+ and $acc12,2040,$acc12
+ ldx [$tbl+$acc11],$acc11
+ fmovs %f0,%f0
+ srl $t1,5,$acc14 !
+ and $acc13,2040,$acc13
+ ldx [$tbl+$acc12],$acc12
+ sll $t2,3,$acc15
+ and $acc14,2040,$acc14
+ ldx [$tbl+$acc13],$acc13
+ srlx $acc1,8,$acc1
+ and $acc15,2040,$acc15
+ ldx [$tbl+$acc14],$acc14 !
+
+ srlx $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldx [$tbl+$acc15],$acc15
+ srlx $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ld [$key+16],$t0
+ fmovs %f0,%f0
+ srlx $acc5,8,$acc5 !
+ xor $acc2,$s0,$s0
+ ld [$key+20],$t1
+ srlx $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ld [$key+24],$t2
+ srlx $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ld [$key+28],$t3 !
+ srlx $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldx [$tbl+2048+0],%g0 ! prefetch te4
+ srlx $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldx [$tbl+2048+32],%g0 ! prefetch te4
+ srlx $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldx [$tbl+2048+64],%g0 ! prefetch te4
+ srlx $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldx [$tbl+2048+96],%g0 ! prefetch te4
+ srlx $acc14,16,$acc14 !
+ xor $acc9,$s2,$s2
+ ldx [$tbl+2048+128],%g0 ! prefetch te4
+ srlx $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldx [$tbl+2048+160],%g0 ! prefetch te4
+ srl $s0,21,$acc0
+ xor $acc11,$s2,$s2
+ ldx [$tbl+2048+192],%g0 ! prefetch te4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldx [$tbl+2048+224],%g0 ! prefetch te4
+ srl $s1,13,$acc1 !
+ xor $acc14,$s3,$s3
+ xor $acc15,$s3,$s3
+ ba .Lenc_loop
+ and $acc0,2040,$acc0
+
+.align 32
+.Lenc_last:
+ srlx $acc1,8,$acc1 !
+ xor $acc0,$t0,$t0
+ ld [$key+0],$s0
+ srlx $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ld [$key+4],$s1
+ srlx $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ld [$key+8],$s2 !
+ srlx $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ld [$key+12],$s3
+ srlx $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ srlx $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ srlx $acc9,8,$acc9 !
+ xor $acc6,$t1,$t1
+ srlx $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ srlx $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ srlx $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ srlx $acc14,16,$acc14 !
+ xor $acc10,$t2,$t2
+ srlx $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ srl $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ srl $t1,16,$acc1 !
+ xor $acc15,$t3,$t3
+
+ srl $t2,8,$acc2
+ and $acc1,255,$acc1
+ ldub [$rounds+$acc0],$acc0
+ srl $t1,24,$acc4
+ and $acc2,255,$acc2
+ ldub [$rounds+$acc1],$acc1
+ srl $t2,16,$acc5 !
+ and $t3,255,$acc3
+ ldub [$rounds+$acc2],$acc2
+ ldub [$rounds+$acc3],$acc3
+ srl $t3,8,$acc6
+ and $acc5,255,$acc5
+ ldub [$rounds+$acc4],$acc4
+ fmovs %f0,%f0
+ srl $t2,24,$acc8 !
+ and $acc6,255,$acc6
+ ldub [$rounds+$acc5],$acc5
+ srl $t3,16,$acc9
+ and $t0,255,$acc7
+ ldub [$rounds+$acc6],$acc6
+ ldub [$rounds+$acc7],$acc7
+ fmovs %f0,%f0
+ srl $t0,8,$acc10 !
+ and $acc9,255,$acc9
+ ldub [$rounds+$acc8],$acc8
+ srl $t3,24,$acc12
+ and $acc10,255,$acc10
+ ldub [$rounds+$acc9],$acc9
+ srl $t0,16,$acc13
+ and $t1,255,$acc11
+ ldub [$rounds+$acc10],$acc10 !
+ srl $t1,8,$acc14
+ and $acc13,255,$acc13
+ ldub [$rounds+$acc11],$acc11
+ ldub [$rounds+$acc12],$acc12
+ and $acc14,255,$acc14
+ ldub [$rounds+$acc13],$acc13
+ and $t2,255,$acc15
+ ldub [$rounds+$acc14],$acc14 !
+
+ sll $acc0,24,$acc0
+ xor $acc3,$s0,$s0
+ ldub [$rounds+$acc15],$acc15
+ sll $acc1,16,$acc1
+ xor $acc0,$s0,$s0
+ ldx [%sp+$bias+$frame+0],%i7 ! restore return address
+ fmovs %f0,%f0
+ sll $acc2,8,$acc2 !
+ xor $acc1,$s0,$s0
+ sll $acc4,24,$acc4
+ xor $acc2,$s0,$s0
+ sll $acc5,16,$acc5
+ xor $acc7,$s1,$s1
+ sll $acc6,8,$acc6
+ xor $acc4,$s1,$s1
+ sll $acc8,24,$acc8 !
+ xor $acc5,$s1,$s1
+ sll $acc9,16,$acc9
+ xor $acc11,$s2,$s2
+ sll $acc10,8,$acc10
+ xor $acc6,$s1,$s1
+ sll $acc12,24,$acc12
+ xor $acc8,$s2,$s2
+ sll $acc13,16,$acc13 !
+ xor $acc9,$s2,$s2
+ sll $acc14,8,$acc14
+ xor $acc10,$s2,$s2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ xor $acc14,$s3,$s3
+ xor $acc15,$s3,$s3
+
+ ret
+ restore
+.type _sparcv9_AES_encrypt,#function
+.size _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
+
+.align 32
+.globl AES_encrypt
+AES_encrypt:
+ or %o0,%o1,%g1
+ andcc %g1,3,%g0
+ bnz,pn %xcc,.Lunaligned_enc
+ save %sp,-$frame,%sp
+
+ ld [%i0+0],%o0
+ ld [%i0+4],%o1
+ ld [%i0+8],%o2
+ ld [%i0+12],%o3
+
+1: call .+8
+ add %o7,AES_Te-1b,%o4
+ call _sparcv9_AES_encrypt
+ mov %i2,%o5
+
+ st %o0,[%i1+0]
+ st %o1,[%i1+4]
+ st %o2,[%i1+8]
+ st %o3,[%i1+12]
+
+ ret
+ restore
+
+.align 32
+.Lunaligned_enc:
+ ldub [%i0+0],%l0
+ ldub [%i0+1],%l1
+ ldub [%i0+2],%l2
+
+ sll %l0,24,%l0
+ ldub [%i0+3],%l3
+ sll %l1,16,%l1
+ ldub [%i0+4],%l4
+ sll %l2,8,%l2
+ or %l1,%l0,%l0
+ ldub [%i0+5],%l5
+ sll %l4,24,%l4
+ or %l3,%l2,%l2
+ ldub [%i0+6],%l6
+ sll %l5,16,%l5
+ or %l0,%l2,%o0
+ ldub [%i0+7],%l7
+
+ sll %l6,8,%l6
+ or %l5,%l4,%l4
+ ldub [%i0+8],%l0
+ or %l7,%l6,%l6
+ ldub [%i0+9],%l1
+ or %l4,%l6,%o1
+ ldub [%i0+10],%l2
+
+ sll %l0,24,%l0
+ ldub [%i0+11],%l3
+ sll %l1,16,%l1
+ ldub [%i0+12],%l4
+ sll %l2,8,%l2
+ or %l1,%l0,%l0
+ ldub [%i0+13],%l5
+ sll %l4,24,%l4
+ or %l3,%l2,%l2
+ ldub [%i0+14],%l6
+ sll %l5,16,%l5
+ or %l0,%l2,%o2
+ ldub [%i0+15],%l7
+
+ sll %l6,8,%l6
+ or %l5,%l4,%l4
+ or %l7,%l6,%l6
+ or %l4,%l6,%o3
+
+1: call .+8
+ add %o7,AES_Te-1b,%o4
+ call _sparcv9_AES_encrypt
+ mov %i2,%o5
+
+ srl %o0,24,%l0
+ srl %o0,16,%l1
+ stb %l0,[%i1+0]
+ srl %o0,8,%l2
+ stb %l1,[%i1+1]
+ stb %l2,[%i1+2]
+ srl %o1,24,%l4
+ stb %o0,[%i1+3]
+
+ srl %o1,16,%l5
+ stb %l4,[%i1+4]
+ srl %o1,8,%l6
+ stb %l5,[%i1+5]
+ stb %l6,[%i1+6]
+ srl %o2,24,%l0
+ stb %o1,[%i1+7]
+
+ srl %o2,16,%l1
+ stb %l0,[%i1+8]
+ srl %o2,8,%l2
+ stb %l1,[%i1+9]
+ stb %l2,[%i1+10]
+ srl %o3,24,%l4
+ stb %o2,[%i1+11]
+
+ srl %o3,16,%l5
+ stb %l4,[%i1+12]
+ srl %o3,8,%l6
+ stb %l5,[%i1+13]
+ stb %l6,[%i1+14]
+ stb %o3,[%i1+15]
+
+ ret
+ restore
+.type AES_encrypt,#function
+.size AES_encrypt,(.-AES_encrypt)
+
+___
+
+$code.=<<___;
+.align 256
+AES_Td:
+___
+&_data_word(
+ 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
+ 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
+ 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
+ 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
+ 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
+ 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
+ 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
+ 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
+ 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
+ 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
+ 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
+ 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
+ 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
+ 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
+ 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
+ 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
+ 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
+ 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
+ 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
+ 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
+ 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
+ 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
+ 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
+ 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
+ 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
+ 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
+ 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
+ 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
+ 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
+ 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
+ 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
+ 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
+ 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
+ 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
+ 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
+ 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
+ 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
+ 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
+ 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
+ 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
+ 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
+ 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
+ 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
+ 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
+ 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
+ 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
+ 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
+ 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
+ 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
+ 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
+ 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
+ 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
+ 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
+ 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
+ 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
+ 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
+ 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
+ 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
+ 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
+ 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
+ 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
+ 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
+ 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
+ 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
+$code.=<<___;
+ .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+.type AES_Td,#object
+.size AES_Td,(.-AES_Td)
+
+.align 64
+.skip 16
+_sparcv9_AES_decrypt:
+ save %sp,-$frame-$locals,%sp
+ stx %i7,[%sp+$bias+$frame+0] ! off-load return address
+ ld [$key+240],$rounds
+ ld [$key+0],$t0
+ ld [$key+4],$t1 !
+ ld [$key+8],$t2
+ ld [$key+12],$t3
+ srl $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ld [$key+16],$t0
+ xor $t1,$s1,$s1
+ ld [$key+20],$t1
+ srl $s0,21,$acc0 !
+ xor $t2,$s2,$s2
+ ld [$key+24],$t2
+ xor $t3,$s3,$s3
+ and $acc0,2040,$acc0
+ ld [$key+28],$t3
+ srl $s3,13,$acc1
+ nop
+.Ldec_loop:
+ srl $s2,5,$acc2 !
+ and $acc1,2040,$acc1
+ ldx [$tbl+$acc0],$acc0
+ sll $s1,3,$acc3
+ and $acc2,2040,$acc2
+ ldx [$tbl+$acc1],$acc1
+ srl $s1,21,$acc4
+ and $acc3,2040,$acc3
+ ldx [$tbl+$acc2],$acc2 !
+ srl $s0,13,$acc5
+ and $acc4,2040,$acc4
+ ldx [$tbl+$acc3],$acc3
+ srl $s3,5,$acc6
+ and $acc5,2040,$acc5
+ ldx [$tbl+$acc4],$acc4
+ fmovs %f0,%f0
+ sll $s2,3,$acc7 !
+ and $acc6,2040,$acc6
+ ldx [$tbl+$acc5],$acc5
+ srl $s2,21,$acc8
+ and $acc7,2040,$acc7
+ ldx [$tbl+$acc6],$acc6
+ srl $s1,13,$acc9
+ and $acc8,2040,$acc8
+ ldx [$tbl+$acc7],$acc7 !
+ srl $s0,5,$acc10
+ and $acc9,2040,$acc9
+ ldx [$tbl+$acc8],$acc8
+ sll $s3,3,$acc11
+ and $acc10,2040,$acc10
+ ldx [$tbl+$acc9],$acc9
+ fmovs %f0,%f0
+ srl $s3,21,$acc12 !
+ and $acc11,2040,$acc11
+ ldx [$tbl+$acc10],$acc10
+ srl $s2,13,$acc13
+ and $acc12,2040,$acc12
+ ldx [$tbl+$acc11],$acc11
+ srl $s1,5,$acc14
+ and $acc13,2040,$acc13
+ ldx [$tbl+$acc12],$acc12 !
+ sll $s0,3,$acc15
+ and $acc14,2040,$acc14
+ ldx [$tbl+$acc13],$acc13
+ and $acc15,2040,$acc15
+ add $key,32,$key
+ ldx [$tbl+$acc14],$acc14
+ fmovs %f0,%f0
+ subcc $rounds,1,$rounds !
+ ldx [$tbl+$acc15],$acc15
+ bz,a,pn %icc,.Ldec_last
+ add $tbl,2048,$rounds
+
+ srlx $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ld [$key+0],$s0
+ fmovs %f0,%f0
+ srlx $acc2,16,$acc2 !
+ xor $acc1,$t0,$t0
+ ld [$key+4],$s1
+ srlx $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ld [$key+8],$s2
+ srlx $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ld [$key+12],$s3 !
+ srlx $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ fmovs %f0,%f0
+ srlx $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ srlx $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ srlx $acc10,16,$acc10 !
+ xor $acc7,$t1,$t1
+ srlx $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ srlx $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ srlx $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ srlx $acc15,24,$acc15 !
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ srl $t0,21,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3
+ srl $t3,13,$acc1
+
+ and $acc0,2040,$acc0 !
+ srl $t2,5,$acc2
+ and $acc1,2040,$acc1
+ ldx [$tbl+$acc0],$acc0
+ sll $t1,3,$acc3
+ and $acc2,2040,$acc2
+ ldx [$tbl+$acc1],$acc1
+ fmovs %f0,%f0
+ srl $t1,21,$acc4 !
+ and $acc3,2040,$acc3
+ ldx [$tbl+$acc2],$acc2
+ srl $t0,13,$acc5
+ and $acc4,2040,$acc4
+ ldx [$tbl+$acc3],$acc3
+ srl $t3,5,$acc6
+ and $acc5,2040,$acc5
+ ldx [$tbl+$acc4],$acc4 !
+ sll $t2,3,$acc7
+ and $acc6,2040,$acc6
+ ldx [$tbl+$acc5],$acc5
+ srl $t2,21,$acc8
+ and $acc7,2040,$acc7
+ ldx [$tbl+$acc6],$acc6
+ fmovs %f0,%f0
+ srl $t1,13,$acc9 !
+ and $acc8,2040,$acc8
+ ldx [$tbl+$acc7],$acc7
+ srl $t0,5,$acc10
+ and $acc9,2040,$acc9
+ ldx [$tbl+$acc8],$acc8
+ sll $t3,3,$acc11
+ and $acc10,2040,$acc10
+ ldx [$tbl+$acc9],$acc9 !
+ srl $t3,21,$acc12
+ and $acc11,2040,$acc11
+ ldx [$tbl+$acc10],$acc10
+ srl $t2,13,$acc13
+ and $acc12,2040,$acc12
+ ldx [$tbl+$acc11],$acc11
+ fmovs %f0,%f0
+ srl $t1,5,$acc14 !
+ and $acc13,2040,$acc13
+ ldx [$tbl+$acc12],$acc12
+ sll $t0,3,$acc15
+ and $acc14,2040,$acc14
+ ldx [$tbl+$acc13],$acc13
+ srlx $acc1,8,$acc1
+ and $acc15,2040,$acc15
+ ldx [$tbl+$acc14],$acc14 !
+
+ srlx $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldx [$tbl+$acc15],$acc15
+ srlx $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ld [$key+16],$t0
+ fmovs %f0,%f0
+ srlx $acc5,8,$acc5 !
+ xor $acc2,$s0,$s0
+ ld [$key+20],$t1
+ srlx $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ld [$key+24],$t2
+ srlx $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ld [$key+28],$t3 !
+ srlx $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldx [$tbl+2048+0],%g0 ! prefetch td4
+ srlx $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldx [$tbl+2048+32],%g0 ! prefetch td4
+ srlx $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldx [$tbl+2048+64],%g0 ! prefetch td4
+ srlx $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldx [$tbl+2048+96],%g0 ! prefetch td4
+ srlx $acc14,16,$acc14 !
+ xor $acc9,$s2,$s2
+ ldx [$tbl+2048+128],%g0 ! prefetch td4
+ srlx $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldx [$tbl+2048+160],%g0 ! prefetch td4
+ srl $s0,21,$acc0
+ xor $acc11,$s2,$s2
+ ldx [$tbl+2048+192],%g0 ! prefetch td4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldx [$tbl+2048+224],%g0 ! prefetch td4
+ and $acc0,2040,$acc0 !
+ xor $acc14,$s3,$s3
+ xor $acc15,$s3,$s3
+ ba .Ldec_loop
+ srl $s3,13,$acc1
+
+.align 32
+.Ldec_last:
+ srlx $acc1,8,$acc1 !
+ xor $acc0,$t0,$t0
+ ld [$key+0],$s0
+ srlx $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ld [$key+4],$s1
+ srlx $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ld [$key+8],$s2 !
+ srlx $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ld [$key+12],$s3
+ srlx $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ srlx $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ srlx $acc9,8,$acc9 !
+ xor $acc6,$t1,$t1
+ srlx $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ srlx $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ srlx $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ srlx $acc14,16,$acc14 !
+ xor $acc10,$t2,$t2
+ srlx $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ srl $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3 !
+ srl $t3,16,$acc1
+
+ srl $t2,8,$acc2
+ and $acc1,255,$acc1
+ ldub [$rounds+$acc0],$acc0
+ srl $t1,24,$acc4
+ and $acc2,255,$acc2
+ ldub [$rounds+$acc1],$acc1
+ srl $t0,16,$acc5 !
+ and $t1,255,$acc3
+ ldub [$rounds+$acc2],$acc2
+ ldub [$rounds+$acc3],$acc3
+ srl $t3,8,$acc6
+ and $acc5,255,$acc5
+ ldub [$rounds+$acc4],$acc4
+ fmovs %f0,%f0
+ srl $t2,24,$acc8 !
+ and $acc6,255,$acc6
+ ldub [$rounds+$acc5],$acc5
+ srl $t1,16,$acc9
+ and $t2,255,$acc7
+ ldub [$rounds+$acc6],$acc6
+ ldub [$rounds+$acc7],$acc7
+ fmovs %f0,%f0
+ srl $t0,8,$acc10 !
+ and $acc9,255,$acc9
+ ldub [$rounds+$acc8],$acc8
+ srl $t3,24,$acc12
+ and $acc10,255,$acc10
+ ldub [$rounds+$acc9],$acc9
+ srl $t2,16,$acc13
+ and $t3,255,$acc11
+ ldub [$rounds+$acc10],$acc10 !
+ srl $t1,8,$acc14
+ and $acc13,255,$acc13
+ ldub [$rounds+$acc11],$acc11
+ ldub [$rounds+$acc12],$acc12
+ and $acc14,255,$acc14
+ ldub [$rounds+$acc13],$acc13
+ and $t0,255,$acc15
+ ldub [$rounds+$acc14],$acc14 !
+
+ sll $acc0,24,$acc0
+ xor $acc3,$s0,$s0
+ ldub [$rounds+$acc15],$acc15
+ sll $acc1,16,$acc1
+ xor $acc0,$s0,$s0
+ ldx [%sp+$bias+$frame+0],%i7 ! restore return address
+ fmovs %f0,%f0
+ sll $acc2,8,$acc2 !
+ xor $acc1,$s0,$s0
+ sll $acc4,24,$acc4
+ xor $acc2,$s0,$s0
+ sll $acc5,16,$acc5
+ xor $acc7,$s1,$s1
+ sll $acc6,8,$acc6
+ xor $acc4,$s1,$s1
+ sll $acc8,24,$acc8 !
+ xor $acc5,$s1,$s1
+ sll $acc9,16,$acc9
+ xor $acc11,$s2,$s2
+ sll $acc10,8,$acc10
+ xor $acc6,$s1,$s1
+ sll $acc12,24,$acc12
+ xor $acc8,$s2,$s2
+ sll $acc13,16,$acc13 !
+ xor $acc9,$s2,$s2
+ sll $acc14,8,$acc14
+ xor $acc10,$s2,$s2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ xor $acc14,$s3,$s3
+ xor $acc15,$s3,$s3
+
+ ret
+ restore
+.type _sparcv9_AES_decrypt,#function
+.size _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
+
+.align 32
+.globl AES_decrypt
+AES_decrypt:
+ or %o0,%o1,%g1
+ andcc %g1,3,%g0
+ bnz,pn %xcc,.Lunaligned_dec
+ save %sp,-$frame,%sp
+
+ ld [%i0+0],%o0
+ ld [%i0+4],%o1
+ ld [%i0+8],%o2
+ ld [%i0+12],%o3
+
+1: call .+8
+ add %o7,AES_Td-1b,%o4
+ call _sparcv9_AES_decrypt
+ mov %i2,%o5
+
+ st %o0,[%i1+0]
+ st %o1,[%i1+4]
+ st %o2,[%i1+8]
+ st %o3,[%i1+12]
+
+ ret
+ restore
+
+.align 32
+.Lunaligned_dec:
+ ldub [%i0+0],%l0
+ ldub [%i0+1],%l1
+ ldub [%i0+2],%l2
+
+ sll %l0,24,%l0
+ ldub [%i0+3],%l3
+ sll %l1,16,%l1
+ ldub [%i0+4],%l4
+ sll %l2,8,%l2
+ or %l1,%l0,%l0
+ ldub [%i0+5],%l5
+ sll %l4,24,%l4
+ or %l3,%l2,%l2
+ ldub [%i0+6],%l6
+ sll %l5,16,%l5
+ or %l0,%l2,%o0
+ ldub [%i0+7],%l7
+
+ sll %l6,8,%l6
+ or %l5,%l4,%l4
+ ldub [%i0+8],%l0
+ or %l7,%l6,%l6
+ ldub [%i0+9],%l1
+ or %l4,%l6,%o1
+ ldub [%i0+10],%l2
+
+ sll %l0,24,%l0
+ ldub [%i0+11],%l3
+ sll %l1,16,%l1
+ ldub [%i0+12],%l4
+ sll %l2,8,%l2
+ or %l1,%l0,%l0
+ ldub [%i0+13],%l5
+ sll %l4,24,%l4
+ or %l3,%l2,%l2
+ ldub [%i0+14],%l6
+ sll %l5,16,%l5
+ or %l0,%l2,%o2
+ ldub [%i0+15],%l7
+
+ sll %l6,8,%l6
+ or %l5,%l4,%l4
+ or %l7,%l6,%l6
+ or %l4,%l6,%o3
+
+1: call .+8
+ add %o7,AES_Td-1b,%o4
+ call _sparcv9_AES_decrypt
+ mov %i2,%o5
+
+ srl %o0,24,%l0
+ srl %o0,16,%l1
+ stb %l0,[%i1+0]
+ srl %o0,8,%l2
+ stb %l1,[%i1+1]
+ stb %l2,[%i1+2]
+ srl %o1,24,%l4
+ stb %o0,[%i1+3]
+
+ srl %o1,16,%l5
+ stb %l4,[%i1+4]
+ srl %o1,8,%l6
+ stb %l5,[%i1+5]
+ stb %l6,[%i1+6]
+ srl %o2,24,%l0
+ stb %o1,[%i1+7]
+
+ srl %o2,16,%l1
+ stb %l0,[%i1+8]
+ srl %o2,8,%l2
+ stb %l1,[%i1+9]
+ stb %l2,[%i1+10]
+ srl %o3,24,%l4
+ stb %o2,[%i1+11]
+
+ srl %o3,16,%l5
+ stb %l4,[%i1+12]
+ srl %o3,8,%l6
+ stb %l5,[%i1+13]
+ stb %l6,[%i1+14]
+ stb %o3,[%i1+15]
+
+ ret
+ restore
+.type AES_decrypt,#function
+.size AES_decrypt,(.-AES_decrypt)
+___
+
+# fmovs instructions substituting for FP nops were originally added
+# to meet specific instruction alignment requirements to maximize ILP.
+# As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have
+# undesired effect, so just omit them and sacrifice some portion of
+# percent in performance...
+$code =~ s/fmovs.*$//gm;
+
+print $code;
+close STDOUT; # ensure flush
diff --git a/openssl-1.1.0h/crypto/aes/asm/aes-x86_64.pl b/openssl-1.1.0h/crypto/aes/asm/aes-x86_64.pl
new file mode 100755
index 0000000..ce4ca30
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aes-x86_64.pl
@@ -0,0 +1,2820 @@
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Version 2.1.
+#
+# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
+# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
+# [you'll notice a lot of resemblance], such as compressed S-boxes
+# in little-endian byte order, prefetch of these tables in CBC mode,
+# as well as avoiding L1 cache aliasing between stack frame and key
+# schedule and already mentioned tables, compressed Td4...
+#
+# Performance in number of cycles per processed byte for 128-bit key:
+#
+# ECB encrypt ECB decrypt CBC large chunk
+# AMD64 33 43 13.0
+# EM64T 38 56 18.6(*)
+# Core 2 30 42 14.5(*)
+# Atom 65 86 32.1(*)
+#
+# (*) with hyper-threading off
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$verticalspin=1; # unlike 32-bit version $verticalspin performs
+ # ~15% better on both AMD and Intel cores
+$speed_limit=512; # see aes-586.pl for details
+
+$code=".text\n";
+
+$s0="%eax";
+$s1="%ebx";
+$s2="%ecx";
+$s3="%edx";
+$acc0="%esi"; $mask80="%rsi";
+$acc1="%edi"; $maskfe="%rdi";
+$acc2="%ebp"; $mask1b="%rbp";
+$inp="%r8";
+$out="%r9";
+$t0="%r10d";
+$t1="%r11d";
+$t2="%r12d";
+$rnds="%r13d";
+$sbox="%r14";
+$key="%r15";
+
+sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
+sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
+ $r =~ s/%[er]([sd]i)/%\1l/;
+ $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
+sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
+ $r =~ s/%r([0-9]+)/%r\1d/; $r; }
+sub _data_word()
+{ my $i;
+ while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
+}
+sub data_word()
+{ my $i;
+ my $last=pop(@_);
+ $code.=".long\t";
+ while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
+ $code.=sprintf"0x%08x\n",$last;
+}
+
+sub data_byte()
+{ my $i;
+ my $last=pop(@_);
+ $code.=".byte\t";
+ while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
+ $code.=sprintf"0x%02x\n",$last&0xff;
+}
+
+sub encvert()
+{ my $t3="%r8d"; # zaps $inp!
+
+$code.=<<___;
+ # favor 3-way issue Opteron pipeline...
+ movzb `&lo("$s0")`,$acc0
+ movzb `&lo("$s1")`,$acc1
+ movzb `&lo("$s2")`,$acc2
+ mov 0($sbox,$acc0,8),$t0
+ mov 0($sbox,$acc1,8),$t1
+ mov 0($sbox,$acc2,8),$t2
+
+ movzb `&hi("$s1")`,$acc0
+ movzb `&hi("$s2")`,$acc1
+ movzb `&lo("$s3")`,$acc2
+ xor 3($sbox,$acc0,8),$t0
+ xor 3($sbox,$acc1,8),$t1
+ mov 0($sbox,$acc2,8),$t3
+
+ movzb `&hi("$s3")`,$acc0
+ shr \$16,$s2
+ movzb `&hi("$s0")`,$acc2
+ xor 3($sbox,$acc0,8),$t2
+ shr \$16,$s3
+ xor 3($sbox,$acc2,8),$t3
+
+ shr \$16,$s1
+ lea 16($key),$key
+ shr \$16,$s0
+
+ movzb `&lo("$s2")`,$acc0
+ movzb `&lo("$s3")`,$acc1
+ movzb `&lo("$s0")`,$acc2
+ xor 2($sbox,$acc0,8),$t0
+ xor 2($sbox,$acc1,8),$t1
+ xor 2($sbox,$acc2,8),$t2
+
+ movzb `&hi("$s3")`,$acc0
+ movzb `&hi("$s0")`,$acc1
+ movzb `&lo("$s1")`,$acc2
+ xor 1($sbox,$acc0,8),$t0
+ xor 1($sbox,$acc1,8),$t1
+ xor 2($sbox,$acc2,8),$t3
+
+ mov 12($key),$s3
+ movzb `&hi("$s1")`,$acc1
+ movzb `&hi("$s2")`,$acc2
+ mov 0($key),$s0
+ xor 1($sbox,$acc1,8),$t2
+ xor 1($sbox,$acc2,8),$t3
+
+ mov 4($key),$s1
+ mov 8($key),$s2
+ xor $t0,$s0
+ xor $t1,$s1
+ xor $t2,$s2
+ xor $t3,$s3
+___
+}
+
+sub enclastvert()
+{ my $t3="%r8d"; # zaps $inp!
+
+$code.=<<___;
+ movzb `&lo("$s0")`,$acc0
+ movzb `&lo("$s1")`,$acc1
+ movzb `&lo("$s2")`,$acc2
+ movzb 2($sbox,$acc0,8),$t0
+ movzb 2($sbox,$acc1,8),$t1
+ movzb 2($sbox,$acc2,8),$t2
+
+ movzb `&lo("$s3")`,$acc0
+ movzb `&hi("$s1")`,$acc1
+ movzb `&hi("$s2")`,$acc2
+ movzb 2($sbox,$acc0,8),$t3
+ mov 0($sbox,$acc1,8),$acc1 #$t0
+ mov 0($sbox,$acc2,8),$acc2 #$t1
+
+ and \$0x0000ff00,$acc1
+ and \$0x0000ff00,$acc2
+
+ xor $acc1,$t0
+ xor $acc2,$t1
+ shr \$16,$s2
+
+ movzb `&hi("$s3")`,$acc0
+ movzb `&hi("$s0")`,$acc1
+ shr \$16,$s3
+ mov 0($sbox,$acc0,8),$acc0 #$t2
+ mov 0($sbox,$acc1,8),$acc1 #$t3
+
+ and \$0x0000ff00,$acc0
+ and \$0x0000ff00,$acc1
+ shr \$16,$s1
+ xor $acc0,$t2
+ xor $acc1,$t3
+ shr \$16,$s0
+
+ movzb `&lo("$s2")`,$acc0
+ movzb `&lo("$s3")`,$acc1
+ movzb `&lo("$s0")`,$acc2
+ mov 0($sbox,$acc0,8),$acc0 #$t0
+ mov 0($sbox,$acc1,8),$acc1 #$t1
+ mov 0($sbox,$acc2,8),$acc2 #$t2
+
+ and \$0x00ff0000,$acc0
+ and \$0x00ff0000,$acc1
+ and \$0x00ff0000,$acc2
+
+ xor $acc0,$t0
+ xor $acc1,$t1
+ xor $acc2,$t2
+
+ movzb `&lo("$s1")`,$acc0
+ movzb `&hi("$s3")`,$acc1
+ movzb `&hi("$s0")`,$acc2
+ mov 0($sbox,$acc0,8),$acc0 #$t3
+ mov 2($sbox,$acc1,8),$acc1 #$t0
+ mov 2($sbox,$acc2,8),$acc2 #$t1
+
+ and \$0x00ff0000,$acc0
+ and \$0xff000000,$acc1
+ and \$0xff000000,$acc2
+
+ xor $acc0,$t3
+ xor $acc1,$t0
+ xor $acc2,$t1
+
+ movzb `&hi("$s1")`,$acc0
+ movzb `&hi("$s2")`,$acc1
+ mov 16+12($key),$s3
+ mov 2($sbox,$acc0,8),$acc0 #$t2
+ mov 2($sbox,$acc1,8),$acc1 #$t3
+ mov 16+0($key),$s0
+
+ and \$0xff000000,$acc0
+ and \$0xff000000,$acc1
+
+ xor $acc0,$t2
+ xor $acc1,$t3
+
+ mov 16+4($key),$s1
+ mov 16+8($key),$s2
+ xor $t0,$s0
+ xor $t1,$s1
+ xor $t2,$s2
+ xor $t3,$s3
+___
+}
+
+sub encstep()
+{ my ($i,@s) = @_;
+ my $tmp0=$acc0;
+ my $tmp1=$acc1;
+ my $tmp2=$acc2;
+ my $out=($t0,$t1,$t2,$s[0])[$i];
+
+ if ($i==3) {
+ $tmp0=$s[1];
+ $tmp1=$s[2];
+ $tmp2=$s[3];
+ }
+ $code.=" movzb ".&lo($s[0]).",$out\n";
+ $code.=" mov $s[2],$tmp1\n" if ($i!=3);
+ $code.=" lea 16($key),$key\n" if ($i==0);
+
+ $code.=" movzb ".&hi($s[1]).",$tmp0\n";
+ $code.=" mov 0($sbox,$out,8),$out\n";
+
+ $code.=" shr \$16,$tmp1\n";
+ $code.=" mov $s[3],$tmp2\n" if ($i!=3);
+ $code.=" xor 3($sbox,$tmp0,8),$out\n";
+
+ $code.=" movzb ".&lo($tmp1).",$tmp1\n";
+ $code.=" shr \$24,$tmp2\n";
+ $code.=" xor 4*$i($key),$out\n";
+
+ $code.=" xor 2($sbox,$tmp1,8),$out\n";
+ $code.=" xor 1($sbox,$tmp2,8),$out\n";
+
+ $code.=" mov $t0,$s[1]\n" if ($i==3);
+ $code.=" mov $t1,$s[2]\n" if ($i==3);
+ $code.=" mov $t2,$s[3]\n" if ($i==3);
+ $code.="\n";
+}
+
+sub enclast()
+{ my ($i,@s)=@_;
+ my $tmp0=$acc0;
+ my $tmp1=$acc1;
+ my $tmp2=$acc2;
+ my $out=($t0,$t1,$t2,$s[0])[$i];
+
+ if ($i==3) {
+ $tmp0=$s[1];
+ $tmp1=$s[2];
+ $tmp2=$s[3];
+ }
+ $code.=" movzb ".&lo($s[0]).",$out\n";
+ $code.=" mov $s[2],$tmp1\n" if ($i!=3);
+
+ $code.=" mov 2($sbox,$out,8),$out\n";
+ $code.=" shr \$16,$tmp1\n";
+ $code.=" mov $s[3],$tmp2\n" if ($i!=3);
+
+ $code.=" and \$0x000000ff,$out\n";
+ $code.=" movzb ".&hi($s[1]).",$tmp0\n";
+ $code.=" movzb ".&lo($tmp1).",$tmp1\n";
+ $code.=" shr \$24,$tmp2\n";
+
+ $code.=" mov 0($sbox,$tmp0,8),$tmp0\n";
+ $code.=" mov 0($sbox,$tmp1,8),$tmp1\n";
+ $code.=" mov 2($sbox,$tmp2,8),$tmp2\n";
+
+ $code.=" and \$0x0000ff00,$tmp0\n";
+ $code.=" and \$0x00ff0000,$tmp1\n";
+ $code.=" and \$0xff000000,$tmp2\n";
+
+ $code.=" xor $tmp0,$out\n";
+ $code.=" mov $t0,$s[1]\n" if ($i==3);
+ $code.=" xor $tmp1,$out\n";
+ $code.=" mov $t1,$s[2]\n" if ($i==3);
+ $code.=" xor $tmp2,$out\n";
+ $code.=" mov $t2,$s[3]\n" if ($i==3);
+ $code.="\n";
+}
+
+$code.=<<___;
+.type _x86_64_AES_encrypt,\@abi-omnipotent
+.align 16
+_x86_64_AES_encrypt:
+ xor 0($key),$s0 # xor with key
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+
+ mov 240($key),$rnds # load key->rounds
+ sub \$1,$rnds
+ jmp .Lenc_loop
+.align 16
+.Lenc_loop:
+___
+ if ($verticalspin) { &encvert(); }
+ else { &encstep(0,$s0,$s1,$s2,$s3);
+ &encstep(1,$s1,$s2,$s3,$s0);
+ &encstep(2,$s2,$s3,$s0,$s1);
+ &encstep(3,$s3,$s0,$s1,$s2);
+ }
+$code.=<<___;
+ sub \$1,$rnds
+ jnz .Lenc_loop
+___
+ if ($verticalspin) { &enclastvert(); }
+ else { &enclast(0,$s0,$s1,$s2,$s3);
+ &enclast(1,$s1,$s2,$s3,$s0);
+ &enclast(2,$s2,$s3,$s0,$s1);
+ &enclast(3,$s3,$s0,$s1,$s2);
+ $code.=<<___;
+ xor 16+0($key),$s0 # xor with key
+ xor 16+4($key),$s1
+ xor 16+8($key),$s2
+ xor 16+12($key),$s3
+___
+ }
+$code.=<<___;
+ .byte 0xf3,0xc3 # rep ret
+.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
+___
+
+# it's possible to implement this by shifting tN by 8, filling least
+# significant byte with byte load and finally bswap-ing at the end,
+# but such partial register load kills Core 2...
+sub enccompactvert()
+{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
+
+$code.=<<___;
+ movzb `&lo("$s0")`,$t0
+ movzb `&lo("$s1")`,$t1
+ movzb `&lo("$s2")`,$t2
+ movzb `&lo("$s3")`,$t3
+ movzb `&hi("$s1")`,$acc0
+ movzb `&hi("$s2")`,$acc1
+ shr \$16,$s2
+ movzb `&hi("$s3")`,$acc2
+ movzb ($sbox,$t0,1),$t0
+ movzb ($sbox,$t1,1),$t1
+ movzb ($sbox,$t2,1),$t2
+ movzb ($sbox,$t3,1),$t3
+
+ movzb ($sbox,$acc0,1),$t4 #$t0
+ movzb `&hi("$s0")`,$acc0
+ movzb ($sbox,$acc1,1),$t5 #$t1
+ movzb `&lo("$s2")`,$acc1
+ movzb ($sbox,$acc2,1),$acc2 #$t2
+ movzb ($sbox,$acc0,1),$acc0 #$t3
+
+ shl \$8,$t4
+ shr \$16,$s3
+ shl \$8,$t5
+ xor $t4,$t0
+ shr \$16,$s0
+ movzb `&lo("$s3")`,$t4
+ shr \$16,$s1
+ xor $t5,$t1
+ shl \$8,$acc2
+ movzb `&lo("$s0")`,$t5
+ movzb ($sbox,$acc1,1),$acc1 #$t0
+ xor $acc2,$t2
+
+ shl \$8,$acc0
+ movzb `&lo("$s1")`,$acc2
+ shl \$16,$acc1
+ xor $acc0,$t3
+ movzb ($sbox,$t4,1),$t4 #$t1
+ movzb `&hi("$s3")`,$acc0
+ movzb ($sbox,$t5,1),$t5 #$t2
+ xor $acc1,$t0
+
+ shr \$8,$s2
+ movzb `&hi("$s0")`,$acc1
+ shl \$16,$t4
+ shr \$8,$s1
+ shl \$16,$t5
+ xor $t4,$t1
+ movzb ($sbox,$acc2,1),$acc2 #$t3
+ movzb ($sbox,$acc0,1),$acc0 #$t0
+ movzb ($sbox,$acc1,1),$acc1 #$t1
+ movzb ($sbox,$s2,1),$s3 #$t3
+ movzb ($sbox,$s1,1),$s2 #$t2
+
+ shl \$16,$acc2
+ xor $t5,$t2
+ shl \$24,$acc0
+ xor $acc2,$t3
+ shl \$24,$acc1
+ xor $acc0,$t0
+ shl \$24,$s3
+ xor $acc1,$t1
+ shl \$24,$s2
+ mov $t0,$s0
+ mov $t1,$s1
+ xor $t2,$s2
+ xor $t3,$s3
+___
+}
+
+sub enctransform_ref()
+{ my $sn = shift;
+ my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
+
+$code.=<<___;
+ mov $sn,$acc
+ and \$0x80808080,$acc
+ mov $acc,$tmp
+ shr \$7,$tmp
+ lea ($sn,$sn),$r2
+ sub $tmp,$acc
+ and \$0xfefefefe,$r2
+ and \$0x1b1b1b1b,$acc
+ mov $sn,$tmp
+ xor $acc,$r2
+
+ xor $r2,$sn
+ rol \$24,$sn
+ xor $r2,$sn
+ ror \$16,$tmp
+ xor $tmp,$sn
+ ror \$8,$tmp
+ xor $tmp,$sn
+___
+}
+
+# unlike decrypt case it does not pay off to parallelize enctransform
+sub enctransform()
+{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
+
+$code.=<<___;
+ mov \$0x80808080,$t0
+ mov \$0x80808080,$t1
+ and $s0,$t0
+ and $s1,$t1
+ mov $t0,$acc0
+ mov $t1,$acc1
+ shr \$7,$t0
+ lea ($s0,$s0),$r20
+ shr \$7,$t1
+ lea ($s1,$s1),$r21
+ sub $t0,$acc0
+ sub $t1,$acc1
+ and \$0xfefefefe,$r20
+ and \$0xfefefefe,$r21
+ and \$0x1b1b1b1b,$acc0
+ and \$0x1b1b1b1b,$acc1
+ mov $s0,$t0
+ mov $s1,$t1
+ xor $acc0,$r20
+ xor $acc1,$r21
+
+ xor $r20,$s0
+ xor $r21,$s1
+ mov \$0x80808080,$t2
+ rol \$24,$s0
+ mov \$0x80808080,$t3
+ rol \$24,$s1
+ and $s2,$t2
+ and $s3,$t3
+ xor $r20,$s0
+ xor $r21,$s1
+ mov $t2,$acc0
+ ror \$16,$t0
+ mov $t3,$acc1
+ ror \$16,$t1
+ lea ($s2,$s2),$r20
+ shr \$7,$t2
+ xor $t0,$s0
+ shr \$7,$t3
+ xor $t1,$s1
+ ror \$8,$t0
+ lea ($s3,$s3),$r21
+ ror \$8,$t1
+ sub $t2,$acc0
+ sub $t3,$acc1
+ xor $t0,$s0
+ xor $t1,$s1
+
+ and \$0xfefefefe,$r20
+ and \$0xfefefefe,$r21
+ and \$0x1b1b1b1b,$acc0
+ and \$0x1b1b1b1b,$acc1
+ mov $s2,$t2
+ mov $s3,$t3
+ xor $acc0,$r20
+ xor $acc1,$r21
+
+ ror \$16,$t2
+ xor $r20,$s2
+ ror \$16,$t3
+ xor $r21,$s3
+ rol \$24,$s2
+ mov 0($sbox),$acc0 # prefetch Te4
+ rol \$24,$s3
+ xor $r20,$s2
+ mov 64($sbox),$acc1
+ xor $r21,$s3
+ mov 128($sbox),$r20
+ xor $t2,$s2
+ ror \$8,$t2
+ xor $t3,$s3
+ ror \$8,$t3
+ xor $t2,$s2
+ mov 192($sbox),$r21
+ xor $t3,$s3
+___
+}
+
+$code.=<<___;
+.type _x86_64_AES_encrypt_compact,\@abi-omnipotent
+.align 16
+_x86_64_AES_encrypt_compact:
+ lea 128($sbox),$inp # size optimization
+ mov 0-128($inp),$acc1 # prefetch Te4
+ mov 32-128($inp),$acc2
+ mov 64-128($inp),$t0
+ mov 96-128($inp),$t1
+ mov 128-128($inp),$acc1
+ mov 160-128($inp),$acc2
+ mov 192-128($inp),$t0
+ mov 224-128($inp),$t1
+ jmp .Lenc_loop_compact
+.align 16
+.Lenc_loop_compact:
+ xor 0($key),$s0 # xor with key
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+ lea 16($key),$key
+___
+ &enccompactvert();
+$code.=<<___;
+ cmp 16(%rsp),$key
+ je .Lenc_compact_done
+___
+ &enctransform();
+$code.=<<___;
+ jmp .Lenc_loop_compact
+.align 16
+.Lenc_compact_done:
+ xor 0($key),$s0
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+ .byte 0xf3,0xc3 # rep ret
+.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
+___
+
+# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
+$code.=<<___;
+.globl AES_encrypt
+.type AES_encrypt,\@function,3
+.align 16
+.globl asm_AES_encrypt
+.hidden asm_AES_encrypt
+asm_AES_encrypt:
+AES_encrypt:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # allocate frame "above" key schedule
+ mov %rsp,%r10
+ lea -63(%rdx),%rcx # %rdx is key argument
+ and \$-64,%rsp
+ sub %rsp,%rcx
+ neg %rcx
+ and \$0x3c0,%rcx
+ sub %rcx,%rsp
+ sub \$32,%rsp
+
+ mov %rsi,16(%rsp) # save out
+ mov %r10,24(%rsp) # save real stack pointer
+.Lenc_prologue:
+
+ mov %rdx,$key
+ mov 240($key),$rnds # load rounds
+
+ mov 0(%rdi),$s0 # load input vector
+ mov 4(%rdi),$s1
+ mov 8(%rdi),$s2
+ mov 12(%rdi),$s3
+
+ shl \$4,$rnds
+ lea ($key,$rnds),%rbp
+ mov $key,(%rsp) # key schedule
+ mov %rbp,8(%rsp) # end of key schedule
+
+ # pick Te4 copy which can't "overlap" with stack frame or key schedule
+ lea .LAES_Te+2048(%rip),$sbox
+ lea 768(%rsp),%rbp
+ sub $sbox,%rbp
+ and \$0x300,%rbp
+ lea ($sbox,%rbp),$sbox
+
+ call _x86_64_AES_encrypt_compact
+
+ mov 16(%rsp),$out # restore out
+ mov 24(%rsp),%rsi # restore saved stack pointer
+ mov $s0,0($out) # write output vector
+ mov $s1,4($out)
+ mov $s2,8($out)
+ mov $s3,12($out)
+
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lenc_epilogue:
+ ret
+.size AES_encrypt,.-AES_encrypt
+___
+
+#------------------------------------------------------------------#
+
+sub decvert()
+{ my $t3="%r8d"; # zaps $inp!
+
+$code.=<<___;
+ # favor 3-way issue Opteron pipeline...
+ movzb `&lo("$s0")`,$acc0
+ movzb `&lo("$s1")`,$acc1
+ movzb `&lo("$s2")`,$acc2
+ mov 0($sbox,$acc0,8),$t0
+ mov 0($sbox,$acc1,8),$t1
+ mov 0($sbox,$acc2,8),$t2
+
+ movzb `&hi("$s3")`,$acc0
+ movzb `&hi("$s0")`,$acc1
+ movzb `&lo("$s3")`,$acc2
+ xor 3($sbox,$acc0,8),$t0
+ xor 3($sbox,$acc1,8),$t1
+ mov 0($sbox,$acc2,8),$t3
+
+ movzb `&hi("$s1")`,$acc0
+ shr \$16,$s0
+ movzb `&hi("$s2")`,$acc2
+ xor 3($sbox,$acc0,8),$t2
+ shr \$16,$s3
+ xor 3($sbox,$acc2,8),$t3
+
+ shr \$16,$s1
+ lea 16($key),$key
+ shr \$16,$s2
+
+ movzb `&lo("$s2")`,$acc0
+ movzb `&lo("$s3")`,$acc1
+ movzb `&lo("$s0")`,$acc2
+ xor 2($sbox,$acc0,8),$t0
+ xor 2($sbox,$acc1,8),$t1
+ xor 2($sbox,$acc2,8),$t2
+
+ movzb `&hi("$s1")`,$acc0
+ movzb `&hi("$s2")`,$acc1
+ movzb `&lo("$s1")`,$acc2
+ xor 1($sbox,$acc0,8),$t0
+ xor 1($sbox,$acc1,8),$t1
+ xor 2($sbox,$acc2,8),$t3
+
+ movzb `&hi("$s3")`,$acc0
+ mov 12($key),$s3
+ movzb `&hi("$s0")`,$acc2
+ xor 1($sbox,$acc0,8),$t2
+ mov 0($key),$s0
+ xor 1($sbox,$acc2,8),$t3
+
+ xor $t0,$s0
+ mov 4($key),$s1
+ mov 8($key),$s2
+ xor $t2,$s2
+ xor $t1,$s1
+ xor $t3,$s3
+___
+}
+
+sub declastvert()
+{ my $t3="%r8d"; # zaps $inp!
+
+$code.=<<___;
+ lea 2048($sbox),$sbox # size optimization
+ movzb `&lo("$s0")`,$acc0
+ movzb `&lo("$s1")`,$acc1
+ movzb `&lo("$s2")`,$acc2
+ movzb ($sbox,$acc0,1),$t0
+ movzb ($sbox,$acc1,1),$t1
+ movzb ($sbox,$acc2,1),$t2
+
+ movzb `&lo("$s3")`,$acc0
+ movzb `&hi("$s3")`,$acc1
+ movzb `&hi("$s0")`,$acc2
+ movzb ($sbox,$acc0,1),$t3
+ movzb ($sbox,$acc1,1),$acc1 #$t0
+ movzb ($sbox,$acc2,1),$acc2 #$t1
+
+ shl \$8,$acc1
+ shl \$8,$acc2
+
+ xor $acc1,$t0
+ xor $acc2,$t1
+ shr \$16,$s3
+
+ movzb `&hi("$s1")`,$acc0
+ movzb `&hi("$s2")`,$acc1
+ shr \$16,$s0
+ movzb ($sbox,$acc0,1),$acc0 #$t2
+ movzb ($sbox,$acc1,1),$acc1 #$t3
+
+ shl \$8,$acc0
+ shl \$8,$acc1
+ shr \$16,$s1
+ xor $acc0,$t2
+ xor $acc1,$t3
+ shr \$16,$s2
+
+ movzb `&lo("$s2")`,$acc0
+ movzb `&lo("$s3")`,$acc1
+ movzb `&lo("$s0")`,$acc2
+ movzb ($sbox,$acc0,1),$acc0 #$t0
+ movzb ($sbox,$acc1,1),$acc1 #$t1
+ movzb ($sbox,$acc2,1),$acc2 #$t2
+
+ shl \$16,$acc0
+ shl \$16,$acc1
+ shl \$16,$acc2
+
+ xor $acc0,$t0
+ xor $acc1,$t1
+ xor $acc2,$t2
+
+ movzb `&lo("$s1")`,$acc0
+ movzb `&hi("$s1")`,$acc1
+ movzb `&hi("$s2")`,$acc2
+ movzb ($sbox,$acc0,1),$acc0 #$t3
+ movzb ($sbox,$acc1,1),$acc1 #$t0
+ movzb ($sbox,$acc2,1),$acc2 #$t1
+
+ shl \$16,$acc0
+ shl \$24,$acc1
+ shl \$24,$acc2
+
+ xor $acc0,$t3
+ xor $acc1,$t0
+ xor $acc2,$t1
+
+ movzb `&hi("$s3")`,$acc0
+ movzb `&hi("$s0")`,$acc1
+ mov 16+12($key),$s3
+ movzb ($sbox,$acc0,1),$acc0 #$t2
+ movzb ($sbox,$acc1,1),$acc1 #$t3
+ mov 16+0($key),$s0
+
+ shl \$24,$acc0
+ shl \$24,$acc1
+
+ xor $acc0,$t2
+ xor $acc1,$t3
+
+ mov 16+4($key),$s1
+ mov 16+8($key),$s2
+ lea -2048($sbox),$sbox
+ xor $t0,$s0
+ xor $t1,$s1
+ xor $t2,$s2
+ xor $t3,$s3
+___
+}
+
+sub decstep()
+{ my ($i,@s) = @_;
+ my $tmp0=$acc0;
+ my $tmp1=$acc1;
+ my $tmp2=$acc2;
+ my $out=($t0,$t1,$t2,$s[0])[$i];
+
+ $code.=" mov $s[0],$out\n" if ($i!=3);
+ $tmp1=$s[2] if ($i==3);
+ $code.=" mov $s[2],$tmp1\n" if ($i!=3);
+ $code.=" and \$0xFF,$out\n";
+
+ $code.=" mov 0($sbox,$out,8),$out\n";
+ $code.=" shr \$16,$tmp1\n";
+ $tmp2=$s[3] if ($i==3);
+ $code.=" mov $s[3],$tmp2\n" if ($i!=3);
+
+ $tmp0=$s[1] if ($i==3);
+ $code.=" movzb ".&hi($s[1]).",$tmp0\n";
+ $code.=" and \$0xFF,$tmp1\n";
+ $code.=" shr \$24,$tmp2\n";
+
+ $code.=" xor 3($sbox,$tmp0,8),$out\n";
+ $code.=" xor 2($sbox,$tmp1,8),$out\n";
+ $code.=" xor 1($sbox,$tmp2,8),$out\n";
+
+ $code.=" mov $t2,$s[1]\n" if ($i==3);
+ $code.=" mov $t1,$s[2]\n" if ($i==3);
+ $code.=" mov $t0,$s[3]\n" if ($i==3);
+ $code.="\n";
+}
+
+sub declast()
+{ my ($i,@s)=@_;
+ my $tmp0=$acc0;
+ my $tmp1=$acc1;
+ my $tmp2=$acc2;
+ my $out=($t0,$t1,$t2,$s[0])[$i];
+
+ $code.=" mov $s[0],$out\n" if ($i!=3);
+ $tmp1=$s[2] if ($i==3);
+ $code.=" mov $s[2],$tmp1\n" if ($i!=3);
+ $code.=" and \$0xFF,$out\n";
+
+ $code.=" movzb 2048($sbox,$out,1),$out\n";
+ $code.=" shr \$16,$tmp1\n";
+ $tmp2=$s[3] if ($i==3);
+ $code.=" mov $s[3],$tmp2\n" if ($i!=3);
+
+ $tmp0=$s[1] if ($i==3);
+ $code.=" movzb ".&hi($s[1]).",$tmp0\n";
+ $code.=" and \$0xFF,$tmp1\n";
+ $code.=" shr \$24,$tmp2\n";
+
+ $code.=" movzb 2048($sbox,$tmp0,1),$tmp0\n";
+ $code.=" movzb 2048($sbox,$tmp1,1),$tmp1\n";
+ $code.=" movzb 2048($sbox,$tmp2,1),$tmp2\n";
+
+ $code.=" shl \$8,$tmp0\n";
+ $code.=" shl \$16,$tmp1\n";
+ $code.=" shl \$24,$tmp2\n";
+
+ $code.=" xor $tmp0,$out\n";
+ $code.=" mov $t2,$s[1]\n" if ($i==3);
+ $code.=" xor $tmp1,$out\n";
+ $code.=" mov $t1,$s[2]\n" if ($i==3);
+ $code.=" xor $tmp2,$out\n";
+ $code.=" mov $t0,$s[3]\n" if ($i==3);
+ $code.="\n";
+}
+
+$code.=<<___;
+.type _x86_64_AES_decrypt,\@abi-omnipotent
+.align 16
+_x86_64_AES_decrypt:
+ xor 0($key),$s0 # xor with key
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+
+ mov 240($key),$rnds # load key->rounds
+ sub \$1,$rnds
+ jmp .Ldec_loop
+.align 16
+.Ldec_loop:
+___
+ if ($verticalspin) { &decvert(); }
+ else { &decstep(0,$s0,$s3,$s2,$s1);
+ &decstep(1,$s1,$s0,$s3,$s2);
+ &decstep(2,$s2,$s1,$s0,$s3);
+ &decstep(3,$s3,$s2,$s1,$s0);
+ $code.=<<___;
+ lea 16($key),$key
+ xor 0($key),$s0 # xor with key
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+___
+ }
+$code.=<<___;
+ sub \$1,$rnds
+ jnz .Ldec_loop
+___
+ if ($verticalspin) { &declastvert(); }
+ else { &declast(0,$s0,$s3,$s2,$s1);
+ &declast(1,$s1,$s0,$s3,$s2);
+ &declast(2,$s2,$s1,$s0,$s3);
+ &declast(3,$s3,$s2,$s1,$s0);
+ $code.=<<___;
+ xor 16+0($key),$s0 # xor with key
+ xor 16+4($key),$s1
+ xor 16+8($key),$s2
+ xor 16+12($key),$s3
+___
+ }
+$code.=<<___;
+ .byte 0xf3,0xc3 # rep ret
+.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
+___
+
+sub deccompactvert()
+{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
+
+$code.=<<___;
+ movzb `&lo("$s0")`,$t0
+ movzb `&lo("$s1")`,$t1
+ movzb `&lo("$s2")`,$t2
+ movzb `&lo("$s3")`,$t3
+ movzb `&hi("$s3")`,$acc0
+ movzb `&hi("$s0")`,$acc1
+ shr \$16,$s3
+ movzb `&hi("$s1")`,$acc2
+ movzb ($sbox,$t0,1),$t0
+ movzb ($sbox,$t1,1),$t1
+ movzb ($sbox,$t2,1),$t2
+ movzb ($sbox,$t3,1),$t3
+
+ movzb ($sbox,$acc0,1),$t4 #$t0
+ movzb `&hi("$s2")`,$acc0
+ movzb ($sbox,$acc1,1),$t5 #$t1
+ movzb ($sbox,$acc2,1),$acc2 #$t2
+ movzb ($sbox,$acc0,1),$acc0 #$t3
+
+ shr \$16,$s2
+ shl \$8,$t5
+ shl \$8,$t4
+ movzb `&lo("$s2")`,$acc1
+ shr \$16,$s0
+ xor $t4,$t0
+ shr \$16,$s1
+ movzb `&lo("$s3")`,$t4
+
+ shl \$8,$acc2
+ xor $t5,$t1
+ shl \$8,$acc0
+ movzb `&lo("$s0")`,$t5
+ movzb ($sbox,$acc1,1),$acc1 #$t0
+ xor $acc2,$t2
+ movzb `&lo("$s1")`,$acc2
+
+ shl \$16,$acc1
+ xor $acc0,$t3
+ movzb ($sbox,$t4,1),$t4 #$t1
+ movzb `&hi("$s1")`,$acc0
+ movzb ($sbox,$acc2,1),$acc2 #$t3
+ xor $acc1,$t0
+ movzb ($sbox,$t5,1),$t5 #$t2
+ movzb `&hi("$s2")`,$acc1
+
+ shl \$16,$acc2
+ shl \$16,$t4
+ shl \$16,$t5
+ xor $acc2,$t3
+ movzb `&hi("$s3")`,$acc2
+ xor $t4,$t1
+ shr \$8,$s0
+ xor $t5,$t2
+
+ movzb ($sbox,$acc0,1),$acc0 #$t0
+ movzb ($sbox,$acc1,1),$s1 #$t1
+ movzb ($sbox,$acc2,1),$s2 #$t2
+ movzb ($sbox,$s0,1),$s3 #$t3
+
+ mov $t0,$s0
+ shl \$24,$acc0
+ shl \$24,$s1
+ shl \$24,$s2
+ xor $acc0,$s0
+ shl \$24,$s3
+ xor $t1,$s1
+ xor $t2,$s2
+ xor $t3,$s3
+___
+}
+
+# parallelized version! input is pair of 64-bit values: %rax=s1.s0
+# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
+# %ecx=s2 and %edx=s3.
+sub dectransform()
+{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
+ my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
+ my $prefetch = shift;
+
+$code.=<<___;
+ mov $mask80,$tp40
+ mov $mask80,$tp48
+ and $tp10,$tp40
+ and $tp18,$tp48
+ mov $tp40,$acc0
+ mov $tp48,$acc8
+ shr \$7,$tp40
+ lea ($tp10,$tp10),$tp20
+ shr \$7,$tp48
+ lea ($tp18,$tp18),$tp28
+ sub $tp40,$acc0
+ sub $tp48,$acc8
+ and $maskfe,$tp20
+ and $maskfe,$tp28
+ and $mask1b,$acc0
+ and $mask1b,$acc8
+ xor $acc0,$tp20
+ xor $acc8,$tp28
+ mov $mask80,$tp80
+ mov $mask80,$tp88
+
+ and $tp20,$tp80
+ and $tp28,$tp88
+ mov $tp80,$acc0
+ mov $tp88,$acc8
+ shr \$7,$tp80
+ lea ($tp20,$tp20),$tp40
+ shr \$7,$tp88
+ lea ($tp28,$tp28),$tp48
+ sub $tp80,$acc0
+ sub $tp88,$acc8
+ and $maskfe,$tp40
+ and $maskfe,$tp48
+ and $mask1b,$acc0
+ and $mask1b,$acc8
+ xor $acc0,$tp40
+ xor $acc8,$tp48
+ mov $mask80,$tp80
+ mov $mask80,$tp88
+
+ and $tp40,$tp80
+ and $tp48,$tp88
+ mov $tp80,$acc0
+ mov $tp88,$acc8
+ shr \$7,$tp80
+ xor $tp10,$tp20 # tp2^=tp1
+ shr \$7,$tp88
+ xor $tp18,$tp28 # tp2^=tp1
+ sub $tp80,$acc0
+ sub $tp88,$acc8
+ lea ($tp40,$tp40),$tp80
+ lea ($tp48,$tp48),$tp88
+ xor $tp10,$tp40 # tp4^=tp1
+ xor $tp18,$tp48 # tp4^=tp1
+ and $maskfe,$tp80
+ and $maskfe,$tp88
+ and $mask1b,$acc0
+ and $mask1b,$acc8
+ xor $acc0,$tp80
+ xor $acc8,$tp88
+
+ xor $tp80,$tp10 # tp1^=tp8
+ xor $tp88,$tp18 # tp1^=tp8
+ xor $tp80,$tp20 # tp2^tp1^=tp8
+ xor $tp88,$tp28 # tp2^tp1^=tp8
+ mov $tp10,$acc0
+ mov $tp18,$acc8
+ xor $tp80,$tp40 # tp4^tp1^=tp8
+ shr \$32,$acc0
+ xor $tp88,$tp48 # tp4^tp1^=tp8
+ shr \$32,$acc8
+ xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
+ rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
+ xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
+ rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
+ xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
+ rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
+ xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
+
+ rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
+ xor `&LO("$tp80")`,`&LO("$tp10")`
+ shr \$32,$tp80
+ xor `&LO("$tp88")`,`&LO("$tp18")`
+ shr \$32,$tp88
+ xor `&LO("$tp80")`,`&LO("$acc0")`
+ xor `&LO("$tp88")`,`&LO("$acc8")`
+
+ mov $tp20,$tp80
+ rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
+ mov $tp28,$tp88
+ rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
+ shr \$32,$tp80
+ xor `&LO("$tp20")`,`&LO("$tp10")`
+ shr \$32,$tp88
+ xor `&LO("$tp28")`,`&LO("$tp18")`
+ rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
+ mov $tp40,$tp20
+ rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
+ mov $tp48,$tp28
+ shr \$32,$tp20
+ xor `&LO("$tp80")`,`&LO("$acc0")`
+ shr \$32,$tp28
+ xor `&LO("$tp88")`,`&LO("$acc8")`
+
+ `"mov 0($sbox),$mask80" if ($prefetch)`
+ rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
+ `"mov 64($sbox),$maskfe" if ($prefetch)`
+ rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
+ `"mov 128($sbox),$mask1b" if ($prefetch)`
+ rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
+ `"mov 192($sbox),$tp80" if ($prefetch)`
+ xor `&LO("$tp40")`,`&LO("$tp10")`
+ rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
+ xor `&LO("$tp48")`,`&LO("$tp18")`
+ `"mov 256($sbox),$tp88" if ($prefetch)`
+ xor `&LO("$tp20")`,`&LO("$acc0")`
+ xor `&LO("$tp28")`,`&LO("$acc8")`
+___
+}
+
+$code.=<<___;
+.type _x86_64_AES_decrypt_compact,\@abi-omnipotent
+.align 16
+_x86_64_AES_decrypt_compact:
+ lea 128($sbox),$inp # size optimization
+ mov 0-128($inp),$acc1 # prefetch Td4
+ mov 32-128($inp),$acc2
+ mov 64-128($inp),$t0
+ mov 96-128($inp),$t1
+ mov 128-128($inp),$acc1
+ mov 160-128($inp),$acc2
+ mov 192-128($inp),$t0
+ mov 224-128($inp),$t1
+ jmp .Ldec_loop_compact
+
+.align 16
+.Ldec_loop_compact:
+ xor 0($key),$s0 # xor with key
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+ lea 16($key),$key
+___
+ &deccompactvert();
+$code.=<<___;
+ cmp 16(%rsp),$key
+ je .Ldec_compact_done
+
+ mov 256+0($sbox),$mask80
+ shl \$32,%rbx
+ shl \$32,%rdx
+ mov 256+8($sbox),$maskfe
+ or %rbx,%rax
+ or %rdx,%rcx
+ mov 256+16($sbox),$mask1b
+___
+ &dectransform(1);
+$code.=<<___;
+ jmp .Ldec_loop_compact
+.align 16
+.Ldec_compact_done:
+ xor 0($key),$s0
+ xor 4($key),$s1
+ xor 8($key),$s2
+ xor 12($key),$s3
+ .byte 0xf3,0xc3 # rep ret
+.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
+___
+
+# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
+$code.=<<___;
+.globl AES_decrypt
+.type AES_decrypt,\@function,3
+.align 16
+.globl asm_AES_decrypt
+.hidden asm_AES_decrypt
+asm_AES_decrypt:
+AES_decrypt:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # allocate frame "above" key schedule
+ mov %rsp,%r10
+ lea -63(%rdx),%rcx # %rdx is key argument
+ and \$-64,%rsp
+ sub %rsp,%rcx
+ neg %rcx
+ and \$0x3c0,%rcx
+ sub %rcx,%rsp
+ sub \$32,%rsp
+
+ mov %rsi,16(%rsp) # save out
+ mov %r10,24(%rsp) # save real stack pointer
+.Ldec_prologue:
+
+ mov %rdx,$key
+ mov 240($key),$rnds # load rounds
+
+ mov 0(%rdi),$s0 # load input vector
+ mov 4(%rdi),$s1
+ mov 8(%rdi),$s2
+ mov 12(%rdi),$s3
+
+ shl \$4,$rnds
+ lea ($key,$rnds),%rbp
+ mov $key,(%rsp) # key schedule
+ mov %rbp,8(%rsp) # end of key schedule
+
+ # pick Td4 copy which can't "overlap" with stack frame or key schedule
+ lea .LAES_Td+2048(%rip),$sbox
+ lea 768(%rsp),%rbp
+ sub $sbox,%rbp
+ and \$0x300,%rbp
+ lea ($sbox,%rbp),$sbox
+ shr \$3,%rbp # recall "magic" constants!
+ add %rbp,$sbox
+
+ call _x86_64_AES_decrypt_compact
+
+ mov 16(%rsp),$out # restore out
+ mov 24(%rsp),%rsi # restore saved stack pointer
+ mov $s0,0($out) # write output vector
+ mov $s1,4($out)
+ mov $s2,8($out)
+ mov $s3,12($out)
+
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Ldec_epilogue:
+ ret
+.size AES_decrypt,.-AES_decrypt
+___
+#------------------------------------------------------------------#
+
+sub enckey()
+{
+$code.=<<___;
+ movz %dl,%esi # rk[i]>>0
+ movzb -128(%rbp,%rsi),%ebx
+ movz %dh,%esi # rk[i]>>8
+ shl \$24,%ebx
+ xor %ebx,%eax
+
+ movzb -128(%rbp,%rsi),%ebx
+ shr \$16,%edx
+ movz %dl,%esi # rk[i]>>16
+ xor %ebx,%eax
+
+ movzb -128(%rbp,%rsi),%ebx
+ movz %dh,%esi # rk[i]>>24
+ shl \$8,%ebx
+ xor %ebx,%eax
+
+ movzb -128(%rbp,%rsi),%ebx
+ shl \$16,%ebx
+ xor %ebx,%eax
+
+ xor 1024-128(%rbp,%rcx,4),%eax # rcon
+___
+}
+
+# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# AES_KEY *key)
+$code.=<<___;
+.globl AES_set_encrypt_key
+.type AES_set_encrypt_key,\@function,3
+.align 16
+AES_set_encrypt_key:
+ push %rbx
+ push %rbp
+ push %r12 # redundant, but allows to share
+ push %r13 # exception handler...
+ push %r14
+ push %r15
+ sub \$8,%rsp
+.Lenc_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+
+ mov 40(%rsp),%rbp
+ mov 48(%rsp),%rbx
+ add \$56,%rsp
+.Lenc_key_epilogue:
+ ret
+.size AES_set_encrypt_key,.-AES_set_encrypt_key
+
+.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
+.align 16
+_x86_64_AES_set_encrypt_key:
+ mov %esi,%ecx # %ecx=bits
+ mov %rdi,%rsi # %rsi=userKey
+ mov %rdx,%rdi # %rdi=key
+
+ test \$-1,%rsi
+ jz .Lbadpointer
+ test \$-1,%rdi
+ jz .Lbadpointer
+
+ lea .LAES_Te(%rip),%rbp
+ lea 2048+128(%rbp),%rbp
+
+ # prefetch Te4
+ mov 0-128(%rbp),%eax
+ mov 32-128(%rbp),%ebx
+ mov 64-128(%rbp),%r8d
+ mov 96-128(%rbp),%edx
+ mov 128-128(%rbp),%eax
+ mov 160-128(%rbp),%ebx
+ mov 192-128(%rbp),%r8d
+ mov 224-128(%rbp),%edx
+
+ cmp \$128,%ecx
+ je .L10rounds
+ cmp \$192,%ecx
+ je .L12rounds
+ cmp \$256,%ecx
+ je .L14rounds
+ mov \$-2,%rax # invalid number of bits
+ jmp .Lexit
+
+.L10rounds:
+ mov 0(%rsi),%rax # copy first 4 dwords
+ mov 8(%rsi),%rdx
+ mov %rax,0(%rdi)
+ mov %rdx,8(%rdi)
+
+ shr \$32,%rdx
+ xor %ecx,%ecx
+ jmp .L10shortcut
+.align 4
+.L10loop:
+ mov 0(%rdi),%eax # rk[0]
+ mov 12(%rdi),%edx # rk[3]
+.L10shortcut:
+___
+ &enckey ();
+$code.=<<___;
+ mov %eax,16(%rdi) # rk[4]
+ xor 4(%rdi),%eax
+ mov %eax,20(%rdi) # rk[5]
+ xor 8(%rdi),%eax
+ mov %eax,24(%rdi) # rk[6]
+ xor 12(%rdi),%eax
+ mov %eax,28(%rdi) # rk[7]
+ add \$1,%ecx
+ lea 16(%rdi),%rdi
+ cmp \$10,%ecx
+ jl .L10loop
+
+ movl \$10,80(%rdi) # setup number of rounds
+ xor %rax,%rax
+ jmp .Lexit
+
+.L12rounds:
+ mov 0(%rsi),%rax # copy first 6 dwords
+ mov 8(%rsi),%rbx
+ mov 16(%rsi),%rdx
+ mov %rax,0(%rdi)
+ mov %rbx,8(%rdi)
+ mov %rdx,16(%rdi)
+
+ shr \$32,%rdx
+ xor %ecx,%ecx
+ jmp .L12shortcut
+.align 4
+.L12loop:
+ mov 0(%rdi),%eax # rk[0]
+ mov 20(%rdi),%edx # rk[5]
+.L12shortcut:
+___
+ &enckey ();
+$code.=<<___;
+ mov %eax,24(%rdi) # rk[6]
+ xor 4(%rdi),%eax
+ mov %eax,28(%rdi) # rk[7]
+ xor 8(%rdi),%eax
+ mov %eax,32(%rdi) # rk[8]
+ xor 12(%rdi),%eax
+ mov %eax,36(%rdi) # rk[9]
+
+ cmp \$7,%ecx
+ je .L12break
+ add \$1,%ecx
+
+ xor 16(%rdi),%eax
+ mov %eax,40(%rdi) # rk[10]
+ xor 20(%rdi),%eax
+ mov %eax,44(%rdi) # rk[11]
+
+ lea 24(%rdi),%rdi
+ jmp .L12loop
+.L12break:
+ movl \$12,72(%rdi) # setup number of rounds
+ xor %rax,%rax
+ jmp .Lexit
+
+.L14rounds:
+ mov 0(%rsi),%rax # copy first 8 dwords
+ mov 8(%rsi),%rbx
+ mov 16(%rsi),%rcx
+ mov 24(%rsi),%rdx
+ mov %rax,0(%rdi)
+ mov %rbx,8(%rdi)
+ mov %rcx,16(%rdi)
+ mov %rdx,24(%rdi)
+
+ shr \$32,%rdx
+ xor %ecx,%ecx
+ jmp .L14shortcut
+.align 4
+.L14loop:
+ mov 0(%rdi),%eax # rk[0]
+ mov 28(%rdi),%edx # rk[4]
+.L14shortcut:
+___
+ &enckey ();
+$code.=<<___;
+ mov %eax,32(%rdi) # rk[8]
+ xor 4(%rdi),%eax
+ mov %eax,36(%rdi) # rk[9]
+ xor 8(%rdi),%eax
+ mov %eax,40(%rdi) # rk[10]
+ xor 12(%rdi),%eax
+ mov %eax,44(%rdi) # rk[11]
+
+ cmp \$6,%ecx
+ je .L14break
+ add \$1,%ecx
+
+ mov %eax,%edx
+ mov 16(%rdi),%eax # rk[4]
+ movz %dl,%esi # rk[11]>>0
+ movzb -128(%rbp,%rsi),%ebx
+ movz %dh,%esi # rk[11]>>8
+ xor %ebx,%eax
+
+ movzb -128(%rbp,%rsi),%ebx
+ shr \$16,%edx
+ shl \$8,%ebx
+ movz %dl,%esi # rk[11]>>16
+ xor %ebx,%eax
+
+ movzb -128(%rbp,%rsi),%ebx
+ movz %dh,%esi # rk[11]>>24
+ shl \$16,%ebx
+ xor %ebx,%eax
+
+ movzb -128(%rbp,%rsi),%ebx
+ shl \$24,%ebx
+ xor %ebx,%eax
+
+ mov %eax,48(%rdi) # rk[12]
+ xor 20(%rdi),%eax
+ mov %eax,52(%rdi) # rk[13]
+ xor 24(%rdi),%eax
+ mov %eax,56(%rdi) # rk[14]
+ xor 28(%rdi),%eax
+ mov %eax,60(%rdi) # rk[15]
+
+ lea 32(%rdi),%rdi
+ jmp .L14loop
+.L14break:
+ movl \$14,48(%rdi) # setup number of rounds
+ xor %rax,%rax
+ jmp .Lexit
+
+.Lbadpointer:
+ mov \$-1,%rax
+.Lexit:
+ .byte 0xf3,0xc3 # rep ret
+.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
+___
+
+sub deckey_ref()
+{ my ($i,$ptr,$te,$td) = @_;
+ my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
+$code.=<<___;
+ mov $i($ptr),$tp1
+ mov $tp1,$acc
+ and \$0x80808080,$acc
+ mov $acc,$tp4
+ shr \$7,$tp4
+ lea 0($tp1,$tp1),$tp2
+ sub $tp4,$acc
+ and \$0xfefefefe,$tp2
+ and \$0x1b1b1b1b,$acc
+ xor $tp2,$acc
+ mov $acc,$tp2
+
+ and \$0x80808080,$acc
+ mov $acc,$tp8
+ shr \$7,$tp8
+ lea 0($tp2,$tp2),$tp4
+ sub $tp8,$acc
+ and \$0xfefefefe,$tp4
+ and \$0x1b1b1b1b,$acc
+ xor $tp1,$tp2 # tp2^tp1
+ xor $tp4,$acc
+ mov $acc,$tp4
+
+ and \$0x80808080,$acc
+ mov $acc,$tp8
+ shr \$7,$tp8
+ sub $tp8,$acc
+ lea 0($tp4,$tp4),$tp8
+ xor $tp1,$tp4 # tp4^tp1
+ and \$0xfefefefe,$tp8
+ and \$0x1b1b1b1b,$acc
+ xor $acc,$tp8
+
+ xor $tp8,$tp1 # tp1^tp8
+ rol \$8,$tp1 # ROTATE(tp1^tp8,8)
+ xor $tp8,$tp2 # tp2^tp1^tp8
+ xor $tp8,$tp4 # tp4^tp1^tp8
+ xor $tp2,$tp8
+ xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
+
+ xor $tp8,$tp1
+ rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24)
+ xor $tp2,$tp1
+ rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16)
+ xor $tp4,$tp1
+
+ mov $tp1,$i($ptr)
+___
+}
+
+# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# AES_KEY *key)
+$code.=<<___;
+.globl AES_set_decrypt_key
+.type AES_set_decrypt_key,\@function,3
+.align 16
+AES_set_decrypt_key:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ push %rdx # save key schedule
+.Ldec_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+ mov (%rsp),%r8 # restore key schedule
+ cmp \$0,%eax
+ jne .Labort
+
+ mov 240(%r8),%r14d # pull number of rounds
+ xor %rdi,%rdi
+ lea (%rdi,%r14d,4),%rcx
+ mov %r8,%rsi
+ lea (%r8,%rcx,4),%rdi # pointer to last chunk
+.align 4
+.Linvert:
+ mov 0(%rsi),%rax
+ mov 8(%rsi),%rbx
+ mov 0(%rdi),%rcx
+ mov 8(%rdi),%rdx
+ mov %rax,0(%rdi)
+ mov %rbx,8(%rdi)
+ mov %rcx,0(%rsi)
+ mov %rdx,8(%rsi)
+ lea 16(%rsi),%rsi
+ lea -16(%rdi),%rdi
+ cmp %rsi,%rdi
+ jne .Linvert
+
+ lea .LAES_Te+2048+1024(%rip),%rax # rcon
+
+ mov 40(%rax),$mask80
+ mov 48(%rax),$maskfe
+ mov 56(%rax),$mask1b
+
+ mov %r8,$key
+ sub \$1,%r14d
+.align 4
+.Lpermute:
+ lea 16($key),$key
+ mov 0($key),%rax
+ mov 8($key),%rcx
+___
+ &dectransform ();
+$code.=<<___;
+ mov %eax,0($key)
+ mov %ebx,4($key)
+ mov %ecx,8($key)
+ mov %edx,12($key)
+ sub \$1,%r14d
+ jnz .Lpermute
+
+ xor %rax,%rax
+.Labort:
+ mov 8(%rsp),%r15
+ mov 16(%rsp),%r14
+ mov 24(%rsp),%r13
+ mov 32(%rsp),%r12
+ mov 40(%rsp),%rbp
+ mov 48(%rsp),%rbx
+ add \$56,%rsp
+.Ldec_key_epilogue:
+ ret
+.size AES_set_decrypt_key,.-AES_set_decrypt_key
+___
+
+# void AES_cbc_encrypt (const void char *inp, unsigned char *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+{
+# stack frame layout
+# -8(%rsp) return address
+my $keyp="0(%rsp)"; # one to pass as $key
+my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds])
+my $_rsp="16(%rsp)"; # saved %rsp
+my $_inp="24(%rsp)"; # copy of 1st parameter, inp
+my $_out="32(%rsp)"; # copy of 2nd parameter, out
+my $_len="40(%rsp)"; # copy of 3rd parameter, length
+my $_key="48(%rsp)"; # copy of 4th parameter, key
+my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp
+my $ivec="64(%rsp)"; # ivec[16]
+my $aes_key="80(%rsp)"; # copy of aes_key
+my $mark="80+240(%rsp)"; # copy of aes_key->rounds
+
+$code.=<<___;
+.globl AES_cbc_encrypt
+.type AES_cbc_encrypt,\@function,6
+.align 16
+.extern OPENSSL_ia32cap_P
+.globl asm_AES_cbc_encrypt
+.hidden asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
+AES_cbc_encrypt:
+ cmp \$0,%rdx # check length
+ je .Lcbc_epilogue
+ pushfq
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+.Lcbc_prologue:
+
+ cld
+ mov %r9d,%r9d # clear upper half of enc
+
+ lea .LAES_Te(%rip),$sbox
+ cmp \$0,%r9
+ jne .Lcbc_picked_te
+ lea .LAES_Td(%rip),$sbox
+.Lcbc_picked_te:
+
+ mov OPENSSL_ia32cap_P(%rip),%r10d
+ cmp \$$speed_limit,%rdx
+ jb .Lcbc_slow_prologue
+ test \$15,%rdx
+ jnz .Lcbc_slow_prologue
+ bt \$28,%r10d
+ jc .Lcbc_slow_prologue
+
+ # allocate aligned stack frame...
+ lea -88-248(%rsp),$key
+ and \$-64,$key
+
+ # ... and make sure it doesn't alias with AES_T[ed] modulo 4096
+ mov $sbox,%r10
+ lea 2304($sbox),%r11
+ mov $key,%r12
+ and \$0xFFF,%r10 # s = $sbox&0xfff
+ and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff
+ and \$0xFFF,%r12 # p = %rsp&0xfff
+
+ cmp %r11,%r12 # if (p=>e) %rsp =- (p-e);
+ jb .Lcbc_te_break_out
+ sub %r11,%r12
+ sub %r12,$key
+ jmp .Lcbc_te_ok
+.Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz
+ sub %r10,%r12
+ and \$0xFFF,%r12
+ add \$320,%r12
+ sub %r12,$key
+.align 4
+.Lcbc_te_ok:
+
+ xchg %rsp,$key
+ #add \$8,%rsp # reserve for return address!
+ mov $key,$_rsp # save %rsp
+.Lcbc_fast_body:
+ mov %rdi,$_inp # save copy of inp
+ mov %rsi,$_out # save copy of out
+ mov %rdx,$_len # save copy of len
+ mov %rcx,$_key # save copy of key
+ mov %r8,$_ivp # save copy of ivp
+ movl \$0,$mark # copy of aes_key->rounds = 0;
+ mov %r8,%rbp # rearrange input arguments
+ mov %r9,%rbx
+ mov %rsi,$out
+ mov %rdi,$inp
+ mov %rcx,$key
+
+ mov 240($key),%eax # key->rounds
+ # do we copy key schedule to stack?
+ mov $key,%r10
+ sub $sbox,%r10
+ and \$0xfff,%r10
+ cmp \$2304,%r10
+ jb .Lcbc_do_ecopy
+ cmp \$4096-248,%r10
+ jb .Lcbc_skip_ecopy
+.align 4
+.Lcbc_do_ecopy:
+ mov $key,%rsi
+ lea $aes_key,%rdi
+ lea $aes_key,$key
+ mov \$240/8,%ecx
+ .long 0x90A548F3 # rep movsq
+ mov %eax,(%rdi) # copy aes_key->rounds
+.Lcbc_skip_ecopy:
+ mov $key,$keyp # save key pointer
+
+ mov \$18,%ecx
+.align 4
+.Lcbc_prefetch_te:
+ mov 0($sbox),%r10
+ mov 32($sbox),%r11
+ mov 64($sbox),%r12
+ mov 96($sbox),%r13
+ lea 128($sbox),$sbox
+ sub \$1,%ecx
+ jnz .Lcbc_prefetch_te
+ lea -2304($sbox),$sbox
+
+ cmp \$0,%rbx
+ je .LFAST_DECRYPT
+
+#----------------------------- ENCRYPT -----------------------------#
+ mov 0(%rbp),$s0 # load iv
+ mov 4(%rbp),$s1
+ mov 8(%rbp),$s2
+ mov 12(%rbp),$s3
+
+.align 4
+.Lcbc_fast_enc_loop:
+ xor 0($inp),$s0
+ xor 4($inp),$s1
+ xor 8($inp),$s2
+ xor 12($inp),$s3
+ mov $keyp,$key # restore key
+ mov $inp,$_inp # if ($verticalspin) save inp
+
+ call _x86_64_AES_encrypt
+
+ mov $_inp,$inp # if ($verticalspin) restore inp
+ mov $_len,%r10
+ mov $s0,0($out)
+ mov $s1,4($out)
+ mov $s2,8($out)
+ mov $s3,12($out)
+
+ lea 16($inp),$inp
+ lea 16($out),$out
+ sub \$16,%r10
+ test \$-16,%r10
+ mov %r10,$_len
+ jnz .Lcbc_fast_enc_loop
+ mov $_ivp,%rbp # restore ivp
+ mov $s0,0(%rbp) # save ivec
+ mov $s1,4(%rbp)
+ mov $s2,8(%rbp)
+ mov $s3,12(%rbp)
+
+ jmp .Lcbc_fast_cleanup
+
+#----------------------------- DECRYPT -----------------------------#
+.align 16
+.LFAST_DECRYPT:
+ cmp $inp,$out
+ je .Lcbc_fast_dec_in_place
+
+ mov %rbp,$ivec
+.align 4
+.Lcbc_fast_dec_loop:
+ mov 0($inp),$s0 # read input
+ mov 4($inp),$s1
+ mov 8($inp),$s2
+ mov 12($inp),$s3
+ mov $keyp,$key # restore key
+ mov $inp,$_inp # if ($verticalspin) save inp
+
+ call _x86_64_AES_decrypt
+
+ mov $ivec,%rbp # load ivp
+ mov $_inp,$inp # if ($verticalspin) restore inp
+ mov $_len,%r10 # load len
+ xor 0(%rbp),$s0 # xor iv
+ xor 4(%rbp),$s1
+ xor 8(%rbp),$s2
+ xor 12(%rbp),$s3
+ mov $inp,%rbp # current input, next iv
+
+ sub \$16,%r10
+ mov %r10,$_len # update len
+ mov %rbp,$ivec # update ivp
+
+ mov $s0,0($out) # write output
+ mov $s1,4($out)
+ mov $s2,8($out)
+ mov $s3,12($out)
+
+ lea 16($inp),$inp
+ lea 16($out),$out
+ jnz .Lcbc_fast_dec_loop
+ mov $_ivp,%r12 # load user ivp
+ mov 0(%rbp),%r10 # load iv
+ mov 8(%rbp),%r11
+ mov %r10,0(%r12) # copy back to user
+ mov %r11,8(%r12)
+ jmp .Lcbc_fast_cleanup
+
+.align 16
+.Lcbc_fast_dec_in_place:
+ mov 0(%rbp),%r10 # copy iv to stack
+ mov 8(%rbp),%r11
+ mov %r10,0+$ivec
+ mov %r11,8+$ivec
+.align 4
+.Lcbc_fast_dec_in_place_loop:
+ mov 0($inp),$s0 # load input
+ mov 4($inp),$s1
+ mov 8($inp),$s2
+ mov 12($inp),$s3
+ mov $keyp,$key # restore key
+ mov $inp,$_inp # if ($verticalspin) save inp
+
+ call _x86_64_AES_decrypt
+
+ mov $_inp,$inp # if ($verticalspin) restore inp
+ mov $_len,%r10
+ xor 0+$ivec,$s0
+ xor 4+$ivec,$s1
+ xor 8+$ivec,$s2
+ xor 12+$ivec,$s3
+
+ mov 0($inp),%r11 # load input
+ mov 8($inp),%r12
+ sub \$16,%r10
+ jz .Lcbc_fast_dec_in_place_done
+
+ mov %r11,0+$ivec # copy input to iv
+ mov %r12,8+$ivec
+
+ mov $s0,0($out) # save output [zaps input]
+ mov $s1,4($out)
+ mov $s2,8($out)
+ mov $s3,12($out)
+
+ lea 16($inp),$inp
+ lea 16($out),$out
+ mov %r10,$_len
+ jmp .Lcbc_fast_dec_in_place_loop
+.Lcbc_fast_dec_in_place_done:
+ mov $_ivp,%rdi
+ mov %r11,0(%rdi) # copy iv back to user
+ mov %r12,8(%rdi)
+
+ mov $s0,0($out) # save output [zaps input]
+ mov $s1,4($out)
+ mov $s2,8($out)
+ mov $s3,12($out)
+
+.align 4
+.Lcbc_fast_cleanup:
+ cmpl \$0,$mark # was the key schedule copied?
+ lea $aes_key,%rdi
+ je .Lcbc_exit
+ mov \$240/8,%ecx
+ xor %rax,%rax
+ .long 0x90AB48F3 # rep stosq
+
+ jmp .Lcbc_exit
+
+#--------------------------- SLOW ROUTINE ---------------------------#
+.align 16
+.Lcbc_slow_prologue:
+ # allocate aligned stack frame...
+ lea -88(%rsp),%rbp
+ and \$-64,%rbp
+ # ... just "above" key schedule
+ lea -88-63(%rcx),%r10
+ sub %rbp,%r10
+ neg %r10
+ and \$0x3c0,%r10
+ sub %r10,%rbp
+
+ xchg %rsp,%rbp
+ #add \$8,%rsp # reserve for return address!
+ mov %rbp,$_rsp # save %rsp
+.Lcbc_slow_body:
+ #mov %rdi,$_inp # save copy of inp
+ #mov %rsi,$_out # save copy of out
+ #mov %rdx,$_len # save copy of len
+ #mov %rcx,$_key # save copy of key
+ mov %r8,$_ivp # save copy of ivp
+ mov %r8,%rbp # rearrange input arguments
+ mov %r9,%rbx
+ mov %rsi,$out
+ mov %rdi,$inp
+ mov %rcx,$key
+ mov %rdx,%r10
+
+ mov 240($key),%eax
+ mov $key,$keyp # save key pointer
+ shl \$4,%eax
+ lea ($key,%rax),%rax
+ mov %rax,$keyend
+
+ # pick Te4 copy which can't "overlap" with stack frame or key scdedule
+ lea 2048($sbox),$sbox
+ lea 768-8(%rsp),%rax
+ sub $sbox,%rax
+ and \$0x300,%rax
+ lea ($sbox,%rax),$sbox
+
+ cmp \$0,%rbx
+ je .LSLOW_DECRYPT
+
+#--------------------------- SLOW ENCRYPT ---------------------------#
+ test \$-16,%r10 # check upon length
+ mov 0(%rbp),$s0 # load iv
+ mov 4(%rbp),$s1
+ mov 8(%rbp),$s2
+ mov 12(%rbp),$s3
+ jz .Lcbc_slow_enc_tail # short input...
+
+.align 4
+.Lcbc_slow_enc_loop:
+ xor 0($inp),$s0
+ xor 4($inp),$s1
+ xor 8($inp),$s2
+ xor 12($inp),$s3
+ mov $keyp,$key # restore key
+ mov $inp,$_inp # save inp
+ mov $out,$_out # save out
+ mov %r10,$_len # save len
+
+ call _x86_64_AES_encrypt_compact
+
+ mov $_inp,$inp # restore inp
+ mov $_out,$out # restore out
+ mov $_len,%r10 # restore len
+ mov $s0,0($out)
+ mov $s1,4($out)
+ mov $s2,8($out)
+ mov $s3,12($out)
+
+ lea 16($inp),$inp
+ lea 16($out),$out
+ sub \$16,%r10
+ test \$-16,%r10
+ jnz .Lcbc_slow_enc_loop
+ test \$15,%r10
+ jnz .Lcbc_slow_enc_tail
+ mov $_ivp,%rbp # restore ivp
+ mov $s0,0(%rbp) # save ivec
+ mov $s1,4(%rbp)
+ mov $s2,8(%rbp)
+ mov $s3,12(%rbp)
+
+ jmp .Lcbc_exit
+
+.align 4
+.Lcbc_slow_enc_tail:
+ mov %rax,%r11
+ mov %rcx,%r12
+ mov %r10,%rcx
+ mov $inp,%rsi
+ mov $out,%rdi
+ .long 0x9066A4F3 # rep movsb
+ mov \$16,%rcx # zero tail
+ sub %r10,%rcx
+ xor %rax,%rax
+ .long 0x9066AAF3 # rep stosb
+ mov $out,$inp # this is not a mistake!
+ mov \$16,%r10 # len=16
+ mov %r11,%rax
+ mov %r12,%rcx
+ jmp .Lcbc_slow_enc_loop # one more spin...
+#--------------------------- SLOW DECRYPT ---------------------------#
+.align 16
+.LSLOW_DECRYPT:
+ shr \$3,%rax
+ add %rax,$sbox # recall "magic" constants!
+
+ mov 0(%rbp),%r11 # copy iv to stack
+ mov 8(%rbp),%r12
+ mov %r11,0+$ivec
+ mov %r12,8+$ivec
+
+.align 4
+.Lcbc_slow_dec_loop:
+ mov 0($inp),$s0 # load input
+ mov 4($inp),$s1
+ mov 8($inp),$s2
+ mov 12($inp),$s3
+ mov $keyp,$key # restore key
+ mov $inp,$_inp # save inp
+ mov $out,$_out # save out
+ mov %r10,$_len # save len
+
+ call _x86_64_AES_decrypt_compact
+
+ mov $_inp,$inp # restore inp
+ mov $_out,$out # restore out
+ mov $_len,%r10
+ xor 0+$ivec,$s0
+ xor 4+$ivec,$s1
+ xor 8+$ivec,$s2
+ xor 12+$ivec,$s3
+
+ mov 0($inp),%r11 # load input
+ mov 8($inp),%r12
+ sub \$16,%r10
+ jc .Lcbc_slow_dec_partial
+ jz .Lcbc_slow_dec_done
+
+ mov %r11,0+$ivec # copy input to iv
+ mov %r12,8+$ivec
+
+ mov $s0,0($out) # save output [can zap input]
+ mov $s1,4($out)
+ mov $s2,8($out)
+ mov $s3,12($out)
+
+ lea 16($inp),$inp
+ lea 16($out),$out
+ jmp .Lcbc_slow_dec_loop
+.Lcbc_slow_dec_done:
+ mov $_ivp,%rdi
+ mov %r11,0(%rdi) # copy iv back to user
+ mov %r12,8(%rdi)
+
+ mov $s0,0($out) # save output [can zap input]
+ mov $s1,4($out)
+ mov $s2,8($out)
+ mov $s3,12($out)
+
+ jmp .Lcbc_exit
+
+.align 4
+.Lcbc_slow_dec_partial:
+ mov $_ivp,%rdi
+ mov %r11,0(%rdi) # copy iv back to user
+ mov %r12,8(%rdi)
+
+ mov $s0,0+$ivec # save output to stack
+ mov $s1,4+$ivec
+ mov $s2,8+$ivec
+ mov $s3,12+$ivec
+
+ mov $out,%rdi
+ lea $ivec,%rsi
+ lea 16(%r10),%rcx
+ .long 0x9066A4F3 # rep movsb
+ jmp .Lcbc_exit
+
+.align 16
+.Lcbc_exit:
+ mov $_rsp,%rsi
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lcbc_popfq:
+ popfq
+.Lcbc_epilogue:
+ ret
+.size AES_cbc_encrypt,.-AES_cbc_encrypt
+___
+}
+
+$code.=<<___;
+.align 64
+.LAES_Te:
+___
+ &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
+ &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
+ &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
+ &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
+ &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
+ &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
+ &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
+ &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
+ &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
+ &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
+ &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
+ &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
+ &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
+ &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
+ &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
+ &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
+ &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
+ &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
+ &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
+ &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
+ &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
+ &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
+ &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
+ &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
+ &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
+ &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
+ &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
+ &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
+ &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
+ &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
+ &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
+ &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
+ &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
+ &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
+ &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
+ &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
+ &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
+ &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
+ &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
+ &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
+ &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
+ &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
+ &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
+ &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
+ &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
+ &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
+ &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
+ &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
+ &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
+ &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
+ &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
+ &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
+ &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
+ &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
+ &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
+ &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
+ &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
+ &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
+ &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
+ &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
+ &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
+ &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
+ &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
+ &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
+
+#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
+
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
+
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
+
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
+#rcon:
+$code.=<<___;
+ .long 0x00000001, 0x00000002, 0x00000004, 0x00000008
+ .long 0x00000010, 0x00000020, 0x00000040, 0x00000080
+ .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
+ .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
+___
+$code.=<<___;
+.align 64
+.LAES_Td:
+___
+ &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
+ &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
+ &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
+ &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
+ &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
+ &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
+ &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
+ &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
+ &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
+ &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
+ &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
+ &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
+ &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
+ &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
+ &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
+ &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
+ &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
+ &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
+ &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
+ &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
+ &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
+ &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
+ &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
+ &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
+ &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
+ &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
+ &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
+ &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
+ &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
+ &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
+ &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
+ &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
+ &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
+ &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
+ &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
+ &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
+ &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
+ &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
+ &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
+ &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
+ &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
+ &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
+ &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
+ &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
+ &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
+ &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
+ &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
+ &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
+ &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
+ &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
+ &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
+ &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
+ &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
+ &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
+ &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
+ &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
+ &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
+ &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
+ &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
+ &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
+ &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
+ &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
+ &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
+ &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
+
+#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
+$code.=<<___;
+ .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+ .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+___
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
+$code.=<<___;
+ .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+ .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+___
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
+$code.=<<___;
+ .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+ .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+___
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
+$code.=<<___;
+ .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+ .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type block_se_handler,\@abi-omnipotent
+.align 16
+block_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_block_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_block_prologue
+
+ mov 24(%rax),%rax # pull saved real stack pointer
+ lea 48(%rax),%rax # adjust...
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_block_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ jmp .Lcommon_seh_exit
+.size block_se_handler,.-block_se_handler
+
+.type key_se_handler,\@abi-omnipotent
+.align 16
+key_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_key_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_key_prologue
+
+ lea 56(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_key_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ jmp .Lcommon_seh_exit
+.size key_se_handler,.-key_se_handler
+
+.type cbc_se_handler,\@abi-omnipotent
+.align 16
+cbc_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lcbc_prologue(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lcbc_prologue
+ jb .Lin_cbc_prologue
+
+ lea .Lcbc_fast_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lcbc_fast_body
+ jb .Lin_cbc_frame_setup
+
+ lea .Lcbc_slow_prologue(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue
+ jb .Lin_cbc_body
+
+ lea .Lcbc_slow_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lcbc_slow_body
+ jb .Lin_cbc_frame_setup
+
+.Lin_cbc_body:
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lcbc_epilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue
+ jae .Lin_cbc_prologue
+
+ lea 8(%rax),%rax
+
+ lea .Lcbc_popfq(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lcbc_popfq
+ jae .Lin_cbc_prologue
+
+ mov `16-8`(%rax),%rax # biased $_rsp
+ lea 56(%rax),%rax
+
+.Lin_cbc_frame_setup:
+ mov -16(%rax),%rbx
+ mov -24(%rax),%rbp
+ mov -32(%rax),%r12
+ mov -40(%rax),%r13
+ mov -48(%rax),%r14
+ mov -56(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_cbc_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+.Lcommon_seh_exit:
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size cbc_se_handler,.-cbc_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_AES_encrypt
+ .rva .LSEH_end_AES_encrypt
+ .rva .LSEH_info_AES_encrypt
+
+ .rva .LSEH_begin_AES_decrypt
+ .rva .LSEH_end_AES_decrypt
+ .rva .LSEH_info_AES_decrypt
+
+ .rva .LSEH_begin_AES_set_encrypt_key
+ .rva .LSEH_end_AES_set_encrypt_key
+ .rva .LSEH_info_AES_set_encrypt_key
+
+ .rva .LSEH_begin_AES_set_decrypt_key
+ .rva .LSEH_end_AES_set_decrypt_key
+ .rva .LSEH_info_AES_set_decrypt_key
+
+ .rva .LSEH_begin_AES_cbc_encrypt
+ .rva .LSEH_end_AES_cbc_encrypt
+ .rva .LSEH_info_AES_cbc_encrypt
+
+.section .xdata
+.align 8
+.LSEH_info_AES_encrypt:
+ .byte 9,0,0,0
+ .rva block_se_handler
+ .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
+.LSEH_info_AES_decrypt:
+ .byte 9,0,0,0
+ .rva block_se_handler
+ .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
+.LSEH_info_AES_set_encrypt_key:
+ .byte 9,0,0,0
+ .rva key_se_handler
+ .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
+.LSEH_info_AES_set_decrypt_key:
+ .byte 9,0,0,0
+ .rva key_se_handler
+ .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
+.LSEH_info_AES_cbc_encrypt:
+ .byte 9,0,0,0
+ .rva cbc_se_handler
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aesfx-sparcv9.pl b/openssl-1.1.0h/crypto/aes/asm/aesfx-sparcv9.pl
new file mode 100644
index 0000000..04b3cf7
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aesfx-sparcv9.pl
@@ -0,0 +1,1270 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# March 2016
+#
+# Initial support for Fujitsu SPARC64 X/X+ comprises minimally
+# required key setup and single-block procedures.
+#
+# April 2016
+#
+# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
+# that parallelizeable nature of CBC decrypt and CTR is not utilized
+# yet. CBC encrypt on the other hand is as good as it can possibly
+# get processing one byte in 4.1 cycles with 128-bit key on SPARC64 X.
+# This is ~6x faster than pure software implementation...
+#
+# July 2016
+#
+# Switch from faligndata to fshiftorx, which allows to omit alignaddr
+# instructions and improve single-block and short-input performance
+# with misaligned data.
+
+$output = pop;
+open STDOUT,">$output";
+
+{
+my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));
+
+$code.=<<___;
+#include "sparc_arch.h"
+
+#define LOCALS (STACK_BIAS+STACK_FRAME)
+
+.text
+
+.globl aes_fx_encrypt
+.align 32
+aes_fx_encrypt:
+ and $inp, 7, $tmp ! is input aligned?
+ andn $inp, 7, $inp
+ ldd [$key + 0], %f6 ! round[0]
+ ldd [$key + 8], %f8
+ mov %o7, %g1
+ ld [$key + 240], $rounds
+
+1: call .+8
+ add %o7, .Linp_align-1b, %o7
+
+ sll $tmp, 3, $tmp
+ ldd [$inp + 0], %f0 ! load input
+ brz,pt $tmp, .Lenc_inp_aligned
+ ldd [$inp + 8], %f2
+
+ ldd [%o7 + $tmp], %f14 ! shift left params
+ ldd [$inp + 16], %f4
+ fshiftorx %f0, %f2, %f14, %f0
+ fshiftorx %f2, %f4, %f14, %f2
+
+.Lenc_inp_aligned:
+ ldd [$key + 16], %f10 ! round[1]
+ ldd [$key + 24], %f12
+
+ fxor %f0, %f6, %f0 ! ^=round[0]
+ fxor %f2, %f8, %f2
+ ldd [$key + 32], %f6 ! round[2]
+ ldd [$key + 40], %f8
+ add $key, 32, $key
+ sub $rounds, 4, $rounds
+
+.Loop_enc:
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$key + 16], %f10
+ ldd [$key + 24], %f12
+ add $key, 32, $key
+
+ fmovd %f0, %f4
+ faesencx %f2, %f6, %f0
+ faesencx %f4, %f8, %f2
+ ldd [$key + 0], %f6
+ ldd [$key + 8], %f8
+
+ brnz,a $rounds, .Loop_enc
+ sub $rounds, 2, $rounds
+
+ andcc $out, 7, $tmp ! is output aligned?
+ andn $out, 7, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+ add %o7, 64, %o7
+ sll $tmp, 3, $tmp
+
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [%o7 + $tmp], %f14 ! shift right params
+
+ fmovd %f0, %f4
+ faesenclx %f2, %f6, %f0
+ faesenclx %f4, %f8, %f2
+
+ bnz,pn %icc, .Lenc_out_unaligned
+ mov %g1, %o7
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+.align 16
+.Lenc_out_unaligned:
+ add $out, 16, $inp
+ orn %g0, $mask, $tmp
+ fshiftorx %f0, %f0, %f14, %f4
+ fshiftorx %f0, %f2, %f14, %f6
+ fshiftorx %f2, %f2, %f14, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ stda %f8, [$inp + $tmp]0xc0 ! partial store
+ retl
+ nop
+.type aes_fx_encrypt,#function
+.size aes_fx_encrypt,.-aes_fx_encrypt
+
+.globl aes_fx_decrypt
+.align 32
+aes_fx_decrypt:
+ and $inp, 7, $tmp ! is input aligned?
+ andn $inp, 7, $inp
+ ldd [$key + 0], %f6 ! round[0]
+ ldd [$key + 8], %f8
+ mov %o7, %g1
+ ld [$key + 240], $rounds
+
+1: call .+8
+ add %o7, .Linp_align-1b, %o7
+
+ sll $tmp, 3, $tmp
+ ldd [$inp + 0], %f0 ! load input
+ brz,pt $tmp, .Ldec_inp_aligned
+ ldd [$inp + 8], %f2
+
+ ldd [%o7 + $tmp], %f14 ! shift left params
+ ldd [$inp + 16], %f4
+ fshiftorx %f0, %f2, %f14, %f0
+ fshiftorx %f2, %f4, %f14, %f2
+
+.Ldec_inp_aligned:
+ ldd [$key + 16], %f10 ! round[1]
+ ldd [$key + 24], %f12
+
+ fxor %f0, %f6, %f0 ! ^=round[0]
+ fxor %f2, %f8, %f2
+ ldd [$key + 32], %f6 ! round[2]
+ ldd [$key + 40], %f8
+ add $key, 32, $key
+ sub $rounds, 4, $rounds
+
+.Loop_dec:
+ fmovd %f0, %f4
+ faesdecx %f2, %f10, %f0
+ faesdecx %f4, %f12, %f2
+ ldd [$key + 16], %f10
+ ldd [$key + 24], %f12
+ add $key, 32, $key
+
+ fmovd %f0, %f4
+ faesdecx %f2, %f6, %f0
+ faesdecx %f4, %f8, %f2
+ ldd [$key + 0], %f6
+ ldd [$key + 8], %f8
+
+ brnz,a $rounds, .Loop_dec
+ sub $rounds, 2, $rounds
+
+ andcc $out, 7, $tmp ! is output aligned?
+ andn $out, 7, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+ add %o7, 64, %o7
+ sll $tmp, 3, $tmp
+
+ fmovd %f0, %f4
+ faesdecx %f2, %f10, %f0
+ faesdecx %f4, %f12, %f2
+ ldd [%o7 + $tmp], %f14 ! shift right params
+
+ fmovd %f0, %f4
+ faesdeclx %f2, %f6, %f0
+ faesdeclx %f4, %f8, %f2
+
+ bnz,pn %icc, .Ldec_out_unaligned
+ mov %g1, %o7
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+.align 16
+.Ldec_out_unaligned:
+ add $out, 16, $inp
+ orn %g0, $mask, $tmp
+ fshiftorx %f0, %f0, %f14, %f4
+ fshiftorx %f0, %f2, %f14, %f6
+ fshiftorx %f2, %f2, %f14, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ stda %f8, [$inp + $tmp]0xc0 ! partial store
+ retl
+ nop
+.type aes_fx_decrypt,#function
+.size aes_fx_decrypt,.-aes_fx_decrypt
+___
+}
+{
+my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
+$code.=<<___;
+.globl aes_fx_set_decrypt_key
+.align 32
+aes_fx_set_decrypt_key:
+ b .Lset_encrypt_key
+ mov -1, $inc
+ retl
+ nop
+.type aes_fx_set_decrypt_key,#function
+.size aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key
+
+.globl aes_fx_set_encrypt_key
+.align 32
+aes_fx_set_encrypt_key:
+ mov 1, $inc
+ nop
+.Lset_encrypt_key:
+ and $inp, 7, $tmp
+ andn $inp, 7, $inp
+ sll $tmp, 3, $tmp
+ mov %o7, %g1
+
+1: call .+8
+ add %o7, .Linp_align-1b, %o7
+
+ ldd [%o7 + $tmp], %f10 ! shift left params
+ mov %g1, %o7
+
+ cmp $bits, 192
+ ldd [$inp + 0], %f0
+ bl,pt %icc, .L128
+ ldd [$inp + 8], %f2
+
+ be,pt %icc, .L192
+ ldd [$inp + 16], %f4
+ brz,pt $tmp, .L256aligned
+ ldd [$inp + 24], %f6
+
+ ldd [$inp + 32], %f8
+ fshiftorx %f0, %f2, %f10, %f0
+ fshiftorx %f2, %f4, %f10, %f2
+ fshiftorx %f4, %f6, %f10, %f4
+ fshiftorx %f6, %f8, %f10, %f6
+
+.L256aligned:
+ mov 14, $bits
+ and $inc, `14*16`, $tmp
+ st $bits, [$out + 240] ! store rounds
+ add $out, $tmp, $out ! start or end of key schedule
+ sllx $inc, 4, $inc ! 16 or -16
+___
+for ($i=0; $i<6; $i++) {
+ $code.=<<___;
+ std %f0, [$out + 0]
+ faeskeyx %f6, `0x10+$i`, %f0
+ std %f2, [$out + 8]
+ add $out, $inc, $out
+ faeskeyx %f0, 0x00, %f2
+ std %f4, [$out + 0]
+ faeskeyx %f2, 0x01, %f4
+ std %f6, [$out + 8]
+ add $out, $inc, $out
+ faeskeyx %f4, 0x00, %f6
+___
+}
+$code.=<<___;
+ std %f0, [$out + 0]
+ faeskeyx %f6, `0x10+$i`, %f0
+ std %f2, [$out + 8]
+ add $out, $inc, $out
+ faeskeyx %f0, 0x00, %f2
+ std %f4,[$out + 0]
+ std %f6,[$out + 8]
+ add $out, $inc, $out
+ std %f0,[$out + 0]
+ std %f2,[$out + 8]
+ retl
+ xor %o0, %o0, %o0 ! return 0
+
+.align 16
+.L192:
+ brz,pt $tmp, .L192aligned
+ nop
+
+ ldd [$inp + 24], %f6
+ fshiftorx %f0, %f2, %f10, %f0
+ fshiftorx %f2, %f4, %f10, %f2
+ fshiftorx %f4, %f6, %f10, %f4
+
+.L192aligned:
+ mov 12, $bits
+ and $inc, `12*16`, $tmp
+ st $bits, [$out + 240] ! store rounds
+ add $out, $tmp, $out ! start or end of key schedule
+ sllx $inc, 4, $inc ! 16 or -16
+___
+for ($i=0; $i<8; $i+=2) {
+ $code.=<<___;
+ std %f0, [$out + 0]
+ faeskeyx %f4, `0x10+$i`, %f0
+ std %f2, [$out + 8]
+ add $out, $inc, $out
+ faeskeyx %f0, 0x00, %f2
+ std %f4, [$out + 0]
+ faeskeyx %f2, 0x00, %f4
+ std %f0, [$out + 8]
+ add $out, $inc, $out
+ faeskeyx %f4, `0x10+$i+1`, %f0
+ std %f2, [$out + 0]
+ faeskeyx %f0, 0x00, %f2
+ std %f4, [$out + 8]
+ add $out, $inc, $out
+___
+$code.=<<___ if ($i<6);
+ faeskeyx %f2, 0x00, %f4
+___
+}
+$code.=<<___;
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ retl
+ xor %o0, %o0, %o0 ! return 0
+
+.align 16
+.L128:
+ brz,pt $tmp, .L128aligned
+ nop
+
+ ldd [$inp + 16], %f4
+ fshiftorx %f0, %f2, %f10, %f0
+ fshiftorx %f2, %f4, %f10, %f2
+
+.L128aligned:
+ mov 10, $bits
+ and $inc, `10*16`, $tmp
+ st $bits, [$out + 240] ! store rounds
+ add $out, $tmp, $out ! start or end of key schedule
+ sllx $inc, 4, $inc ! 16 or -16
+___
+for ($i=0; $i<10; $i++) {
+ $code.=<<___;
+ std %f0, [$out + 0]
+ faeskeyx %f2, `0x10+$i`, %f0
+ std %f2, [$out + 8]
+ add $out, $inc, $out
+ faeskeyx %f0, 0x00, %f2
+___
+}
+$code.=<<___;
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ retl
+ xor %o0, %o0, %o0 ! return 0
+.type aes_fx_set_encrypt_key,#function
+.size aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
+___
+}
+{
+my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
+my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
+my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
+ = map("%f$_",grep { !($_ & 1) } (16 .. 62));
+my ($ileft,$iright) = ($ialign,$oalign);
+
+$code.=<<___;
+.globl aes_fx_cbc_encrypt
+.align 32
+aes_fx_cbc_encrypt:
+ save %sp, -STACK_FRAME-16, %sp
+ srln $len, 4, $len
+ and $inp, 7, $ialign
+ andn $inp, 7, $inp
+ brz,pn $len, .Lcbc_no_data
+ sll $ialign, 3, $ileft
+
+1: call .+8
+ add %o7, .Linp_align-1b, %o7
+
+ ld [$key + 240], $rounds
+ and $out, 7, $oalign
+ ld [$ivp + 0], %f0 ! load ivec
+ andn $out, 7, $out
+ ld [$ivp + 4], %f1
+ sll $oalign, 3, $mask
+ ld [$ivp + 8], %f2
+ ld [$ivp + 12], %f3
+
+ sll $rounds, 4, $rounds
+ add $rounds, $key, $end
+ ldd [$key + 0], $r0hi ! round[0]
+ ldd [$key + 8], $r0lo
+
+ add $inp, 16, $inp
+ sub $len, 1, $len
+ ldd [$end + 0], $rlhi ! round[last]
+ ldd [$end + 8], $rllo
+
+ mov 16, $inc
+ movrz $len, 0, $inc
+ ldd [$key + 16], %f10 ! round[1]
+ ldd [$key + 24], %f12
+
+ ldd [%o7 + $ileft], $fshift ! shift left params
+ add %o7, 64, %o7
+ ldd [$inp - 16], $in0 ! load input
+ ldd [$inp - 8], $in1
+ ldda [$inp]0x82, $intail ! non-faulting load
+ brz $dir, .Lcbc_decrypt
+ add $inp, $inc, $inp ! inp+=16
+
+ fxor $r0hi, %f0, %f0 ! ivec^=round[0]
+ fxor $r0lo, %f2, %f2
+ fshiftorx $in0, $in1, $fshift, $in0
+ fshiftorx $in1, $intail, $fshift, $in1
+ nop
+
+.Loop_cbc_enc:
+ fxor $in0, %f0, %f0 ! inp^ivec^round[0]
+ fxor $in1, %f2, %f2
+ ldd [$key + 32], %f6 ! round[2]
+ ldd [$key + 40], %f8
+ add $key, 32, $end
+ sub $rounds, 16*6, $inner
+
+.Lcbc_enc:
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$end + 16], %f10
+ ldd [$end + 24], %f12
+ add $end, 32, $end
+
+ fmovd %f0, %f4
+ faesencx %f2, %f6, %f0
+ faesencx %f4, %f8, %f2
+ ldd [$end + 0], %f6
+ ldd [$end + 8], %f8
+
+ brnz,a $inner, .Lcbc_enc
+ sub $inner, 16*2, $inner
+
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$end + 16], %f10 ! round[last-1]
+ ldd [$end + 24], %f12
+
+ movrz $len, 0, $inc
+ fmovd $intail, $in0
+ ldd [$inp - 8], $in1 ! load next input block
+ ldda [$inp]0x82, $intail ! non-faulting load
+ add $inp, $inc, $inp ! inp+=16
+
+ fmovd %f0, %f4
+ faesencx %f2, %f6, %f0
+ faesencx %f4, %f8, %f2
+
+ fshiftorx $in0, $in1, $fshift, $in0
+ fshiftorx $in1, $intail, $fshift, $in1
+
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$key + 16], %f10 ! round[1]
+ ldd [$key + 24], %f12
+
+ fxor $r0hi, $in0, $in0 ! inp^=round[0]
+ fxor $r0lo, $in1, $in1
+
+ fmovd %f0, %f4
+ faesenclx %f2, $rlhi, %f0
+ faesenclx %f4, $rllo, %f2
+
+ brnz,pn $oalign, .Lcbc_enc_unaligned_out
+ nop
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ add $out, 16, $out
+
+ brnz,a $len, .Loop_cbc_enc
+ sub $len, 1, $len
+
+ st %f0, [$ivp + 0] ! output ivec
+ st %f1, [$ivp + 4]
+ st %f2, [$ivp + 8]
+ st %f3, [$ivp + 12]
+
+.Lcbc_no_data:
+ ret
+ restore
+
+.align 32
+.Lcbc_enc_unaligned_out:
+ ldd [%o7 + $mask], $fshift ! shift right params
+ mov 0xff, $mask
+ srl $mask, $oalign, $mask
+ sub %g0, $ileft, $iright
+
+ fshiftorx %f0, %f0, $fshift, %f6
+ fshiftorx %f0, %f2, $fshift, %f8
+
+ stda %f6, [$out + $mask]0xc0 ! partial store
+ orn %g0, $mask, $mask
+ std %f8, [$out + 8]
+ add $out, 16, $out
+ brz $len, .Lcbc_enc_unaligned_out_done
+ sub $len, 1, $len
+ b .Loop_cbc_enc_unaligned_out
+ nop
+
+.align 32
+.Loop_cbc_enc_unaligned_out:
+ fmovd %f2, $outhead
+ fxor $in0, %f0, %f0 ! inp^ivec^round[0]
+ fxor $in1, %f2, %f2
+ ldd [$key + 32], %f6 ! round[2]
+ ldd [$key + 40], %f8
+
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$key + 48], %f10 ! round[3]
+ ldd [$key + 56], %f12
+
+ ldx [$inp - 16], %o0
+ ldx [$inp - 8], %o1
+ brz $ileft, .Lcbc_enc_aligned_inp
+ movrz $len, 0, $inc
+
+ ldx [$inp], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+
+.Lcbc_enc_aligned_inp:
+ fmovd %f0, %f4
+ faesencx %f2, %f6, %f0
+ faesencx %f4, %f8, %f2
+ ldd [$key + 64], %f6 ! round[4]
+ ldd [$key + 72], %f8
+ add $key, 64, $end
+ sub $rounds, 16*8, $inner
+
+ stx %o0, [%sp + LOCALS + 0]
+ stx %o1, [%sp + LOCALS + 8]
+ add $inp, $inc, $inp ! inp+=16
+ nop
+
+.Lcbc_enc_unaligned:
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$end + 16], %f10
+ ldd [$end + 24], %f12
+ add $end, 32, $end
+
+ fmovd %f0, %f4
+ faesencx %f2, %f6, %f0
+ faesencx %f4, %f8, %f2
+ ldd [$end + 0], %f6
+ ldd [$end + 8], %f8
+
+ brnz,a $inner, .Lcbc_enc_unaligned
+ sub $inner, 16*2, $inner
+
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$end + 16], %f10 ! round[last-1]
+ ldd [$end + 24], %f12
+
+ fmovd %f0, %f4
+ faesencx %f2, %f6, %f0
+ faesencx %f4, %f8, %f2
+
+ ldd [%sp + LOCALS + 0], $in0
+ ldd [%sp + LOCALS + 8], $in1
+
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$key + 16], %f10 ! round[1]
+ ldd [$key + 24], %f12
+
+ fxor $r0hi, $in0, $in0 ! inp^=round[0]
+ fxor $r0lo, $in1, $in1
+
+ fmovd %f0, %f4
+ faesenclx %f2, $rlhi, %f0
+ faesenclx %f4, $rllo, %f2
+
+ fshiftorx $outhead, %f0, $fshift, %f6
+ fshiftorx %f0, %f2, $fshift, %f8
+ std %f6, [$out + 0]
+ std %f8, [$out + 8]
+ add $out, 16, $out
+
+ brnz,a $len, .Loop_cbc_enc_unaligned_out
+ sub $len, 1, $len
+
+.Lcbc_enc_unaligned_out_done:
+ fshiftorx %f2, %f2, $fshift, %f8
+ stda %f8, [$out + $mask]0xc0 ! partial store
+
+ st %f0, [$ivp + 0] ! output ivec
+ st %f1, [$ivp + 4]
+ st %f2, [$ivp + 8]
+ st %f3, [$ivp + 12]
+
+ ret
+ restore
+
+.align 32
+.Lcbc_decrypt:
+ fshiftorx $in0, $in1, $fshift, $in0
+ fshiftorx $in1, $intail, $fshift, $in1
+ fmovd %f0, $iv0
+ fmovd %f2, $iv1
+
+.Loop_cbc_dec:
+ fxor $in0, $r0hi, %f0 ! inp^round[0]
+ fxor $in1, $r0lo, %f2
+ ldd [$key + 32], %f6 ! round[2]
+ ldd [$key + 40], %f8
+ add $key, 32, $end
+ sub $rounds, 16*6, $inner
+
+.Lcbc_dec:
+ fmovd %f0, %f4
+ faesdecx %f2, %f10, %f0
+ faesdecx %f4, %f12, %f2
+ ldd [$end + 16], %f10
+ ldd [$end + 24], %f12
+ add $end, 32, $end
+
+ fmovd %f0, %f4
+ faesdecx %f2, %f6, %f0
+ faesdecx %f4, %f8, %f2
+ ldd [$end + 0], %f6
+ ldd [$end + 8], %f8
+
+ brnz,a $inner, .Lcbc_dec
+ sub $inner, 16*2, $inner
+
+ fmovd %f0, %f4
+ faesdecx %f2, %f10, %f0
+ faesdecx %f4, %f12, %f2
+ ldd [$end + 16], %f10 ! round[last-1]
+ ldd [$end + 24], %f12
+
+ fmovd %f0, %f4
+ faesdecx %f2, %f6, %f0
+ faesdecx %f4, %f8, %f2
+ fxor $iv0, $rlhi, %f6 ! ivec^round[last]
+ fxor $iv1, $rllo, %f8
+ fmovd $in0, $iv0
+ fmovd $in1, $iv1
+
+ movrz $len, 0, $inc
+ fmovd $intail, $in0
+ ldd [$inp - 8], $in1 ! load next input block
+ ldda [$inp]0x82, $intail ! non-faulting load
+ add $inp, $inc, $inp ! inp+=16
+
+ fmovd %f0, %f4
+ faesdecx %f2, %f10, %f0
+ faesdecx %f4, %f12, %f2
+ ldd [$key + 16], %f10 ! round[1]
+ ldd [$key + 24], %f12
+
+ fshiftorx $in0, $in1, $fshift, $in0
+ fshiftorx $in1, $intail, $fshift, $in1
+
+ fmovd %f0, %f4
+ faesdeclx %f2, %f6, %f0
+ faesdeclx %f4, %f8, %f2
+
+ brnz,pn $oalign, .Lcbc_dec_unaligned_out
+ nop
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ add $out, 16, $out
+
+ brnz,a $len, .Loop_cbc_dec
+ sub $len, 1, $len
+
+ st $iv0, [$ivp + 0] ! output ivec
+ st $iv0#lo, [$ivp + 4]
+ st $iv1, [$ivp + 8]
+ st $iv1#lo, [$ivp + 12]
+
+ ret
+ restore
+
+.align 32
+.Lcbc_dec_unaligned_out:
+ ldd [%o7 + $mask], $fshift ! shift right params
+ mov 0xff, $mask
+ srl $mask, $oalign, $mask
+ sub %g0, $ileft, $iright
+
+ fshiftorx %f0, %f0, $fshift, %f6
+ fshiftorx %f0, %f2, $fshift, %f8
+
+ stda %f6, [$out + $mask]0xc0 ! partial store
+ orn %g0, $mask, $mask
+ std %f8, [$out + 8]
+ add $out, 16, $out
+ brz $len, .Lcbc_dec_unaligned_out_done
+ sub $len, 1, $len
+ b .Loop_cbc_dec_unaligned_out
+ nop
+
+.align 32
+.Loop_cbc_dec_unaligned_out:
+ fmovd %f2, $outhead
+ fxor $in0, $r0hi, %f0 ! inp^round[0]
+ fxor $in1, $r0lo, %f2
+ ldd [$key + 32], %f6 ! round[2]
+ ldd [$key + 40], %f8
+
+ fmovd %f0, %f4
+ faesdecx %f2, %f10, %f0
+ faesdecx %f4, %f12, %f2
+ ldd [$key + 48], %f10 ! round[3]
+ ldd [$key + 56], %f12
+
+ ldx [$inp - 16], %o0
+ ldx [$inp - 8], %o1
+ brz $ileft, .Lcbc_dec_aligned_inp
+ movrz $len, 0, $inc
+
+ ldx [$inp], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+
+.Lcbc_dec_aligned_inp:
+ fmovd %f0, %f4
+ faesdecx %f2, %f6, %f0
+ faesdecx %f4, %f8, %f2
+ ldd [$key + 64], %f6 ! round[4]
+ ldd [$key + 72], %f8
+ add $key, 64, $end
+ sub $rounds, 16*8, $inner
+
+ stx %o0, [%sp + LOCALS + 0]
+ stx %o1, [%sp + LOCALS + 8]
+ add $inp, $inc, $inp ! inp+=16
+ nop
+
+.Lcbc_dec_unaligned:
+ fmovd %f0, %f4
+ faesdecx %f2, %f10, %f0
+ faesdecx %f4, %f12, %f2
+ ldd [$end + 16], %f10
+ ldd [$end + 24], %f12
+ add $end, 32, $end
+
+ fmovd %f0, %f4
+ faesdecx %f2, %f6, %f0
+ faesdecx %f4, %f8, %f2
+ ldd [$end + 0], %f6
+ ldd [$end + 8], %f8
+
+ brnz,a $inner, .Lcbc_dec_unaligned
+ sub $inner, 16*2, $inner
+
+ fmovd %f0, %f4
+ faesdecx %f2, %f10, %f0
+ faesdecx %f4, %f12, %f2
+ ldd [$end + 16], %f10 ! round[last-1]
+ ldd [$end + 24], %f12
+
+ fmovd %f0, %f4
+ faesdecx %f2, %f6, %f0
+ faesdecx %f4, %f8, %f2
+
+ fxor $iv0, $rlhi, %f6 ! ivec^round[last]
+ fxor $iv1, $rllo, %f8
+ fmovd $in0, $iv0
+ fmovd $in1, $iv1
+ ldd [%sp + LOCALS + 0], $in0
+ ldd [%sp + LOCALS + 8], $in1
+
+ fmovd %f0, %f4
+ faesdecx %f2, %f10, %f0
+ faesdecx %f4, %f12, %f2
+ ldd [$key + 16], %f10 ! round[1]
+ ldd [$key + 24], %f12
+
+ fmovd %f0, %f4
+ faesdeclx %f2, %f6, %f0
+ faesdeclx %f4, %f8, %f2
+
+ fshiftorx $outhead, %f0, $fshift, %f6
+ fshiftorx %f0, %f2, $fshift, %f8
+ std %f6, [$out + 0]
+ std %f8, [$out + 8]
+ add $out, 16, $out
+
+ brnz,a $len, .Loop_cbc_dec_unaligned_out
+ sub $len, 1, $len
+
+.Lcbc_dec_unaligned_out_done:
+ fshiftorx %f2, %f2, $fshift, %f8
+ stda %f8, [$out + $mask]0xc0 ! partial store
+
+ st $iv0, [$ivp + 0] ! output ivec
+ st $iv0#lo, [$ivp + 4]
+ st $iv1, [$ivp + 8]
+ st $iv1#lo, [$ivp + 12]
+
+ ret
+ restore
+.type aes_fx_cbc_encrypt,#function
+.size aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
+___
+}
+{
+my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
+my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
+my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
+ = map("%f$_",grep { !($_ & 1) } (16 .. 62));
+my ($ileft,$iright) = ($ialign, $oalign);
+my $one = "%f14";
+
+$code.=<<___;
+.globl aes_fx_ctr32_encrypt_blocks
+.align 32
+aes_fx_ctr32_encrypt_blocks:
+ save %sp, -STACK_FRAME-16, %sp
+ srln $len, 0, $len
+ and $inp, 7, $ialign
+ andn $inp, 7, $inp
+ brz,pn $len, .Lctr32_no_data
+ sll $ialign, 3, $ileft
+
+.Lpic: call .+8
+ add %o7, .Linp_align - .Lpic, %o7
+
+ ld [$key + 240], $rounds
+ and $out, 7, $oalign
+ ld [$ivp + 0], $ctr0 ! load counter
+ andn $out, 7, $out
+ ld [$ivp + 4], $ctr0#lo
+ sll $oalign, 3, $mask
+ ld [$ivp + 8], $ctr1
+ ld [$ivp + 12], $ctr1#lo
+ ldd [%o7 + 128], $one
+
+ sll $rounds, 4, $rounds
+ add $rounds, $key, $end
+ ldd [$key + 0], $r0hi ! round[0]
+ ldd [$key + 8], $r0lo
+
+ add $inp, 16, $inp
+ sub $len, 1, $len
+ ldd [$key + 16], %f10 ! round[1]
+ ldd [$key + 24], %f12
+
+ mov 16, $inc
+ movrz $len, 0, $inc
+ ldd [$end + 0], $rlhi ! round[last]
+ ldd [$end + 8], $rllo
+
+ ldd [%o7 + $ileft], $fshift ! shiftleft params
+ add %o7, 64, %o7
+ ldd [$inp - 16], $in0 ! load input
+ ldd [$inp - 8], $in1
+ ldda [$inp]0x82, $intail ! non-faulting load
+ add $inp, $inc, $inp ! inp+=16
+
+ fshiftorx $in0, $in1, $fshift, $in0
+ fshiftorx $in1, $intail, $fshift, $in1
+
+.Loop_ctr32:
+ fxor $ctr0, $r0hi, %f0 ! counter^round[0]
+ fxor $ctr1, $r0lo, %f2
+ ldd [$key + 32], %f6 ! round[2]
+ ldd [$key + 40], %f8
+ add $key, 32, $end
+ sub $rounds, 16*6, $inner
+
+.Lctr32_enc:
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$end + 16], %f10
+ ldd [$end + 24], %f12
+ add $end, 32, $end
+
+ fmovd %f0, %f4
+ faesencx %f2, %f6, %f0
+ faesencx %f4, %f8, %f2
+ ldd [$end + 0], %f6
+ ldd [$end + 8], %f8
+
+ brnz,a $inner, .Lctr32_enc
+ sub $inner, 16*2, $inner
+
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$end + 16], %f10 ! round[last-1]
+ ldd [$end + 24], %f12
+
+ fmovd %f0, %f4
+ faesencx %f2, %f6, %f0
+ faesencx %f4, %f8, %f2
+ fxor $in0, $rlhi, %f6 ! inp^round[last]
+ fxor $in1, $rllo, %f8
+
+ movrz $len, 0, $inc
+ fmovd $intail, $in0
+ ldd [$inp - 8], $in1 ! load next input block
+ ldda [$inp]0x82, $intail ! non-faulting load
+ add $inp, $inc, $inp ! inp+=16
+
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$key + 16], %f10 ! round[1]
+ ldd [$key + 24], %f12
+
+ fshiftorx $in0, $in1, $fshift, $in0
+ fshiftorx $in1, $intail, $fshift, $in1
+ fpadd32 $ctr1, $one, $ctr1 ! increment counter
+
+ fmovd %f0, %f4
+ faesenclx %f2, %f6, %f0
+ faesenclx %f4, %f8, %f2
+
+ brnz,pn $oalign, .Lctr32_unaligned_out
+ nop
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ add $out, 16, $out
+
+ brnz,a $len, .Loop_ctr32
+ sub $len, 1, $len
+
+.Lctr32_no_data:
+ ret
+ restore
+
+.align 32
+.Lctr32_unaligned_out:
+ ldd [%o7 + $mask], $fshift ! shift right params
+ mov 0xff, $mask
+ srl $mask, $oalign, $mask
+ sub %g0, $ileft, $iright
+
+ fshiftorx %f0, %f0, $fshift, %f6
+ fshiftorx %f0, %f2, $fshift, %f8
+
+ stda %f6, [$out + $mask]0xc0 ! partial store
+ orn %g0, $mask, $mask
+ std %f8, [$out + 8]
+ add $out, 16, $out
+ brz $len, .Lctr32_unaligned_out_done
+ sub $len, 1, $len
+ b .Loop_ctr32_unaligned_out
+ nop
+
+.align 32
+.Loop_ctr32_unaligned_out:
+ fmovd %f2, $outhead
+ fxor $ctr0, $r0hi, %f0 ! counter^round[0]
+ fxor $ctr1, $r0lo, %f2
+ ldd [$key + 32], %f6 ! round[2]
+ ldd [$key + 40], %f8
+
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$key + 48], %f10 ! round[3]
+ ldd [$key + 56], %f12
+
+ ldx [$inp - 16], %o0
+ ldx [$inp - 8], %o1
+ brz $ileft, .Lctr32_aligned_inp
+ movrz $len, 0, $inc
+
+ ldx [$inp], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+
+.Lctr32_aligned_inp:
+ fmovd %f0, %f4
+ faesencx %f2, %f6, %f0
+ faesencx %f4, %f8, %f2
+ ldd [$key + 64], %f6 ! round[4]
+ ldd [$key + 72], %f8
+ add $key, 64, $end
+ sub $rounds, 16*8, $inner
+
+ stx %o0, [%sp + LOCALS + 0]
+ stx %o1, [%sp + LOCALS + 8]
+ add $inp, $inc, $inp ! inp+=16
+ nop
+
+.Lctr32_enc_unaligned:
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$end + 16], %f10
+ ldd [$end + 24], %f12
+ add $end, 32, $end
+
+ fmovd %f0, %f4
+ faesencx %f2, %f6, %f0
+ faesencx %f4, %f8, %f2
+ ldd [$end + 0], %f6
+ ldd [$end + 8], %f8
+
+ brnz,a $inner, .Lctr32_enc_unaligned
+ sub $inner, 16*2, $inner
+
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$end + 16], %f10 ! round[last-1]
+ ldd [$end + 24], %f12
+ fpadd32 $ctr1, $one, $ctr1 ! increment counter
+
+ fmovd %f0, %f4
+ faesencx %f2, %f6, %f0
+ faesencx %f4, %f8, %f2
+ fxor $in0, $rlhi, %f6 ! inp^round[last]
+ fxor $in1, $rllo, %f8
+ ldd [%sp + LOCALS + 0], $in0
+ ldd [%sp + LOCALS + 8], $in1
+
+ fmovd %f0, %f4
+ faesencx %f2, %f10, %f0
+ faesencx %f4, %f12, %f2
+ ldd [$key + 16], %f10 ! round[1]
+ ldd [$key + 24], %f12
+
+ fmovd %f0, %f4
+ faesenclx %f2, %f6, %f0
+ faesenclx %f4, %f8, %f2
+
+ fshiftorx $outhead, %f0, $fshift, %f6
+ fshiftorx %f0, %f2, $fshift, %f8
+ std %f6, [$out + 0]
+ std %f8, [$out + 8]
+ add $out, 16, $out
+
+ brnz,a $len, .Loop_ctr32_unaligned_out
+ sub $len, 1, $len
+
+.Lctr32_unaligned_out_done:
+ fshiftorx %f2, %f2, $fshift, %f8
+ stda %f8, [$out + $mask]0xc0 ! partial store
+
+ ret
+ restore
+.type aes_fx_ctr32_encrypt_blocks,#function
+.size aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks
+
+.align 32
+.Linp_align: ! fshiftorx parameters for left shift toward %rs1
+ .byte 0, 0, 64, 0, 0, 64, 0, -64
+ .byte 0, 0, 56, 8, 0, 56, 8, -56
+ .byte 0, 0, 48, 16, 0, 48, 16, -48
+ .byte 0, 0, 40, 24, 0, 40, 24, -40
+ .byte 0, 0, 32, 32, 0, 32, 32, -32
+ .byte 0, 0, 24, 40, 0, 24, 40, -24
+ .byte 0, 0, 16, 48, 0, 16, 48, -16
+ .byte 0, 0, 8, 56, 0, 8, 56, -8
+.Lout_align: ! fshiftorx parameters for right shift toward %rs2
+ .byte 0, 0, 0, 64, 0, 0, 64, 0
+ .byte 0, 0, 8, 56, 0, 8, 56, -8
+ .byte 0, 0, 16, 48, 0, 16, 48, -16
+ .byte 0, 0, 24, 40, 0, 24, 40, -24
+ .byte 0, 0, 32, 32, 0, 32, 32, -32
+ .byte 0, 0, 40, 24, 0, 40, 24, -40
+ .byte 0, 0, 48, 16, 0, 48, 16, -48
+ .byte 0, 0, 56, 8, 0, 56, 8, -56
+.Lone:
+ .word 0, 1
+.asciz "AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+}
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve for option to produce "universal" binary and let
+# programmer detect if current CPU is VIS capable at run-time.
+sub unvis {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %visopf = ( "faligndata" => 0x048,
+ "bshuffle" => 0x04c,
+ "fpadd32" => 0x052,
+ "fxor" => 0x06c,
+ "fsrc2" => 0x078 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = ( "alignaddr" => 0x018,
+ "bmask" => 0x019,
+ "alignaddrl" => 0x01a );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%([goli])([0-9])/);
+ $_=$bias{$1}+$2;
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub unfx {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %aesopf = ( "faesencx" => 0x90,
+ "faesdecx" => 0x91,
+ "faesenclx" => 0x92,
+ "faesdeclx" => 0x93,
+ "faeskeyx" => 0x94 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if (defined($opf=$aesopf{$mnemonic})) {
+ $rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2;
+ $rs2 = oct($rs2) if ($rs2 =~ /^0/);
+
+ foreach ($rs1,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub unfx3src {
+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
+my ($ref,$opf);
+my %aesopf = ( "fshiftorx" => 0x0b );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
+
+ if (defined($opf=$aesopf{$mnemonic})) {
+ foreach ($rs1,$rs2,$rs3,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;
+
+ s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
+ &unfx($1,$2,$3,$4)
+ /ge or
+ s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unfx3src($1,$2,$3,$4,$5)
+ /ge or
+ s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unvis($1,$2,$3,$4)
+ /ge or
+ s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unvis3($1,$2,$3,$4)
+ /ge;
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aesni-mb-x86_64.pl b/openssl-1.1.0h/crypto/aes/asm/aesni-mb-x86_64.pl
new file mode 100644
index 0000000..aa2735e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aesni-mb-x86_64.pl
@@ -0,0 +1,1402 @@
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# Multi-buffer AES-NI procedures process several independent buffers
+# in parallel by interleaving independent instructions.
+#
+# Cycles per byte for interleave factor 4:
+#
+# asymptotic measured
+# ---------------------------
+# Westmere 5.00/4=1.25 5.13/4=1.28
+# Atom 15.0/4=3.75 ?15.7/4=3.93
+# Sandy Bridge 5.06/4=1.27 5.18/4=1.29
+# Ivy Bridge 5.06/4=1.27 5.14/4=1.29
+# Haswell 4.44/4=1.11 4.44/4=1.11
+# Bulldozer 5.75/4=1.44 5.76/4=1.44
+#
+# Cycles per byte for interleave factor 8 (not implemented for
+# pre-AVX processors, where higher interleave factor incidentally
+# doesn't result in improvement):
+#
+# asymptotic measured
+# ---------------------------
+# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
+# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
+# Haswell 5.00/8=0.63 5.00/8=0.63
+# Bulldozer 5.75/8=0.72 5.77/8=0.72
+#
+# (*) Sandy/Ivy Bridge are known to handle high interleave factors
+# suboptimally;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=0;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+# void aesni_multi_cbc_encrypt (
+# struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
+# const AES_KEY *key,
+# int num); /* 1 or 2 */
+#
+$inp="%rdi"; # 1st arg
+$key="%rsi"; # 2nd arg
+$num="%edx";
+
+@inptr=map("%r$_",(8..11));
+@outptr=map("%r$_",(12..15));
+
+($rndkey0,$rndkey1)=("%xmm0","%xmm1");
+@out=map("%xmm$_",(2..5));
+@inp=map("%xmm$_",(6..9));
+($counters,$mask,$zero)=map("%xmm$_",(10..12));
+
+($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
+
+$code.=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl aesni_multi_cbc_encrypt
+.type aesni_multi_cbc_encrypt,\@function,3
+.align 32
+aesni_multi_cbc_encrypt:
+___
+$code.=<<___ if ($avx);
+ cmp \$2,$num
+ jb .Lenc_non_avx
+ mov OPENSSL_ia32cap_P+4(%rip),%ecx
+ test \$`1<<28`,%ecx # AVX bit
+ jnz _avx_cbc_enc_shortcut
+ jmp .Lenc_non_avx
+.align 16
+.Lenc_non_avx:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+
+ sub \$48,%rsp
+ and \$-64,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Lenc4x_body:
+ movdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*2($inp),$inp
+
+.Lenc4x_loop_grande:
+ mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*2`($inp),@inptr[$i]
+ cmp $num,$one
+ mov `40*$i+8-40*2`($inp),@outptr[$i]
+ cmovg $one,$num # find maximum
+ test $one,$one
+ movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@inptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Lenc4x_done
+
+ movups 0x10-0x78($key),$rndkey1
+ pxor $zero,@out[0]
+ movups 0x20-0x78($key),$rndkey0
+ pxor $zero,@out[1]
+ mov 0xf0-0x78($key),$rounds
+ pxor $zero,@out[2]
+ movdqu (@inptr[0]),@inp[0] # load inputs
+ pxor $zero,@out[3]
+ movdqu (@inptr[1]),@inp[1]
+ pxor @inp[0],@out[0]
+ movdqu (@inptr[2]),@inp[2]
+ pxor @inp[1],@out[1]
+ movdqu (@inptr[3]),@inp[3]
+ pxor @inp[2],@out[2]
+ pxor @inp[3],@out[3]
+ movdqa 32(%rsp),$counters # load counters
+ xor $offset,$offset
+ jmp .Loop_enc4x
+
+.align 32
+.Loop_enc4x:
+ add \$16,$offset
+ lea 16(%rsp),$sink # sink pointer
+ mov \$1,$one # constant of 1
+ sub $offset,$sink
+
+ aesenc $rndkey1,@out[0]
+ prefetcht0 31(@inptr[0],$offset) # prefetch input
+ prefetcht0 31(@inptr[1],$offset)
+ aesenc $rndkey1,@out[1]
+ prefetcht0 31(@inptr[2],$offset)
+ prefetcht0 31(@inptr[2],$offset)
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0x30-0x78($key),$rndkey1
+___
+for($i=0;$i<4;$i++) {
+my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
+$code.=<<___;
+ cmp `32+4*$i`(%rsp),$one
+ aesenc $rndkey,@out[0]
+ aesenc $rndkey,@out[1]
+ aesenc $rndkey,@out[2]
+ cmovge $sink,@inptr[$i] # cancel input
+ cmovg $sink,@outptr[$i] # sink output
+ aesenc $rndkey,@out[3]
+ movups `0x40+16*$i-0x78`($key),$rndkey
+___
+}
+$code.=<<___;
+ movdqa $counters,$mask
+ aesenc $rndkey0,@out[0]
+ prefetcht0 15(@outptr[0],$offset) # prefetch output
+ prefetcht0 15(@outptr[1],$offset)
+ aesenc $rndkey0,@out[1]
+ prefetcht0 15(@outptr[2],$offset)
+ prefetcht0 15(@outptr[3],$offset)
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0x80-0x78($key),$rndkey0
+ pxor $zero,$zero
+
+ aesenc $rndkey1,@out[0]
+ pcmpgtd $zero,$mask
+ movdqu -0x78($key),$zero # reload 0-round key
+ aesenc $rndkey1,@out[1]
+ paddd $mask,$counters # decrement counters
+ movdqa $counters,32(%rsp) # update counters
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0x90-0x78($key),$rndkey1
+
+ cmp \$11,$rounds
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xa0-0x78($key),$rndkey0
+
+ jb .Lenc4x_tail
+
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0xb0-0x78($key),$rndkey1
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xc0-0x78($key),$rndkey0
+
+ je .Lenc4x_tail
+
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movups 0xd0-0x78($key),$rndkey1
+
+ aesenc $rndkey0,@out[0]
+ aesenc $rndkey0,@out[1]
+ aesenc $rndkey0,@out[2]
+ aesenc $rndkey0,@out[3]
+ movups 0xe0-0x78($key),$rndkey0
+ jmp .Lenc4x_tail
+
+.align 32
+.Lenc4x_tail:
+ aesenc $rndkey1,@out[0]
+ aesenc $rndkey1,@out[1]
+ aesenc $rndkey1,@out[2]
+ aesenc $rndkey1,@out[3]
+ movdqu (@inptr[0],$offset),@inp[0]
+ movdqu 0x10-0x78($key),$rndkey1
+
+ aesenclast $rndkey0,@out[0]
+ movdqu (@inptr[1],$offset),@inp[1]
+ pxor $zero,@inp[0]
+ aesenclast $rndkey0,@out[1]
+ movdqu (@inptr[2],$offset),@inp[2]
+ pxor $zero,@inp[1]
+ aesenclast $rndkey0,@out[2]
+ movdqu (@inptr[3],$offset),@inp[3]
+ pxor $zero,@inp[2]
+ aesenclast $rndkey0,@out[3]
+ movdqu 0x20-0x78($key),$rndkey0
+ pxor $zero,@inp[3]
+
+ movups @out[0],-16(@outptr[0],$offset)
+ pxor @inp[0],@out[0]
+ movups @out[1],-16(@outptr[1],$offset)
+ pxor @inp[1],@out[1]
+ movups @out[2],-16(@outptr[2],$offset)
+ pxor @inp[2],@out[2]
+ movups @out[3],-16(@outptr[3],$offset)
+ pxor @inp[3],@out[3]
+
+ dec $num
+ jnz .Loop_enc4x
+
+ mov 16(%rsp),%rax # original %rsp
+ mov 24(%rsp),$num
+
+ #pxor @inp[0],@out[0]
+ #pxor @inp[1],@out[1]
+ #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
+ #pxor @inp[2],@out[2]
+ #movdqu @out[1],`40*1+24-40*2`($inp)
+ #pxor @inp[3],@out[3]
+ #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
+ #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
+
+ lea `40*4`($inp),$inp
+ dec $num
+ jnz .Lenc4x_loop_grande
+
+.Lenc4x_done:
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ #movaps -0x68(%rax),%xmm13
+ #movaps -0x58(%rax),%xmm14
+ #movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lenc4x_epilogue:
+ ret
+.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
+
+.globl aesni_multi_cbc_decrypt
+.type aesni_multi_cbc_decrypt,\@function,3
+.align 32
+aesni_multi_cbc_decrypt:
+___
+$code.=<<___ if ($avx);
+ cmp \$2,$num
+ jb .Ldec_non_avx
+ mov OPENSSL_ia32cap_P+4(%rip),%ecx
+ test \$`1<<28`,%ecx # AVX bit
+ jnz _avx_cbc_dec_shortcut
+ jmp .Ldec_non_avx
+.align 16
+.Ldec_non_avx:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+
+ sub \$48,%rsp
+ and \$-64,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Ldec4x_body:
+ movdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*2($inp),$inp
+
+.Ldec4x_loop_grande:
+ mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<4;$i++) {
+ $code.=<<___;
+ mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*2`($inp),@inptr[$i]
+ cmp $num,$one
+ mov `40*$i+8-40*2`($inp),@outptr[$i]
+ cmovg $one,$num # find maximum
+ test $one,$one
+ movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@inptr[$i] # cancel input
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldec4x_done
+
+ movups 0x10-0x78($key),$rndkey1
+ movups 0x20-0x78($key),$rndkey0
+ mov 0xf0-0x78($key),$rounds
+ movdqu (@inptr[0]),@out[0] # load inputs
+ movdqu (@inptr[1]),@out[1]
+ pxor $zero,@out[0]
+ movdqu (@inptr[2]),@out[2]
+ pxor $zero,@out[1]
+ movdqu (@inptr[3]),@out[3]
+ pxor $zero,@out[2]
+ pxor $zero,@out[3]
+ movdqa 32(%rsp),$counters # load counters
+ xor $offset,$offset
+ jmp .Loop_dec4x
+
+.align 32
+.Loop_dec4x:
+ add \$16,$offset
+ lea 16(%rsp),$sink # sink pointer
+ mov \$1,$one # constant of 1
+ sub $offset,$sink
+
+ aesdec $rndkey1,@out[0]
+ prefetcht0 31(@inptr[0],$offset) # prefetch input
+ prefetcht0 31(@inptr[1],$offset)
+ aesdec $rndkey1,@out[1]
+ prefetcht0 31(@inptr[2],$offset)
+ prefetcht0 31(@inptr[3],$offset)
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0x30-0x78($key),$rndkey1
+___
+for($i=0;$i<4;$i++) {
+my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
+$code.=<<___;
+ cmp `32+4*$i`(%rsp),$one
+ aesdec $rndkey,@out[0]
+ aesdec $rndkey,@out[1]
+ aesdec $rndkey,@out[2]
+ cmovge $sink,@inptr[$i] # cancel input
+ cmovg $sink,@outptr[$i] # sink output
+ aesdec $rndkey,@out[3]
+ movups `0x40+16*$i-0x78`($key),$rndkey
+___
+}
+$code.=<<___;
+ movdqa $counters,$mask
+ aesdec $rndkey0,@out[0]
+ prefetcht0 15(@outptr[0],$offset) # prefetch output
+ prefetcht0 15(@outptr[1],$offset)
+ aesdec $rndkey0,@out[1]
+ prefetcht0 15(@outptr[2],$offset)
+ prefetcht0 15(@outptr[3],$offset)
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0x80-0x78($key),$rndkey0
+ pxor $zero,$zero
+
+ aesdec $rndkey1,@out[0]
+ pcmpgtd $zero,$mask
+ movdqu -0x78($key),$zero # reload 0-round key
+ aesdec $rndkey1,@out[1]
+ paddd $mask,$counters # decrement counters
+ movdqa $counters,32(%rsp) # update counters
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0x90-0x78($key),$rndkey1
+
+ cmp \$11,$rounds
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xa0-0x78($key),$rndkey0
+
+ jb .Ldec4x_tail
+
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0xb0-0x78($key),$rndkey1
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xc0-0x78($key),$rndkey0
+
+ je .Ldec4x_tail
+
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ aesdec $rndkey1,@out[3]
+ movups 0xd0-0x78($key),$rndkey1
+
+ aesdec $rndkey0,@out[0]
+ aesdec $rndkey0,@out[1]
+ aesdec $rndkey0,@out[2]
+ aesdec $rndkey0,@out[3]
+ movups 0xe0-0x78($key),$rndkey0
+ jmp .Ldec4x_tail
+
+.align 32
+.Ldec4x_tail:
+ aesdec $rndkey1,@out[0]
+ aesdec $rndkey1,@out[1]
+ aesdec $rndkey1,@out[2]
+ pxor $rndkey0,@inp[0]
+ pxor $rndkey0,@inp[1]
+ aesdec $rndkey1,@out[3]
+ movdqu 0x10-0x78($key),$rndkey1
+ pxor $rndkey0,@inp[2]
+ pxor $rndkey0,@inp[3]
+ movdqu 0x20-0x78($key),$rndkey0
+
+ aesdeclast @inp[0],@out[0]
+ aesdeclast @inp[1],@out[1]
+ movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
+ movdqu -16(@inptr[1],$offset),@inp[1]
+ aesdeclast @inp[2],@out[2]
+ aesdeclast @inp[3],@out[3]
+ movdqu -16(@inptr[2],$offset),@inp[2]
+ movdqu -16(@inptr[3],$offset),@inp[3]
+
+ movups @out[0],-16(@outptr[0],$offset)
+ movdqu (@inptr[0],$offset),@out[0]
+ movups @out[1],-16(@outptr[1],$offset)
+ movdqu (@inptr[1],$offset),@out[1]
+ pxor $zero,@out[0]
+ movups @out[2],-16(@outptr[2],$offset)
+ movdqu (@inptr[2],$offset),@out[2]
+ pxor $zero,@out[1]
+ movups @out[3],-16(@outptr[3],$offset)
+ movdqu (@inptr[3],$offset),@out[3]
+ pxor $zero,@out[2]
+ pxor $zero,@out[3]
+
+ dec $num
+ jnz .Loop_dec4x
+
+ mov 16(%rsp),%rax # original %rsp
+ mov 24(%rsp),$num
+
+ lea `40*4`($inp),$inp
+ dec $num
+ jnz .Ldec4x_loop_grande
+
+.Ldec4x_done:
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ #movaps -0x68(%rax),%xmm13
+ #movaps -0x58(%rax),%xmm14
+ #movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Ldec4x_epilogue:
+ ret
+.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
+___
+
+ if ($avx) {{{
+my @ptr=map("%r$_",(8..15));
+my $offload=$sink;
+
+my @out=map("%xmm$_",(2..9));
+my @inp=map("%xmm$_",(10..13));
+my ($counters,$zero)=("%xmm14","%xmm15");
+
+$code.=<<___;
+.type aesni_multi_cbc_encrypt_avx,\@function,3
+.align 32
+aesni_multi_cbc_encrypt_avx:
+_avx_cbc_enc_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+ # +64 distances between inputs and outputs
+ # +128 off-load area for @inp[0..3]
+
+ sub \$192,%rsp
+ and \$-128,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Lenc8x_body:
+ vzeroupper
+ vmovdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*4($inp),$inp
+ shr \$1,$num
+
+.Lenc8x_loop_grande:
+ #mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<8;$i++) {
+ my $temp = $i ? $offload : $offset;
+ $code.=<<___;
+ mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
+ cmp $num,$one
+ mov `40*$i+8-40*4`($inp),$temp # output pointer
+ cmovg $one,$num # find maximum
+ test $one,$one
+ vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@ptr[$i] # cancel input
+ sub @ptr[$i],$temp # distance between input and output
+ mov $temp,`64+8*$i`(%rsp) # initialize distances
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Lenc8x_done
+
+ vmovups 0x10-0x78($key),$rndkey1
+ vmovups 0x20-0x78($key),$rndkey0
+ mov 0xf0-0x78($key),$rounds
+
+ vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
+ lea 128(%rsp),$offload # offload area
+ vpxor (@ptr[1]),$zero,@inp[1]
+ vpxor (@ptr[2]),$zero,@inp[2]
+ vpxor (@ptr[3]),$zero,@inp[3]
+ vpxor @inp[0],@out[0],@out[0]
+ vpxor (@ptr[4]),$zero,@inp[0]
+ vpxor @inp[1],@out[1],@out[1]
+ vpxor (@ptr[5]),$zero,@inp[1]
+ vpxor @inp[2],@out[2],@out[2]
+ vpxor (@ptr[6]),$zero,@inp[2]
+ vpxor @inp[3],@out[3],@out[3]
+ vpxor (@ptr[7]),$zero,@inp[3]
+ vpxor @inp[0],@out[4],@out[4]
+ mov \$1,$one # constant of 1
+ vpxor @inp[1],@out[5],@out[5]
+ vpxor @inp[2],@out[6],@out[6]
+ vpxor @inp[3],@out[7],@out[7]
+ jmp .Loop_enc8x
+
+.align 32
+.Loop_enc8x:
+___
+for($i=0;$i<8;$i++) {
+my $rndkey=($i&1)?$rndkey0:$rndkey1;
+$code.=<<___;
+ vaesenc $rndkey,@out[0],@out[0]
+ cmp 32+4*$i(%rsp),$one
+___
+$code.=<<___ if ($i);
+ mov 64+8*$i(%rsp),$offset
+___
+$code.=<<___;
+ vaesenc $rndkey,@out[1],@out[1]
+ prefetcht0 31(@ptr[$i]) # prefetch input
+ vaesenc $rndkey,@out[2],@out[2]
+___
+$code.=<<___ if ($i>1);
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+___
+$code.=<<___;
+ vaesenc $rndkey,@out[3],@out[3]
+ lea (@ptr[$i],$offset),$offset
+ cmovge %rsp,@ptr[$i] # cancel input
+ vaesenc $rndkey,@out[4],@out[4]
+ cmovg %rsp,$offset # sink output
+ vaesenc $rndkey,@out[5],@out[5]
+ sub @ptr[$i],$offset
+ vaesenc $rndkey,@out[6],@out[6]
+ vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
+ mov $offset,64+8*$i(%rsp)
+ vaesenc $rndkey,@out[7],@out[7]
+ vmovups `16*(3+$i)-0x78`($key),$rndkey
+ lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
+___
+$code.=<<___ if ($i<4)
+ vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
+___
+}
+$code.=<<___;
+ vmovdqu 32(%rsp),$counters
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+ prefetcht0 15(@ptr[$i-1])
+ cmp \$11,$rounds
+ jb .Lenc8x_tail
+
+ vaesenc $rndkey1,@out[0],@out[0]
+ vaesenc $rndkey1,@out[1],@out[1]
+ vaesenc $rndkey1,@out[2],@out[2]
+ vaesenc $rndkey1,@out[3],@out[3]
+ vaesenc $rndkey1,@out[4],@out[4]
+ vaesenc $rndkey1,@out[5],@out[5]
+ vaesenc $rndkey1,@out[6],@out[6]
+ vaesenc $rndkey1,@out[7],@out[7]
+ vmovups 0xb0-0x78($key),$rndkey1
+
+ vaesenc $rndkey0,@out[0],@out[0]
+ vaesenc $rndkey0,@out[1],@out[1]
+ vaesenc $rndkey0,@out[2],@out[2]
+ vaesenc $rndkey0,@out[3],@out[3]
+ vaesenc $rndkey0,@out[4],@out[4]
+ vaesenc $rndkey0,@out[5],@out[5]
+ vaesenc $rndkey0,@out[6],@out[6]
+ vaesenc $rndkey0,@out[7],@out[7]
+ vmovups 0xc0-0x78($key),$rndkey0
+ je .Lenc8x_tail
+
+ vaesenc $rndkey1,@out[0],@out[0]
+ vaesenc $rndkey1,@out[1],@out[1]
+ vaesenc $rndkey1,@out[2],@out[2]
+ vaesenc $rndkey1,@out[3],@out[3]
+ vaesenc $rndkey1,@out[4],@out[4]
+ vaesenc $rndkey1,@out[5],@out[5]
+ vaesenc $rndkey1,@out[6],@out[6]
+ vaesenc $rndkey1,@out[7],@out[7]
+ vmovups 0xd0-0x78($key),$rndkey1
+
+ vaesenc $rndkey0,@out[0],@out[0]
+ vaesenc $rndkey0,@out[1],@out[1]
+ vaesenc $rndkey0,@out[2],@out[2]
+ vaesenc $rndkey0,@out[3],@out[3]
+ vaesenc $rndkey0,@out[4],@out[4]
+ vaesenc $rndkey0,@out[5],@out[5]
+ vaesenc $rndkey0,@out[6],@out[6]
+ vaesenc $rndkey0,@out[7],@out[7]
+ vmovups 0xe0-0x78($key),$rndkey0
+
+.Lenc8x_tail:
+ vaesenc $rndkey1,@out[0],@out[0]
+ vpxor $zero,$zero,$zero
+ vaesenc $rndkey1,@out[1],@out[1]
+ vaesenc $rndkey1,@out[2],@out[2]
+ vpcmpgtd $zero,$counters,$zero
+ vaesenc $rndkey1,@out[3],@out[3]
+ vaesenc $rndkey1,@out[4],@out[4]
+ vpaddd $counters,$zero,$zero # decrement counters
+ vmovdqu 48(%rsp),$counters
+ vaesenc $rndkey1,@out[5],@out[5]
+ mov 64(%rsp),$offset # pre-load 1st offset
+ vaesenc $rndkey1,@out[6],@out[6]
+ vaesenc $rndkey1,@out[7],@out[7]
+ vmovups 0x10-0x78($key),$rndkey1
+
+ vaesenclast $rndkey0,@out[0],@out[0]
+ vmovdqa $zero,32(%rsp) # update counters
+ vpxor $zero,$zero,$zero
+ vaesenclast $rndkey0,@out[1],@out[1]
+ vaesenclast $rndkey0,@out[2],@out[2]
+ vpcmpgtd $zero,$counters,$zero
+ vaesenclast $rndkey0,@out[3],@out[3]
+ vaesenclast $rndkey0,@out[4],@out[4]
+ vpaddd $zero,$counters,$counters # decrement counters
+ vmovdqu -0x78($key),$zero # 0-round
+ vaesenclast $rndkey0,@out[5],@out[5]
+ vaesenclast $rndkey0,@out[6],@out[6]
+ vmovdqa $counters,48(%rsp) # update counters
+ vaesenclast $rndkey0,@out[7],@out[7]
+ vmovups 0x20-0x78($key),$rndkey0
+
+ vmovups @out[0],-16(@ptr[0]) # write output
+ sub $offset,@ptr[0] # switch to input
+ vpxor 0x00($offload),@out[0],@out[0]
+ vmovups @out[1],-16(@ptr[1])
+ sub `64+1*8`(%rsp),@ptr[1]
+ vpxor 0x10($offload),@out[1],@out[1]
+ vmovups @out[2],-16(@ptr[2])
+ sub `64+2*8`(%rsp),@ptr[2]
+ vpxor 0x20($offload),@out[2],@out[2]
+ vmovups @out[3],-16(@ptr[3])
+ sub `64+3*8`(%rsp),@ptr[3]
+ vpxor 0x30($offload),@out[3],@out[3]
+ vmovups @out[4],-16(@ptr[4])
+ sub `64+4*8`(%rsp),@ptr[4]
+ vpxor @inp[0],@out[4],@out[4]
+ vmovups @out[5],-16(@ptr[5])
+ sub `64+5*8`(%rsp),@ptr[5]
+ vpxor @inp[1],@out[5],@out[5]
+ vmovups @out[6],-16(@ptr[6])
+ sub `64+6*8`(%rsp),@ptr[6]
+ vpxor @inp[2],@out[6],@out[6]
+ vmovups @out[7],-16(@ptr[7])
+ sub `64+7*8`(%rsp),@ptr[7]
+ vpxor @inp[3],@out[7],@out[7]
+
+ dec $num
+ jnz .Loop_enc8x
+
+ mov 16(%rsp),%rax # original %rsp
+ #mov 24(%rsp),$num
+ #lea `40*8`($inp),$inp
+ #dec $num
+ #jnz .Lenc8x_loop_grande
+
+.Lenc8x_done:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Lenc8x_epilogue:
+ ret
+.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
+
+.type aesni_multi_cbc_decrypt_avx,\@function,3
+.align 32
+aesni_multi_cbc_decrypt_avx:
+_avx_cbc_dec_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,-0x78(%rax)
+ movaps %xmm13,-0x68(%rax)
+ movaps %xmm14,-0x58(%rax)
+ movaps %xmm15,-0x48(%rax)
+___
+$code.=<<___;
+ # stack layout
+ #
+ # +0 output sink
+ # +16 input sink [original %rsp and $num]
+ # +32 counters
+ # +64 distances between inputs and outputs
+ # +128 off-load area for @inp[0..3]
+ # +192 IV/input offload
+
+ sub \$256,%rsp
+ and \$-256,%rsp
+ sub \$192,%rsp
+ mov %rax,16(%rsp) # original %rsp
+
+.Ldec8x_body:
+ vzeroupper
+ vmovdqu ($key),$zero # 0-round key
+ lea 0x78($key),$key # size optimization
+ lea 40*4($inp),$inp
+ shr \$1,$num
+
+.Ldec8x_loop_grande:
+ #mov $num,24(%rsp) # original $num
+ xor $num,$num
+___
+for($i=0;$i<8;$i++) {
+ my $temp = $i ? $offload : $offset;
+ $code.=<<___;
+ mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
+ mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
+ cmp $num,$one
+ mov `40*$i+8-40*4`($inp),$temp # output pointer
+ cmovg $one,$num # find maximum
+ test $one,$one
+ vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
+ mov $one,`32+4*$i`(%rsp) # initialize counters
+ cmovle %rsp,@ptr[$i] # cancel input
+ sub @ptr[$i],$temp # distance between input and output
+ mov $temp,`64+8*$i`(%rsp) # initialize distances
+ vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
+___
+}
+$code.=<<___;
+ test $num,$num
+ jz .Ldec8x_done
+
+ vmovups 0x10-0x78($key),$rndkey1
+ vmovups 0x20-0x78($key),$rndkey0
+ mov 0xf0-0x78($key),$rounds
+ lea 192+128(%rsp),$offload # offload area
+
+ vmovdqu (@ptr[0]),@out[0] # load inputs
+ vmovdqu (@ptr[1]),@out[1]
+ vmovdqu (@ptr[2]),@out[2]
+ vmovdqu (@ptr[3]),@out[3]
+ vmovdqu (@ptr[4]),@out[4]
+ vmovdqu (@ptr[5]),@out[5]
+ vmovdqu (@ptr[6]),@out[6]
+ vmovdqu (@ptr[7]),@out[7]
+ vmovdqu @out[0],0x00($offload) # offload inputs
+ vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
+ vmovdqu @out[1],0x10($offload)
+ vpxor $zero,@out[1],@out[1]
+ vmovdqu @out[2],0x20($offload)
+ vpxor $zero,@out[2],@out[2]
+ vmovdqu @out[3],0x30($offload)
+ vpxor $zero,@out[3],@out[3]
+ vmovdqu @out[4],0x40($offload)
+ vpxor $zero,@out[4],@out[4]
+ vmovdqu @out[5],0x50($offload)
+ vpxor $zero,@out[5],@out[5]
+ vmovdqu @out[6],0x60($offload)
+ vpxor $zero,@out[6],@out[6]
+ vmovdqu @out[7],0x70($offload)
+ vpxor $zero,@out[7],@out[7]
+ xor \$0x80,$offload
+ mov \$1,$one # constant of 1
+ jmp .Loop_dec8x
+
+.align 32
+.Loop_dec8x:
+___
+for($i=0;$i<8;$i++) {
+my $rndkey=($i&1)?$rndkey0:$rndkey1;
+$code.=<<___;
+ vaesdec $rndkey,@out[0],@out[0]
+ cmp 32+4*$i(%rsp),$one
+___
+$code.=<<___ if ($i);
+ mov 64+8*$i(%rsp),$offset
+___
+$code.=<<___;
+ vaesdec $rndkey,@out[1],@out[1]
+ prefetcht0 31(@ptr[$i]) # prefetch input
+ vaesdec $rndkey,@out[2],@out[2]
+___
+$code.=<<___ if ($i>1);
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+___
+$code.=<<___;
+ vaesdec $rndkey,@out[3],@out[3]
+ lea (@ptr[$i],$offset),$offset
+ cmovge %rsp,@ptr[$i] # cancel input
+ vaesdec $rndkey,@out[4],@out[4]
+ cmovg %rsp,$offset # sink output
+ vaesdec $rndkey,@out[5],@out[5]
+ sub @ptr[$i],$offset
+ vaesdec $rndkey,@out[6],@out[6]
+ vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
+ mov $offset,64+8*$i(%rsp)
+ vaesdec $rndkey,@out[7],@out[7]
+ vmovups `16*(3+$i)-0x78`($key),$rndkey
+ lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
+___
+$code.=<<___ if ($i<4);
+ vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
+___
+}
+$code.=<<___;
+ vmovdqu 32(%rsp),$counters
+ prefetcht0 15(@ptr[$i-2]) # prefetch output
+ prefetcht0 15(@ptr[$i-1])
+ cmp \$11,$rounds
+ jb .Ldec8x_tail
+
+ vaesdec $rndkey1,@out[0],@out[0]
+ vaesdec $rndkey1,@out[1],@out[1]
+ vaesdec $rndkey1,@out[2],@out[2]
+ vaesdec $rndkey1,@out[3],@out[3]
+ vaesdec $rndkey1,@out[4],@out[4]
+ vaesdec $rndkey1,@out[5],@out[5]
+ vaesdec $rndkey1,@out[6],@out[6]
+ vaesdec $rndkey1,@out[7],@out[7]
+ vmovups 0xb0-0x78($key),$rndkey1
+
+ vaesdec $rndkey0,@out[0],@out[0]
+ vaesdec $rndkey0,@out[1],@out[1]
+ vaesdec $rndkey0,@out[2],@out[2]
+ vaesdec $rndkey0,@out[3],@out[3]
+ vaesdec $rndkey0,@out[4],@out[4]
+ vaesdec $rndkey0,@out[5],@out[5]
+ vaesdec $rndkey0,@out[6],@out[6]
+ vaesdec $rndkey0,@out[7],@out[7]
+ vmovups 0xc0-0x78($key),$rndkey0
+ je .Ldec8x_tail
+
+ vaesdec $rndkey1,@out[0],@out[0]
+ vaesdec $rndkey1,@out[1],@out[1]
+ vaesdec $rndkey1,@out[2],@out[2]
+ vaesdec $rndkey1,@out[3],@out[3]
+ vaesdec $rndkey1,@out[4],@out[4]
+ vaesdec $rndkey1,@out[5],@out[5]
+ vaesdec $rndkey1,@out[6],@out[6]
+ vaesdec $rndkey1,@out[7],@out[7]
+ vmovups 0xd0-0x78($key),$rndkey1
+
+ vaesdec $rndkey0,@out[0],@out[0]
+ vaesdec $rndkey0,@out[1],@out[1]
+ vaesdec $rndkey0,@out[2],@out[2]
+ vaesdec $rndkey0,@out[3],@out[3]
+ vaesdec $rndkey0,@out[4],@out[4]
+ vaesdec $rndkey0,@out[5],@out[5]
+ vaesdec $rndkey0,@out[6],@out[6]
+ vaesdec $rndkey0,@out[7],@out[7]
+ vmovups 0xe0-0x78($key),$rndkey0
+
+.Ldec8x_tail:
+ vaesdec $rndkey1,@out[0],@out[0]
+ vpxor $zero,$zero,$zero
+ vaesdec $rndkey1,@out[1],@out[1]
+ vaesdec $rndkey1,@out[2],@out[2]
+ vpcmpgtd $zero,$counters,$zero
+ vaesdec $rndkey1,@out[3],@out[3]
+ vaesdec $rndkey1,@out[4],@out[4]
+ vpaddd $counters,$zero,$zero # decrement counters
+ vmovdqu 48(%rsp),$counters
+ vaesdec $rndkey1,@out[5],@out[5]
+ mov 64(%rsp),$offset # pre-load 1st offset
+ vaesdec $rndkey1,@out[6],@out[6]
+ vaesdec $rndkey1,@out[7],@out[7]
+ vmovups 0x10-0x78($key),$rndkey1
+
+ vaesdeclast $rndkey0,@out[0],@out[0]
+ vmovdqa $zero,32(%rsp) # update counters
+ vpxor $zero,$zero,$zero
+ vaesdeclast $rndkey0,@out[1],@out[1]
+ vpxor 0x00($offload),@out[0],@out[0] # xor with IV
+ vaesdeclast $rndkey0,@out[2],@out[2]
+ vpxor 0x10($offload),@out[1],@out[1]
+ vpcmpgtd $zero,$counters,$zero
+ vaesdeclast $rndkey0,@out[3],@out[3]
+ vpxor 0x20($offload),@out[2],@out[2]
+ vaesdeclast $rndkey0,@out[4],@out[4]
+ vpxor 0x30($offload),@out[3],@out[3]
+ vpaddd $zero,$counters,$counters # decrement counters
+ vmovdqu -0x78($key),$zero # 0-round
+ vaesdeclast $rndkey0,@out[5],@out[5]
+ vpxor 0x40($offload),@out[4],@out[4]
+ vaesdeclast $rndkey0,@out[6],@out[6]
+ vpxor 0x50($offload),@out[5],@out[5]
+ vmovdqa $counters,48(%rsp) # update counters
+ vaesdeclast $rndkey0,@out[7],@out[7]
+ vpxor 0x60($offload),@out[6],@out[6]
+ vmovups 0x20-0x78($key),$rndkey0
+
+ vmovups @out[0],-16(@ptr[0]) # write output
+ sub $offset,@ptr[0] # switch to input
+ vmovdqu 128+0(%rsp),@out[0]
+ vpxor 0x70($offload),@out[7],@out[7]
+ vmovups @out[1],-16(@ptr[1])
+ sub `64+1*8`(%rsp),@ptr[1]
+ vmovdqu @out[0],0x00($offload)
+ vpxor $zero,@out[0],@out[0]
+ vmovdqu 128+16(%rsp),@out[1]
+ vmovups @out[2],-16(@ptr[2])
+ sub `64+2*8`(%rsp),@ptr[2]
+ vmovdqu @out[1],0x10($offload)
+ vpxor $zero,@out[1],@out[1]
+ vmovdqu 128+32(%rsp),@out[2]
+ vmovups @out[3],-16(@ptr[3])
+ sub `64+3*8`(%rsp),@ptr[3]
+ vmovdqu @out[2],0x20($offload)
+ vpxor $zero,@out[2],@out[2]
+ vmovdqu 128+48(%rsp),@out[3]
+ vmovups @out[4],-16(@ptr[4])
+ sub `64+4*8`(%rsp),@ptr[4]
+ vmovdqu @out[3],0x30($offload)
+ vpxor $zero,@out[3],@out[3]
+ vmovdqu @inp[0],0x40($offload)
+ vpxor @inp[0],$zero,@out[4]
+ vmovups @out[5],-16(@ptr[5])
+ sub `64+5*8`(%rsp),@ptr[5]
+ vmovdqu @inp[1],0x50($offload)
+ vpxor @inp[1],$zero,@out[5]
+ vmovups @out[6],-16(@ptr[6])
+ sub `64+6*8`(%rsp),@ptr[6]
+ vmovdqu @inp[2],0x60($offload)
+ vpxor @inp[2],$zero,@out[6]
+ vmovups @out[7],-16(@ptr[7])
+ sub `64+7*8`(%rsp),@ptr[7]
+ vmovdqu @inp[3],0x70($offload)
+ vpxor @inp[3],$zero,@out[7]
+
+ xor \$128,$offload
+ dec $num
+ jnz .Loop_dec8x
+
+ mov 16(%rsp),%rax # original %rsp
+ #mov 24(%rsp),$num
+ #lea `40*8`($inp),$inp
+ #dec $num
+ #jnz .Ldec8x_loop_grande
+
+.Ldec8x_done:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Ldec8x_epilogue:
+ ret
+.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
+___
+ }}}
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ mov 16(%rax),%rax # pull saved stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore cotnext->R12
+ mov %r13,224($context) # restore cotnext->R13
+ mov %r14,232($context) # restore cotnext->R14
+ mov %r15,240($context) # restore cotnext->R15
+
+ lea -56-10*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_multi_cbc_encrypt
+ .rva .LSEH_end_aesni_multi_cbc_encrypt
+ .rva .LSEH_info_aesni_multi_cbc_encrypt
+ .rva .LSEH_begin_aesni_multi_cbc_decrypt
+ .rva .LSEH_end_aesni_multi_cbc_decrypt
+ .rva .LSEH_info_aesni_multi_cbc_decrypt
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
+ .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
+ .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
+ .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
+ .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
+ .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_aesni_multi_cbc_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
+.LSEH_info_aesni_multi_cbc_decrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_multi_cbc_encrypt_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
+.LSEH_info_aesni_multi_cbc_decrypt_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
+___
+}
+####################################################################
+
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ rex(\@opcode,$4,$3);
+ push @opcode,0x0f,0x3a,0xdf;
+ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
+ my $c=$2;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesimc" => 0xdb,
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ my $off = $2;
+ push @opcode,0x44 if ($3>=8);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
+ push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aesni-sha1-x86_64.pl b/openssl-1.1.0h/crypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644
index 0000000..33a7f0c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -0,0 +1,2072 @@
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encrypt exhibit *very* low instruction-level
+# parallelism, interleaving it with another algorithm would allow to
+# utilize processor resources better and achieve better performance.
+# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
+# AESNI code is weaved into it. Below are performance numbers in
+# cycles per processed byte, less is better, for standalone AESNI-CBC
+# encrypt, sum of the latter and standalone SHA1, and "stitched"
+# subroutine:
+#
+# AES-128-CBC +SHA1 stitch gain
+# Westmere 3.77[+5.3] 9.07 6.55 +38%
+# Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%)
+# Ivy Bridge 5.05[+4.6] 9.65 5.54 +74%
+# Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%)
+# Skylake 2.63[+3.5(4.1)] 6.17(6.69) 4.23(4.44) +46%(+51%)
+# Bulldozer 5.77[+6.0] 11.72 6.37 +84%
+# Ryzen(**) 2.71[+1.93] 4.64 2.74 +69%
+# Goldmont(**) 3.82[+1.70] 5.52 4.20 +31%
+#
+# AES-192-CBC
+# Westmere 4.51 9.81 6.80 +44%
+# Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%)
+# Ivy Bridge 6.05 10.65 6.07 +75%
+# Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%)
+# Bulldozer 6.89 12.84 6.96 +84%
+#
+# AES-256-CBC
+# Westmere 5.25 10.55 7.21 +46%
+# Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%)
+# Ivy Bridge 7.05 11.65 7.12 +64%
+# Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%)
+# Skylake 3.62 7.16(7.68) 4.56(4.76) +57%(+61%)
+# Bulldozer 8.00 13.95 8.25 +69%
+# Ryzen(**) 3.71 5.64 3.72 +52%
+# Goldmont(**) 5.35 7.05 5.76 +22%
+#
+# (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for
+# background information. Above numbers in parentheses are SSSE3
+# results collected on AVX-capable CPU, i.e. apply on OSes that
+# don't support AVX.
+# (**) SHAEXT results.
+#
+# Needless to mention that it makes no sense to implement "stitched"
+# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
+# fully utilize parallelism, so stitching would not give any gain
+# anyway. Well, there might be some, e.g. because of better cache
+# locality... For reference, here are performance results for
+# standalone AESNI-CBC decrypt:
+#
+# AES-128-CBC AES-192-CBC AES-256-CBC
+# Westmere 1.25 1.50 1.75
+# Sandy Bridge 0.74 0.91 1.09
+# Ivy Bridge 0.74 0.90 1.11
+# Haswell 0.63 0.76 0.88
+# Bulldozer 0.70 0.85 0.99
+
+# And indeed:
+#
+# AES-256-CBC +SHA1 stitch gain
+# Westmere 1.75 7.20 6.68 +7.8%
+# Sandy Bridge 1.09 6.09(7.22) 5.82(6.95) +4.6%(+3.9%)
+# Ivy Bridge 1.11 5.70 5.45 +4.6%
+# Haswell 0.88 4.45(5.00) 4.39(4.69) +1.4%(*)(+6.6%)
+# Bulldozer 0.99 6.95 5.95 +17%(**)
+#
+# (*) Tiny improvement coefficient on Haswell is because we compare
+# AVX1 stitch to sum with AVX2 SHA1.
+# (**) Execution is fully dominated by integer code sequence and
+# SIMD still hardly shows [in single-process benchmark;-]
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10);
+$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
+
+$shaext=1; ### set to zero if compiling for 1.0.1
+
+$stitched_decrypt=0;
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+# void aesni_cbc_sha1_enc(const void *inp,
+# void *out,
+# size_t length,
+# const AES_KEY *key,
+# unsigned char *iv,
+# SHA_CTX *ctx,
+# const void *in0);
+
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+
+.globl aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc,\@abi-omnipotent
+.align 32
+aesni_cbc_sha1_enc:
+ # caller should check for SSSE3 and AES-NI bits
+ mov OPENSSL_ia32cap_P+0(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r11
+___
+$code.=<<___ if ($shaext);
+ bt \$61,%r11 # check SHA bit
+ jc aesni_cbc_sha1_enc_shaext
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r11d # mask AVX bit
+ and \$`1<<30`,%r10d # mask "Intel CPU" bit
+ or %r11d,%r10d
+ cmp \$`1<<28|1<<30`,%r10d
+ je aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+ jmp aesni_cbc_sha1_enc_ssse3
+ ret
+.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
+my $K_XX_XX="%r11";
+my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); # for enc
+my @rndkey=("%xmm14","%xmm15"); # for enc
+my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec
+
+if (1) { # reassign for Atom Silvermont
+ # The goal is to minimize amount of instructions with more than
+ # 3 prefix bytes. Or in more practical terms to keep AES-NI *and*
+ # SSSE3 instructions to upper half of the register bank.
+ @X=map("%xmm$_",(8..11,4..7));
+ @Tx=map("%xmm$_",(12,13,3));
+ ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
+ @rndkey=("%xmm0","%xmm1");
+}
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_ssse3,\@function,6
+.align 32
+aesni_cbc_sha1_enc_ssse3:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_ssse3 # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ lea 112($key),%r15 # size optimization
+ movdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240-112($key),$rounds
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @Tx[2],@X[-4&7] # byte swap
+ pshufb @Tx[2],@X[-3&7]
+ pshufb @Tx[2],@X[-2&7]
+ add \$64,$inp
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ pshufb @Tx[2],@X[-1&7]
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ movups -112($key),$rndkey0 # $key[0]
+ movups 16-112($key),$rndkey[0] # forward reference
+ jmp .Loop_ssse3
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ movups `16*$n`($in0),$in # load input
+ xorps $rndkey0,$in
+___
+ $code.=<<___ if ($n);
+ movups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ xorps $in,$iv
+ movups `32+16*$k-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Laesenclast$sn
+ movups `32+16*($k+0)-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+1)-112`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+ je .Laesenclast$sn
+ movups `32+16*($k+2)-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+3)-112`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+.Laesenclast$sn:
+ aesenclast $rndkey[0],$iv
+ movups 16-112($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ movups `32+16*$k-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns)); # ror
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
+ eval(shift(@insns));
+ &movdqa (@Tx[0],@X[-1&7]);
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@Tx[2],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+
+ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[2],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@Tx[1],2);
+ &pxor (@X[0],@Tx[2]);
+ eval(shift(@insns));
+ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+ &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns)) if ($Xi==8);
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[1] =~ /_ror/);
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
+ &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ if ($Xi%5) {
+ &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ eval(shift(@insns)); # ror
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
+
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+ eval(shift(@insns)); # body_20_39
+
+ &pslld (@X[0],2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrld (@Tx[0],30);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns));
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns)) if (@insns[1] =~ /_rol/);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
+ &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (shift);
+
+ unshift(@Tx,pop(@Tx));
+
+ &movdqa (@Tx[2],"64($K_XX_XX)"); # pbswap mask
+ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &movdqu (@X[-4&7],"0($inp)"); # load input
+ &movdqu (@X[-3&7],"16($inp)");
+ &movdqu (@X[-2&7],"32($inp)");
+ &movdqu (@X[-1&7],"48($inp)");
+ &pshufb (@X[-4&7],@Tx[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@Tx[1]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+my @body_00_19 = (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&xor (@T[0],$d);',
+ '&mov (@T[1],$a);', # $b for next round
+
+ '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+ '&xor ($b,$c);', # $c^$d for next round
+
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&and (@T[1],$b);', # ($b&($c^$d)) for next round
+
+ '&xor ($b,$c);', # restore $b
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+
+sub body_00_19 () { # ((c^d)&b)^d
+ # on start @T[0]=(c^d)&b
+ return &body_20_39() if ($rx==19); $rx++;
+
+ use integer;
+ my ($k,$n);
+ my @r=@body_00_19;
+
+ $n = scalar(@r);
+ $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+
+ return @r;
+}
+
+my @body_20_39 = (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+ '&xor (@T[0],$d) if($j==19);'.
+ '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c)
+ '&mov (@T[1],$a);', # $b for next round
+
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round
+
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+
+sub body_20_39 () { # b^d^c
+ # on entry @T[0]=b^d
+ return &body_40_59() if ($rx==39); $rx++;
+
+ use integer;
+ my ($k,$n);
+ my @r=@body_20_39;
+
+ $n = scalar(@r);
+ $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=20);
+ $jj++;
+
+ return @r;
+}
+
+my @body_40_59 = (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
+ '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d)
+ '&xor ($c,$d) if ($j>=40);', # restore $c
+
+ '&$_ror ($b,7);', # $b>>>2
+ '&mov (@T[1],$a);', # $b for next round
+ '&xor (@T[0],$c);',
+
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor (@T[1],$c) if ($j==59);'.
+ '&xor (@T[1],$b) if ($j< 59);', # b^c for next round
+
+ '&xor ($b,$c) if ($j< 59);', # c^d for next round
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+
+sub body_40_59 () { # ((b^c)&(c^d))^c
+ # on entry @T[0]=(b^c), (c^=d)
+ $rx++;
+
+ use integer;
+ my ($k,$n);
+ my @r=@body_40_59;
+
+ $n = scalar(@r);
+ $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=40);
+ $jj++;
+
+ return @r;
+}
+$code.=<<___;
+.align 32
+.Loop_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_ssse3
+
+.Ldone_ssse3:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ movups $iv,($ivp) # write IV
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ ret
+.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+ if ($stitched_decrypt) {{{
+# reset
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+$j=$jj=$r=$rx=0;
+$Xi=4;
+
+# reassign for Atom Silvermont (see above)
+($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
+@X=map("%xmm$_",(8..13,6,7));
+@Tx=map("%xmm$_",(14,15,5));
+
+my @aes256_dec = (
+ '&movdqu($inout0,"0x00($in0)");',
+ '&movdqu($inout1,"0x10($in0)"); &pxor ($inout0,$rndkey0);',
+ '&movdqu($inout2,"0x20($in0)"); &pxor ($inout1,$rndkey0);',
+ '&movdqu($inout3,"0x30($in0)"); &pxor ($inout2,$rndkey0);',
+
+ '&pxor ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");',
+ '&movaps("64(%rsp)",@X[2]);', # save IV, originally @X[3]
+ undef,undef
+ );
+for ($i=0;$i<13;$i++) {
+ push (@aes256_dec,(
+ '&aesdec ($inout0,$rndkey0);',
+ '&aesdec ($inout1,$rndkey0);',
+ '&aesdec ($inout2,$rndkey0);',
+ '&aesdec ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
+ ));
+ push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
+ push (@aes256_dec,(undef,undef)) if ($i==5);
+}
+push(@aes256_dec,(
+ '&aesdeclast ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");',
+ '&aesdeclast ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");',
+ '&aesdeclast ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");',
+ '&aesdeclast ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");',
+
+ '&xorps ($inout0,"64(%rsp)"); &movdqu ($rndkey0,"-112($key)");',
+ '&xorps ($inout1,@X[0]); &movups ("0x00($out,$in0)",$inout0);',
+ '&xorps ($inout2,@X[1]); &movups ("0x10($out,$in0)",$inout1);',
+ '&xorps ($inout3,@X[2]); &movups ("0x20($out,$in0)",$inout2);',
+
+ '&movups ("0x30($out,$in0)",$inout3);'
+ ));
+
+sub body_00_19_dec () { # ((c^d)&b)^d
+ # on start @T[0]=(c^d)&b
+ return &body_20_39_dec() if ($rx==19);
+
+ my @r=@body_00_19;
+
+ unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
+ $rx++;
+
+ return @r;
+}
+
+sub body_20_39_dec () { # b^d^c
+ # on entry @T[0]=b^d
+ return &body_40_59_dec() if ($rx==39);
+
+ my @r=@body_20_39;
+
+ unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
+ $rx++;
+
+ return @r;
+}
+
+sub body_40_59_dec () { # ((b^c)&(c^d))^c
+ # on entry @T[0]=(b^c), (c^=d)
+
+ my @r=@body_40_59;
+
+ unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);
+ $rx++;
+
+ return @r;
+}
+
+$code.=<<___;
+.globl aesni256_cbc_sha1_dec
+.type aesni256_cbc_sha1_dec,\@abi-omnipotent
+.align 32
+aesni256_cbc_sha1_dec:
+ # caller should check for SSSE3 and AES-NI bits
+ mov OPENSSL_ia32cap_P+0(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r11d # mask AVX bit
+ and \$`1<<30`,%r10d # mask "Intel CPU" bit
+ or %r11d,%r10d
+ cmp \$`1<<28|1<<30`,%r10d
+ je aesni256_cbc_sha1_dec_avx
+___
+$code.=<<___;
+ jmp aesni256_cbc_sha1_dec_ssse3
+ ret
+.size aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec
+
+.type aesni256_cbc_sha1_dec_ssse3,\@function,6
+.align 32
+aesni256_cbc_sha1_dec_ssse3:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_dec_ssse3:
+___
+$code.=<<___;
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ lea 112($key),%r15 # size optimization
+ movdqu ($ivp),@X[3] # load IV
+ #mov $ivp,88(%rsp) # save $ivp
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ movdqa 64($K_XX_XX),@Tx[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @Tx[2],@X[-4&7] # byte swap
+ add \$64,$inp
+ pshufb @Tx[2],@X[-3&7]
+ pshufb @Tx[2],@X[-2&7]
+ pshufb @Tx[2],@X[-1&7]
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ movdqu -112($key),$rndkey0 # $key[0]
+ jmp .Loop_dec_ssse3
+
+.align 32
+.Loop_dec_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_16_31(\&body_00_19_dec);
+ &Xupdate_ssse3_32_79(\&body_00_19_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_40_59_dec);
+ &Xupdate_ssse3_32_79(\&body_20_39_dec);
+ &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_rx=$rx;
+
+ &Xloop_ssse3(\&body_20_39_dec);
+ &Xloop_ssse3(\&body_20_39_dec);
+ &Xloop_ssse3(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_dec_ssse3
+
+.Ldone_dec_ssse3:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $rx=$saved_rx;
+
+ &Xtail_ssse3(\&body_20_39_dec);
+ &Xtail_ssse3(\&body_20_39_dec);
+ &Xtail_ssse3(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ movups @X[3],($ivp) # write IV
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_dec_ssse3:
+ ret
+.size aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
+___
+ }}}
+$j=$jj=$r=$rx=0;
+
+if ($avx) {
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec
+my $Kx=@Tx[2];
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_avx,\@function,6
+.align 32
+aesni_cbc_sha1_enc_avx:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_avx # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+ vzeroall
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ lea 112($key),%r15 # size optimization
+ vmovdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240-112($key),$rounds
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),$Kx # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vpaddd $Kx,@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ vmovups -112($key),$rndkey[1] # $key[0]
+ vmovups 16-112($key),$rndkey[0] # forward reference
+ jmp .Loop_avx
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ vmovdqu `16*$n`($in0),$in # load input
+ vpxor $rndkey[1],$in,$in
+___
+ $code.=<<___ if ($n);
+ vmovups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ vpxor $in,$iv,$iv
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+0)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+1)-112`($key),$rndkey[0]
+ je .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+2)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+3)-112`($key),$rndkey[0]
+.Lvaesenclast$sn:
+ vaesenclast $rndkey[0],$iv,$iv
+ vmovups -112($key),$rndkey[0]
+ vmovups 16-112($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_avx_16_31() # recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[1],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ &vpsrld (@Tx[0],@Tx[1],30);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[1],@Tx[1],2);
+ &vpxor (@X[0],@X[0],@Tx[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ &vmovdqa ($Kx,eval(16*($Xi/5))."($K_XX_XX)") if ($Xi%5==0);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (shift);
+
+ &vmovdqa(@Tx[1],"64($K_XX_XX)"); # pbswap mask
+ &vmovdqa($Kx,"0($K_XX_XX)"); # K_00_19
+ &vmovdqu(@X[-4&7],"0($inp)"); # load input
+ &vmovdqu(@X[-3&7],"16($inp)");
+ &vmovdqu(@X[-2&7],"32($inp)");
+ &vmovdqu(@X[-1&7],"48($inp)");
+ &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align 32
+.Loop_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_avx
+
+.Ldone_avx:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ vmovups $iv,($ivp) # write IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_avx:
+ ret
+.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
+___
+
+ if ($stitched_decrypt) {{{
+# reset
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+$j=$jj=$r=$rx=0;
+$Xi=4;
+
+@aes256_dec = (
+ '&vpxor ($inout0,$rndkey0,"0x00($in0)");',
+ '&vpxor ($inout1,$rndkey0,"0x10($in0)");',
+ '&vpxor ($inout2,$rndkey0,"0x20($in0)");',
+ '&vpxor ($inout3,$rndkey0,"0x30($in0)");',
+
+ '&vmovups($rndkey0,"16-112($key)");',
+ '&vmovups("64(%rsp)",@X[2]);', # save IV, originally @X[3]
+ undef,undef
+ );
+for ($i=0;$i<13;$i++) {
+ push (@aes256_dec,(
+ '&vaesdec ($inout0,$inout0,$rndkey0);',
+ '&vaesdec ($inout1,$inout1,$rndkey0);',
+ '&vaesdec ($inout2,$inout2,$rndkey0);',
+ '&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
+ ));
+ push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11);
+ push (@aes256_dec,(undef,undef)) if ($i==5);
+}
+push(@aes256_dec,(
+ '&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");',
+ '&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");',
+ '&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");',
+ '&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");',
+
+ '&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");',
+ '&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);',
+ '&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);',
+ '&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);',
+
+ '&vmovups ("0x30($out,$in0)",$inout3);'
+ ));
+
+$code.=<<___;
+.type aesni256_cbc_sha1_dec_avx,\@function,6
+.align 32
+aesni256_cbc_sha1_dec_avx:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_dec_avx:
+___
+$code.=<<___;
+ vzeroall
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ lea 112($key),%r15 # size optimization
+ vmovdqu ($ivp),@X[3] # load IV
+___
+($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),$Kx # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vpaddd $Kx,@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ vmovups -112($key),$rndkey0 # $key[0]
+ jmp .Loop_dec_avx
+
+.align 32
+.Loop_dec_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_16_31(\&body_00_19_dec);
+ &Xupdate_avx_32_79(\&body_00_19_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_40_59_dec);
+ &Xupdate_avx_32_79(\&body_20_39_dec);
+ &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_rx=$rx;
+
+ &Xloop_avx(\&body_20_39_dec);
+ &Xloop_avx(\&body_20_39_dec);
+ &Xloop_avx(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $C,@T[1]
+ mov $D,12($ctx)
+ xor $D,@T[1]
+ mov $E,16($ctx)
+ and @T[1],@T[0]
+ jmp .Loop_dec_avx
+
+.Ldone_dec_avx:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $rx=$saved_rx;
+
+ &Xtail_avx(\&body_20_39_dec);
+ &Xtail_avx(\&body_20_39_dec);
+ &Xtail_avx(\&body_20_39_dec);
+
+ eval(@aes256_dec[-1]); # last store
+$code.=<<___;
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ vmovups @X[3],($ivp) # write IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_dec_avx:
+ ret
+.size aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
+___
+ }}}
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+
+.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+ if ($shaext) {{{
+($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+$rounds="%r11d";
+
+($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
+@rndkey=("%xmm0","%xmm1");
+$r=0;
+
+my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
+my @MSG=map("%xmm$_",(3..6));
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_shaext,\@function,6
+.align 32
+aesni_cbc_sha1_enc_shaext:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+___
+$code.=<<___ if ($win64);
+ lea `-8-10*16`(%rsp),%rsp
+ movaps %xmm6,-8-10*16(%rax)
+ movaps %xmm7,-8-9*16(%rax)
+ movaps %xmm8,-8-8*16(%rax)
+ movaps %xmm9,-8-7*16(%rax)
+ movaps %xmm10,-8-6*16(%rax)
+ movaps %xmm11,-8-5*16(%rax)
+ movaps %xmm12,-8-4*16(%rax)
+ movaps %xmm13,-8-3*16(%rax)
+ movaps %xmm14,-8-2*16(%rax)
+ movaps %xmm15,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+ movdqu ($ctx),$ABCD
+ movd 16($ctx),$E
+ movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap
+
+ mov 240($key),$rounds
+ sub $in0,$out
+ movups ($key),$rndkey0 # $key[0]
+ movups ($ivp),$iv # load IV
+ movups 16($key),$rndkey[0] # forward reference
+ lea 112($key),$key # size optimization
+
+ pshufd \$0b00011011,$ABCD,$ABCD # flip word order
+ pshufd \$0b00011011,$E,$E # flip word order
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+___
+ &$aesenc();
+$code.=<<___;
+ movdqu ($inp),@MSG[0]
+ movdqa $E,$E_SAVE # offload $E
+ pshufb $BSWAP,@MSG[0]
+ movdqu 0x10($inp),@MSG[1]
+ movdqa $ABCD,$ABCD_SAVE # offload $ABCD
+___
+ &$aesenc();
+$code.=<<___;
+ pshufb $BSWAP,@MSG[1]
+
+ paddd @MSG[0],$E
+ movdqu 0x20($inp),@MSG[2]
+ lea 0x40($inp),$inp
+ pxor $E_SAVE,@MSG[0] # black magic
+___
+ &$aesenc();
+$code.=<<___;
+ pxor $E_SAVE,@MSG[0] # black magic
+ movdqa $ABCD,$E_
+ pshufb $BSWAP,@MSG[2]
+ sha1rnds4 \$0,$E,$ABCD # 0-3
+ sha1nexte @MSG[1],$E_
+___
+ &$aesenc();
+$code.=<<___;
+ sha1msg1 @MSG[1],@MSG[0]
+ movdqu -0x10($inp),@MSG[3]
+ movdqa $ABCD,$E
+ pshufb $BSWAP,@MSG[3]
+___
+ &$aesenc();
+$code.=<<___;
+ sha1rnds4 \$0,$E_,$ABCD # 4-7
+ sha1nexte @MSG[2],$E
+ pxor @MSG[2],@MSG[0]
+ sha1msg1 @MSG[2],@MSG[1]
+___
+ &$aesenc();
+
+for($i=2;$i<20-4;$i++) {
+$code.=<<___;
+ movdqa $ABCD,$E_
+ sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11
+ sha1nexte @MSG[3],$E_
+___
+ &$aesenc();
+$code.=<<___;
+ sha1msg2 @MSG[3],@MSG[0]
+ pxor @MSG[3],@MSG[1]
+ sha1msg1 @MSG[3],@MSG[2]
+___
+ ($E,$E_)=($E_,$E);
+ push(@MSG,shift(@MSG));
+
+ &$aesenc();
+}
+$code.=<<___;
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 64-67
+ sha1nexte @MSG[3],$E_
+ sha1msg2 @MSG[3],@MSG[0]
+ pxor @MSG[3],@MSG[1]
+___
+ &$aesenc();
+$code.=<<___;
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 68-71
+ sha1nexte @MSG[0],$E
+ sha1msg2 @MSG[0],@MSG[1]
+___
+ &$aesenc();
+$code.=<<___;
+ movdqa $E_SAVE,@MSG[0]
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 72-75
+ sha1nexte @MSG[1],$E_
+___
+ &$aesenc();
+$code.=<<___;
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 76-79
+ sha1nexte $MSG[0],$E
+___
+ while($r<40) { &$aesenc(); } # remaining aesenc's
+$code.=<<___;
+ dec $len
+
+ paddd $ABCD_SAVE,$ABCD
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+ jnz .Loop_shaext
+
+ pshufd \$0b00011011,$ABCD,$ABCD
+ pshufd \$0b00011011,$E,$E
+ movups $iv,($ivp) # write IV
+ movdqu $ABCD,($ctx)
+ movd $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps -8-10*16(%rax),%xmm6
+ movaps -8-9*16(%rax),%xmm7
+ movaps -8-8*16(%rax),%xmm8
+ movaps -8-7*16(%rax),%xmm9
+ movaps -8-6*16(%rax),%xmm10
+ movaps -8-5*16(%rax),%xmm11
+ movaps -8-4*16(%rax),%xmm12
+ movaps -8-3*16(%rax),%xmm13
+ movaps -8-2*16(%rax),%xmm14
+ movaps -8-1*16(%rax),%xmm15
+ mov %rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+ ret
+.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
+___
+ }}}
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+___
+$code.=<<___ if ($shaext);
+ lea aesni_cbc_sha1_enc_shaext(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lseh_no_shaext
+
+ lea (%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea 168(%rax),%rax # adjust stack pointer
+ jmp .Lcommon_seh_tail
+.Lseh_no_shaext:
+___
+$code.=<<___;
+ lea 96(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea `104+10*16`(%rax),%rax # adjust stack pointer
+
+ mov 0(%rax),%r15
+ mov 8(%rax),%r14
+ mov 16(%rax),%r13
+ mov 24(%rax),%r12
+ mov 32(%rax),%rbp
+ mov 40(%rax),%rbx
+ lea 48(%rax),%rax
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size ssse3_handler,.-ssse3_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_end_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_info_aesni_cbc_sha1_enc_avx
+___
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_shaext
+ .rva .LSEH_end_aesni_cbc_sha1_enc_shaext
+ .rva .LSEH_info_aesni_cbc_sha1_enc_shaext
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_aesni_cbc_sha1_enc_ssse3:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_cbc_sha1_enc_avx:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
+___
+$code.=<<___ if ($shaext);
+.LSEH_info_aesni_cbc_sha1_enc_shaext:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
+___
+}
+
+####################################################################
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ unshift @opcode,$rex|0x40 if($rex);
+}
+
+sub sha1rnds4 {
+ if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x3a,0xcc);
+ rex(\@opcode,$3,$2);
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return "sha1rnds4\t".@_[0];
+ }
+}
+
+sub sha1op38 {
+ my $instr = shift;
+ my %opcodelet = (
+ "sha1nexte" => 0xc8,
+ "sha1msg1" => 0xc9,
+ "sha1msg2" => 0xca );
+
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x38);
+ rex(\@opcode,$2,$1);
+ push @opcode,$opcodelet{$instr};
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return $instr."\t".@_[0];
+ }
+}
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x0f,0x38);
+
+ if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ unshift @opcode,0x66;
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
+ s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or
+ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aesni-sha256-x86_64.pl b/openssl-1.1.0h/crypto/aes/asm/aesni-sha256-x86_64.pl
new file mode 100644
index 0000000..0e49f26
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -0,0 +1,1717 @@
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# January 2013
+#
+# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
+# in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encrypt exhibit *very* low instruction-level
+# parallelism, interleaving it with another algorithm would allow to
+# utilize processor resources better and achieve better performance.
+# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
+# AESNI code is weaved into it. As SHA256 dominates execution time,
+# stitch performance does not depend on AES key length. Below are
+# performance numbers in cycles per processed byte, less is better,
+# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
+# subroutine:
+#
+# AES-128/-192/-256+SHA256 this(**) gain
+# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
+# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
+# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
+# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
+# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
+# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54%
+# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60%
+#
+# (*) there are XOP, AVX1 and AVX2 code paths, meaning that
+# Westmere is omitted from loop, this is because gain was not
+# estimated high enough to justify the effort;
+# (**) these are EVP-free results, results obtained with 'speed
+# -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
+# (***) these are SHAEXT results;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=12);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+$shaext=$avx; ### set to zero if compiling for 1.0.1
+$avx=1 if (!$shaext && $avx);
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$func="aesni_cbc_sha256_enc";
+$TABLE="K256";
+$SZ=4;
+@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
+ "%r8d","%r9d","%r10d","%r11d");
+($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+$rounds=64;
+
+########################################################################
+# void aesni_cbc_sha256_enc(const void *inp,
+# void *out,
+# size_t length,
+# const AES_KEY *key,
+# unsigned char *iv,
+# SHA256_CTX *ctx,
+# const void *in0);
+($inp, $out, $len, $key, $ivp, $ctx, $in0) =
+("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+$Tbl="%rbp";
+
+$_inp="16*$SZ+0*8(%rsp)";
+$_out="16*$SZ+1*8(%rsp)";
+$_end="16*$SZ+2*8(%rsp)";
+$_key="16*$SZ+3*8(%rsp)";
+$_ivp="16*$SZ+4*8(%rsp)";
+$_ctx="16*$SZ+5*8(%rsp)";
+$_in0="16*$SZ+6*8(%rsp)";
+$_rsp="16*$SZ+7*8(%rsp)";
+$framesz=16*$SZ+8*8;
+
+$code=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+.globl $func
+.type $func,\@abi-omnipotent
+.align 16
+$func:
+___
+ if ($avx) {
+$code.=<<___;
+ lea OPENSSL_ia32cap_P(%rip),%r11
+ mov \$1,%eax
+ cmp \$0,`$win64?"%rcx":"%rdi"`
+ je .Lprobe
+ mov 0(%r11),%eax
+ mov 4(%r11),%r10
+___
+$code.=<<___ if ($shaext);
+ bt \$61,%r10 # check for SHA
+ jc ${func}_shaext
+___
+$code.=<<___;
+ mov %r10,%r11
+ shr \$32,%r11
+
+ test \$`1<<11`,%r10d # check for XOP
+ jnz ${func}_xop
+___
+$code.=<<___ if ($avx>1);
+ and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
+ cmp \$`1<<8|1<<5|1<<3`,%r11d
+ je ${func}_avx2
+___
+$code.=<<___;
+ and \$`1<<28`,%r10d # check for AVX
+ jnz ${func}_avx
+ ud2
+___
+ }
+$code.=<<___;
+ xor %eax,%eax
+ cmp \$0,`$win64?"%rcx":"%rdi"`
+ je .Lprobe
+ ud2
+.Lprobe:
+ ret
+.size $func,.-$func
+
+.align 64
+.type $TABLE,\@object
+$TABLE:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+ .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+ .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
+ .long 0,0,0,0, 0,0,0,0
+ .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+######################################################################
+# SIMD code paths
+#
+{{{
+($iv,$inout,$roundkey,$temp,
+ $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
+
+$aesni_cbc_idx=0;
+@aesni_cbc_block = (
+## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
+## &vmovdqu ($inout,($inp));
+## &mov ($_inp,$inp);
+
+ '&vpxor ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
+
+ '&vpxor ($inout,$inout,$iv);',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
+
+ '&vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
+
+ '&vaesenclast ($temp,$inout,$roundkey);'.
+ ' &vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
+
+ '&vpand ($iv,$temp,$mask10);'.
+ ' &vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
+
+ '&vaesenclast ($temp,$inout,$roundkey);'.
+ ' &vaesenc ($inout,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
+
+ '&vpand ($temp,$temp,$mask12);'.
+ ' &vaesenc ($inout,$inout,$roundkey);'.
+ '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
+
+ '&vpor ($iv,$iv,$temp);'.
+ ' &vaesenclast ($temp,$inout,$roundkey);'.
+ ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
+
+## &mov ($inp,$_inp);
+## &mov ($out,$_out);
+## &vpand ($temp,$temp,$mask14);
+## &vpor ($iv,$iv,$temp);
+## &vmovdqu ($iv,($out,$inp);
+## &lea (inp,16($inp));
+);
+
+my $a4=$T1;
+my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
+
+ '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
+ '&mov ($a,$a1)',
+ '&mov ($a4,$f)',
+
+ '&xor ($a0,$e)',
+ '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
+ '&xor ($a4,$g)', # f^g
+
+ '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
+ '&xor ($a1,$a)',
+ '&and ($a4,$e)', # (f^g)&e
+
+ @aesni_cbc_block[$aesni_cbc_idx++].
+ '&xor ($a0,$e)',
+ '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
+ '&mov ($a2,$a)',
+
+ '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
+ '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
+ '&xor ($a2,$b)', # a^b, b^c in next round
+
+ '&ror ($a0,$Sigma1[0])', # Sigma1(e)
+ '&add ($h,$a4)', # h+=Ch(e,f,g)
+ '&and ($a3,$a2)', # (b^c)&(a^b)
+
+ '&xor ($a1,$a)',
+ '&add ($h,$a0)', # h+=Sigma1(e)
+ '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
+
+ '&add ($d,$h)', # d+=h
+ '&ror ($a1,$Sigma0[0])', # Sigma0(a)
+ '&add ($h,$a3)', # h+=Maj(a,b,c)
+
+ '&mov ($a0,$d)',
+ '&add ($a1,$h);'. # h+=Sigma0(a)
+ '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
+ );
+}
+
+if ($avx) {{
+######################################################################
+# XOP code path
+#
+$code.=<<___;
+.type ${func}_xop,\@function,6
+.align 64
+${func}_xop:
+.Lxop_shortcut:
+ mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%r11 # copy %rsp
+ sub \$`$framesz+$win64*16*10`,%rsp
+ and \$-64,%rsp # align stack frame
+
+ shl \$6,$len
+ sub $inp,$out # re-bias
+ sub $inp,$in0
+ add $inp,$len # end of input
+
+ #mov $inp,$_inp # saved later
+ mov $out,$_out
+ mov $len,$_end
+ #mov $key,$_key # remains resident in $inp register
+ mov $ivp,$_ivp
+ mov $ctx,$_ctx
+ mov $in0,$_in0
+ mov %r11,$_rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,`$framesz+16*0`(%rsp)
+ movaps %xmm7,`$framesz+16*1`(%rsp)
+ movaps %xmm8,`$framesz+16*2`(%rsp)
+ movaps %xmm9,`$framesz+16*3`(%rsp)
+ movaps %xmm10,`$framesz+16*4`(%rsp)
+ movaps %xmm11,`$framesz+16*5`(%rsp)
+ movaps %xmm12,`$framesz+16*6`(%rsp)
+ movaps %xmm13,`$framesz+16*7`(%rsp)
+ movaps %xmm14,`$framesz+16*8`(%rsp)
+ movaps %xmm15,`$framesz+16*9`(%rsp)
+___
+$code.=<<___;
+.Lprologue_xop:
+ vzeroall
+
+ mov $inp,%r12 # borrow $a4
+ lea 0x80($key),$inp # size optimization, reassign
+ lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
+ mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
+ mov $ctx,%r15 # borrow $a2
+ mov $in0,%rsi # borrow $a3
+ vmovdqu ($ivp),$iv # load IV
+ sub \$9,%r14
+
+ mov $SZ*0(%r15),$A
+ mov $SZ*1(%r15),$B
+ mov $SZ*2(%r15),$C
+ mov $SZ*3(%r15),$D
+ mov $SZ*4(%r15),$E
+ mov $SZ*5(%r15),$F
+ mov $SZ*6(%r15),$G
+ mov $SZ*7(%r15),$H
+
+ vmovdqa 0x00(%r13,%r14,8),$mask14
+ vmovdqa 0x10(%r13,%r14,8),$mask12
+ vmovdqa 0x20(%r13,%r14,8),$mask10
+ vmovdqu 0x00-0x80($inp),$roundkey
+ jmp .Lloop_xop
+___
+ if ($SZ==4) { # SHA256
+ my @X = map("%xmm$_",(0..3));
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
+
+$code.=<<___;
+.align 16
+.Lloop_xop:
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ vmovdqu 0x00(%rsi,%r12),@X[0]
+ vmovdqu 0x10(%rsi,%r12),@X[1]
+ vmovdqu 0x20(%rsi,%r12),@X[2]
+ vmovdqu 0x30(%rsi,%r12),@X[3]
+ vpshufb $t3,@X[0],@X[0]
+ lea $TABLE(%rip),$Tbl
+ vpshufb $t3,@X[1],@X[1]
+ vpshufb $t3,@X[2],@X[2]
+ vpaddd 0x00($Tbl),@X[0],$t0
+ vpshufb $t3,@X[3],@X[3]
+ vpaddd 0x20($Tbl),@X[1],$t1
+ vpaddd 0x40($Tbl),@X[2],$t2
+ vpaddd 0x60($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ mov $A,$a1
+ vmovdqa $t1,0x10(%rsp)
+ mov $B,$a3
+ vmovdqa $t2,0x20(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x30(%rsp)
+ mov $E,$a0
+ jmp .Lxop_00_47
+
+.align 16
+.Lxop_00_47:
+ sub \$-16*2*$SZ,$Tbl # size optimization
+ vmovdqu (%r12),$inout # $a4
+ mov %r12,$_inp # $a4
+___
+sub XOP_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
+
+ &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrld ($t0,$t0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t0,$t0,$t1);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrld ($t2,@X[3],$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq ($t3,$t3,8);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrld ($t2,@X[0],$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpslldq ($t3,$t3,8); # 22 instructions
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa (16*$j."(%rsp)",$t2);
+}
+
+ $aesni_cbc_idx=0;
+ for ($i=0,$j=0; $j<4; $j++) {
+ &XOP_256_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &mov ("%r12",$_inp); # borrow $a4
+ &vpand ($temp,$temp,$mask14);
+ &mov ("%r15",$_out); # borrow $a2
+ &vpor ($iv,$iv,$temp);
+ &vmovdqu ("(%r15,%r12)",$iv); # write output
+ &lea ("%r12","16(%r12)"); # inp++
+
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
+ &jne (".Lxop_00_47");
+
+ &vmovdqu ($inout,"(%r12)");
+ &mov ($_inp,"%r12");
+
+ $aesni_cbc_idx=0;
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+ }
+$code.=<<___;
+ mov $_inp,%r12 # borrow $a4
+ mov $_out,%r13 # borrow $a0
+ mov $_ctx,%r15 # borrow $a2
+ mov $_in0,%rsi # borrow $a3
+
+ vpand $mask14,$temp,$temp
+ mov $a1,$A
+ vpor $temp,$iv,$iv
+ vmovdqu $iv,(%r13,%r12) # write output
+ lea 16(%r12),%r12 # inp++
+
+ add $SZ*0(%r15),$A
+ add $SZ*1(%r15),$B
+ add $SZ*2(%r15),$C
+ add $SZ*3(%r15),$D
+ add $SZ*4(%r15),$E
+ add $SZ*5(%r15),$F
+ add $SZ*6(%r15),$G
+ add $SZ*7(%r15),$H
+
+ cmp $_end,%r12
+
+ mov $A,$SZ*0(%r15)
+ mov $B,$SZ*1(%r15)
+ mov $C,$SZ*2(%r15)
+ mov $D,$SZ*3(%r15)
+ mov $E,$SZ*4(%r15)
+ mov $F,$SZ*5(%r15)
+ mov $G,$SZ*6(%r15)
+ mov $H,$SZ*7(%r15)
+
+ jb .Lloop_xop
+
+ mov $_ivp,$ivp
+ mov $_rsp,%rsi
+ vmovdqu $iv,($ivp) # output IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps `$framesz+16*0`(%rsp),%xmm6
+ movaps `$framesz+16*1`(%rsp),%xmm7
+ movaps `$framesz+16*2`(%rsp),%xmm8
+ movaps `$framesz+16*3`(%rsp),%xmm9
+ movaps `$framesz+16*4`(%rsp),%xmm10
+ movaps `$framesz+16*5`(%rsp),%xmm11
+ movaps `$framesz+16*6`(%rsp),%xmm12
+ movaps `$framesz+16*7`(%rsp),%xmm13
+ movaps `$framesz+16*8`(%rsp),%xmm14
+ movaps `$framesz+16*9`(%rsp),%xmm15
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_xop:
+ ret
+.size ${func}_xop,.-${func}_xop
+___
+######################################################################
+# AVX+shrd code path
+#
+local *ror = sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type ${func}_avx,\@function,6
+.align 64
+${func}_avx:
+.Lavx_shortcut:
+ mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%r11 # copy %rsp
+ sub \$`$framesz+$win64*16*10`,%rsp
+ and \$-64,%rsp # align stack frame
+
+ shl \$6,$len
+ sub $inp,$out # re-bias
+ sub $inp,$in0
+ add $inp,$len # end of input
+
+ #mov $inp,$_inp # saved later
+ mov $out,$_out
+ mov $len,$_end
+ #mov $key,$_key # remains resident in $inp register
+ mov $ivp,$_ivp
+ mov $ctx,$_ctx
+ mov $in0,$_in0
+ mov %r11,$_rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,`$framesz+16*0`(%rsp)
+ movaps %xmm7,`$framesz+16*1`(%rsp)
+ movaps %xmm8,`$framesz+16*2`(%rsp)
+ movaps %xmm9,`$framesz+16*3`(%rsp)
+ movaps %xmm10,`$framesz+16*4`(%rsp)
+ movaps %xmm11,`$framesz+16*5`(%rsp)
+ movaps %xmm12,`$framesz+16*6`(%rsp)
+ movaps %xmm13,`$framesz+16*7`(%rsp)
+ movaps %xmm14,`$framesz+16*8`(%rsp)
+ movaps %xmm15,`$framesz+16*9`(%rsp)
+___
+$code.=<<___;
+.Lprologue_avx:
+ vzeroall
+
+ mov $inp,%r12 # borrow $a4
+ lea 0x80($key),$inp # size optimization, reassign
+ lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
+ mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
+ mov $ctx,%r15 # borrow $a2
+ mov $in0,%rsi # borrow $a3
+ vmovdqu ($ivp),$iv # load IV
+ sub \$9,%r14
+
+ mov $SZ*0(%r15),$A
+ mov $SZ*1(%r15),$B
+ mov $SZ*2(%r15),$C
+ mov $SZ*3(%r15),$D
+ mov $SZ*4(%r15),$E
+ mov $SZ*5(%r15),$F
+ mov $SZ*6(%r15),$G
+ mov $SZ*7(%r15),$H
+
+ vmovdqa 0x00(%r13,%r14,8),$mask14
+ vmovdqa 0x10(%r13,%r14,8),$mask12
+ vmovdqa 0x20(%r13,%r14,8),$mask10
+ vmovdqu 0x00-0x80($inp),$roundkey
+___
+ if ($SZ==4) { # SHA256
+ my @X = map("%xmm$_",(0..3));
+ my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
+
+$code.=<<___;
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ vmovdqu 0x00(%rsi,%r12),@X[0]
+ vmovdqu 0x10(%rsi,%r12),@X[1]
+ vmovdqu 0x20(%rsi,%r12),@X[2]
+ vmovdqu 0x30(%rsi,%r12),@X[3]
+ vpshufb $t3,@X[0],@X[0]
+ lea $TABLE(%rip),$Tbl
+ vpshufb $t3,@X[1],@X[1]
+ vpshufb $t3,@X[2],@X[2]
+ vpaddd 0x00($Tbl),@X[0],$t0
+ vpshufb $t3,@X[3],@X[3]
+ vpaddd 0x20($Tbl),@X[1],$t1
+ vpaddd 0x40($Tbl),@X[2],$t2
+ vpaddd 0x60($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ mov $A,$a1
+ vmovdqa $t1,0x10(%rsp)
+ mov $B,$a3
+ vmovdqa $t2,0x20(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x30(%rsp)
+ mov $E,$a0
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ sub \$-16*2*$SZ,$Tbl # size optimization
+ vmovdqu (%r12),$inout # $a4
+ mov %r12,$_inp # $a4
+___
+sub Xupdate_256_AVX () {
+ (
+ '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
+ '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
+ '&vpsrld ($t2,$t0,$sigma0[0]);',
+ '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
+ '&vpsrld ($t3,$t0,$sigma0[2])',
+ '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
+ '&vpxor ($t0,$t3,$t2)',
+ '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
+ '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
+ '&vpxor ($t0,$t0,$t1)',
+ '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
+ '&vpxor ($t0,$t0,$t2)',
+ '&vpsrld ($t2,$t3,$sigma1[2]);',
+ '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
+ '&vpsrlq ($t3,$t3,$sigma1[0]);',
+ '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
+ '&vpxor ($t2,$t2,$t3);',
+ '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
+ '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
+ '&vpshufd ($t2,$t2,0b10000100)',
+ '&vpsrldq ($t2,$t2,8)',
+ '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
+ '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
+ '&vpsrld ($t2,$t3,$sigma1[2])',
+ '&vpsrlq ($t3,$t3,$sigma1[0])',
+ '&vpxor ($t2,$t2,$t3);',
+ '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
+ '&vpxor ($t2,$t2,$t3)',
+ '&vpshufd ($t2,$t2,0b11101000)',
+ '&vpslldq ($t2,$t2,8)',
+ '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
+ );
+}
+
+sub AVX_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
+
+ foreach (Xupdate_256_AVX()) { # 29 instructions
+ eval;
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ }
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa (16*$j."(%rsp)",$t2);
+}
+
+ $aesni_cbc_idx=0;
+ for ($i=0,$j=0; $j<4; $j++) {
+ &AVX_256_00_47($j,\&body_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &mov ("%r12",$_inp); # borrow $a4
+ &vpand ($temp,$temp,$mask14);
+ &mov ("%r15",$_out); # borrow $a2
+ &vpor ($iv,$iv,$temp);
+ &vmovdqu ("(%r15,%r12)",$iv); # write output
+ &lea ("%r12","16(%r12)"); # inp++
+
+ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
+ &jne (".Lavx_00_47");
+
+ &vmovdqu ($inout,"(%r12)");
+ &mov ($_inp,"%r12");
+
+ $aesni_cbc_idx=0;
+ for ($i=0; $i<16; ) {
+ foreach(body_00_15()) { eval; }
+ }
+
+ }
+$code.=<<___;
+ mov $_inp,%r12 # borrow $a4
+ mov $_out,%r13 # borrow $a0
+ mov $_ctx,%r15 # borrow $a2
+ mov $_in0,%rsi # borrow $a3
+
+ vpand $mask14,$temp,$temp
+ mov $a1,$A
+ vpor $temp,$iv,$iv
+ vmovdqu $iv,(%r13,%r12) # write output
+ lea 16(%r12),%r12 # inp++
+
+ add $SZ*0(%r15),$A
+ add $SZ*1(%r15),$B
+ add $SZ*2(%r15),$C
+ add $SZ*3(%r15),$D
+ add $SZ*4(%r15),$E
+ add $SZ*5(%r15),$F
+ add $SZ*6(%r15),$G
+ add $SZ*7(%r15),$H
+
+ cmp $_end,%r12
+
+ mov $A,$SZ*0(%r15)
+ mov $B,$SZ*1(%r15)
+ mov $C,$SZ*2(%r15)
+ mov $D,$SZ*3(%r15)
+ mov $E,$SZ*4(%r15)
+ mov $F,$SZ*5(%r15)
+ mov $G,$SZ*6(%r15)
+ mov $H,$SZ*7(%r15)
+ jb .Lloop_avx
+
+ mov $_ivp,$ivp
+ mov $_rsp,%rsi
+ vmovdqu $iv,($ivp) # output IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps `$framesz+16*0`(%rsp),%xmm6
+ movaps `$framesz+16*1`(%rsp),%xmm7
+ movaps `$framesz+16*2`(%rsp),%xmm8
+ movaps `$framesz+16*3`(%rsp),%xmm9
+ movaps `$framesz+16*4`(%rsp),%xmm10
+ movaps `$framesz+16*5`(%rsp),%xmm11
+ movaps `$framesz+16*6`(%rsp),%xmm12
+ movaps `$framesz+16*7`(%rsp),%xmm13
+ movaps `$framesz+16*8`(%rsp),%xmm14
+ movaps `$framesz+16*9`(%rsp),%xmm15
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_avx:
+ ret
+.size ${func}_avx,.-${func}_avx
+___
+
+if ($avx>1) {{
+######################################################################
+# AVX2+BMI code path
+#
+my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
+my $PUSH8=8*2*$SZ;
+use integer;
+
+sub bodyx_00_15 () {
+ # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
+
+ '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
+ '&and ($a4,$e)', # f&e
+ '&rorx ($a0,$e,$Sigma1[2])',
+ '&rorx ($a2,$e,$Sigma1[1])',
+
+ '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
+ '&lea ($h,"($h,$a4)")',
+ '&andn ($a4,$e,$g)', # ~e&g
+ '&xor ($a0,$a2)',
+
+ '&rorx ($a1,$e,$Sigma1[0])',
+ '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
+ '&xor ($a0,$a1)', # Sigma1(e)
+ '&mov ($a2,$a)',
+
+ '&rorx ($a4,$a,$Sigma0[2])',
+ '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
+ '&xor ($a2,$b)', # a^b, b^c in next round
+ '&rorx ($a1,$a,$Sigma0[1])',
+
+ '&rorx ($a0,$a,$Sigma0[0])',
+ '&lea ($d,"($d,$h)")', # d+=h
+ '&and ($a3,$a2)', # (b^c)&(a^b)
+ @aesni_cbc_block[$aesni_cbc_idx++].
+ '&xor ($a1,$a4)',
+
+ '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
+ '&xor ($a1,$a0)', # Sigma0(a)
+ '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
+ '&mov ($a4,$e)', # copy of f in future
+
+ '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
+ );
+ # and at the finish one has to $a+=$a1
+}
+
+$code.=<<___;
+.type ${func}_avx2,\@function,6
+.align 64
+${func}_avx2:
+.Lavx2_shortcut:
+ mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%r11 # copy %rsp
+ sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
+ and \$-256*$SZ,%rsp # align stack frame
+ add \$`2*$SZ*($rounds-8)`,%rsp
+
+ shl \$6,$len
+ sub $inp,$out # re-bias
+ sub $inp,$in0
+ add $inp,$len # end of input
+
+ #mov $inp,$_inp # saved later
+ #mov $out,$_out # kept in $offload
+ mov $len,$_end
+ #mov $key,$_key # remains resident in $inp register
+ mov $ivp,$_ivp
+ mov $ctx,$_ctx
+ mov $in0,$_in0
+ mov %r11,$_rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,`$framesz+16*0`(%rsp)
+ movaps %xmm7,`$framesz+16*1`(%rsp)
+ movaps %xmm8,`$framesz+16*2`(%rsp)
+ movaps %xmm9,`$framesz+16*3`(%rsp)
+ movaps %xmm10,`$framesz+16*4`(%rsp)
+ movaps %xmm11,`$framesz+16*5`(%rsp)
+ movaps %xmm12,`$framesz+16*6`(%rsp)
+ movaps %xmm13,`$framesz+16*7`(%rsp)
+ movaps %xmm14,`$framesz+16*8`(%rsp)
+ movaps %xmm15,`$framesz+16*9`(%rsp)
+___
+$code.=<<___;
+.Lprologue_avx2:
+ vzeroall
+
+ mov $inp,%r13 # borrow $a0
+ vpinsrq \$1,$out,$offload,$offload
+ lea 0x80($key),$inp # size optimization, reassign
+ lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
+ mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
+ mov $ctx,%r15 # borrow $a2
+ mov $in0,%rsi # borrow $a3
+ vmovdqu ($ivp),$iv # load IV
+ lea -9(%r14),%r14
+
+ vmovdqa 0x00(%r12,%r14,8),$mask14
+ vmovdqa 0x10(%r12,%r14,8),$mask12
+ vmovdqa 0x20(%r12,%r14,8),$mask10
+
+ sub \$-16*$SZ,%r13 # inp++, size optimization
+ mov $SZ*0(%r15),$A
+ lea (%rsi,%r13),%r12 # borrow $a0
+ mov $SZ*1(%r15),$B
+ cmp $len,%r13 # $_end
+ mov $SZ*2(%r15),$C
+ cmove %rsp,%r12 # next block or random data
+ mov $SZ*3(%r15),$D
+ mov $SZ*4(%r15),$E
+ mov $SZ*5(%r15),$F
+ mov $SZ*6(%r15),$G
+ mov $SZ*7(%r15),$H
+ vmovdqu 0x00-0x80($inp),$roundkey
+___
+ if ($SZ==4) { # SHA256
+ my @X = map("%ymm$_",(0..3));
+ my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
+
+$code.=<<___;
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
+ vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
+ vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
+ vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
+ vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
+ vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
+
+ vinserti128 \$1,(%r12),@X[0],@X[0]
+ vinserti128 \$1,16(%r12),@X[1],@X[1]
+ vpshufb $t3,@X[0],@X[0]
+ vinserti128 \$1,32(%r12),@X[2],@X[2]
+ vpshufb $t3,@X[1],@X[1]
+ vinserti128 \$1,48(%r12),@X[3],@X[3]
+
+ lea $TABLE(%rip),$Tbl
+ vpshufb $t3,@X[2],@X[2]
+ lea -16*$SZ(%r13),%r13
+ vpaddd 0x00($Tbl),@X[0],$t0
+ vpshufb $t3,@X[3],@X[3]
+ vpaddd 0x20($Tbl),@X[1],$t1
+ vpaddd 0x40($Tbl),@X[2],$t2
+ vpaddd 0x60($Tbl),@X[3],$t3
+ vmovdqa $t0,0x00(%rsp)
+ xor $a1,$a1
+ vmovdqa $t1,0x20(%rsp)
+ lea -$PUSH8(%rsp),%rsp
+ mov $B,$a3
+ vmovdqa $t2,0x00(%rsp)
+ xor $C,$a3 # magic
+ vmovdqa $t3,0x20(%rsp)
+ mov $F,$a4
+ sub \$-16*2*$SZ,$Tbl # size optimization
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+ vmovdqu (%r13),$inout
+ vpinsrq \$0,%r13,$offload,$offload
+___
+
+sub AVX2_256_00_47 () {
+my $j = shift;
+my $body = shift;
+my @X = @_;
+my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
+my $base = "+2*$PUSH8(%rsp)";
+
+ &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
+ foreach (Xupdate_256_AVX()) { # 29 instructions
+ eval;
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ }
+ &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
+ foreach (@insns) { eval; } # remaining instructions
+ &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
+}
+ $aesni_cbc_idx=0;
+ for ($i=0,$j=0; $j<4; $j++) {
+ &AVX2_256_00_47($j,\&bodyx_00_15,@X);
+ push(@X,shift(@X)); # rotate(@X)
+ }
+ &vmovq ("%r13",$offload); # borrow $a0
+ &vpextrq ("%r15",$offload,1); # borrow $a2
+ &vpand ($temp,$temp,$mask14);
+ &vpor ($iv,$iv,$temp);
+ &vmovdqu ("(%r15,%r13)",$iv); # write output
+ &lea ("%r13","16(%r13)"); # inp++
+
+ &lea ($Tbl,16*2*$SZ."($Tbl)");
+ &cmpb (($SZ-1)."($Tbl)",0);
+ &jne (".Lavx2_00_47");
+
+ &vmovdqu ($inout,"(%r13)");
+ &vpinsrq ($offload,$offload,"%r13",0);
+
+ $aesni_cbc_idx=0;
+ for ($i=0; $i<16; ) {
+ my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
+ foreach(bodyx_00_15()) { eval; }
+ }
+ }
+$code.=<<___;
+ vpextrq \$1,$offload,%r12 # $_out, borrow $a4
+ vmovq $offload,%r13 # $_inp, borrow $a0
+ mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
+ add $a1,$A
+ lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
+
+ vpand $mask14,$temp,$temp
+ vpor $temp,$iv,$iv
+ vmovdqu $iv,(%r12,%r13) # write output
+ lea 16(%r13),%r13
+
+ add $SZ*0(%r15),$A
+ add $SZ*1(%r15),$B
+ add $SZ*2(%r15),$C
+ add $SZ*3(%r15),$D
+ add $SZ*4(%r15),$E
+ add $SZ*5(%r15),$F
+ add $SZ*6(%r15),$G
+ add $SZ*7(%r15),$H
+
+ mov $A,$SZ*0(%r15)
+ mov $B,$SZ*1(%r15)
+ mov $C,$SZ*2(%r15)
+ mov $D,$SZ*3(%r15)
+ mov $E,$SZ*4(%r15)
+ mov $F,$SZ*5(%r15)
+ mov $G,$SZ*6(%r15)
+ mov $H,$SZ*7(%r15)
+
+ cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
+ je .Ldone_avx2
+
+ xor $a1,$a1
+ mov $B,$a3
+ mov $F,$a4
+ xor $C,$a3 # magic
+ jmp .Lower_avx2
+.align 16
+.Lower_avx2:
+ vmovdqu (%r13),$inout
+ vpinsrq \$0,%r13,$offload,$offload
+___
+ $aesni_cbc_idx=0;
+ for ($i=0; $i<16; ) {
+ my $base="+16($Tbl)";
+ foreach(bodyx_00_15()) { eval; }
+ &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
+ }
+$code.=<<___;
+ vmovq $offload,%r13 # borrow $a0
+ vpextrq \$1,$offload,%r15 # borrow $a2
+ vpand $mask14,$temp,$temp
+ vpor $temp,$iv,$iv
+ lea -$PUSH8($Tbl),$Tbl
+ vmovdqu $iv,(%r15,%r13) # write output
+ lea 16(%r13),%r13 # inp++
+ cmp %rsp,$Tbl
+ jae .Lower_avx2
+
+ mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
+ lea 16*$SZ(%r13),%r13
+ mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
+ add $a1,$A
+ lea `2*$SZ*($rounds-8)`(%rsp),%rsp
+
+ add $SZ*0(%r15),$A
+ add $SZ*1(%r15),$B
+ add $SZ*2(%r15),$C
+ add $SZ*3(%r15),$D
+ add $SZ*4(%r15),$E
+ add $SZ*5(%r15),$F
+ add $SZ*6(%r15),$G
+ lea (%rsi,%r13),%r12
+ add $SZ*7(%r15),$H
+
+ cmp $_end,%r13
+
+ mov $A,$SZ*0(%r15)
+ cmove %rsp,%r12 # next block or stale data
+ mov $B,$SZ*1(%r15)
+ mov $C,$SZ*2(%r15)
+ mov $D,$SZ*3(%r15)
+ mov $E,$SZ*4(%r15)
+ mov $F,$SZ*5(%r15)
+ mov $G,$SZ*6(%r15)
+ mov $H,$SZ*7(%r15)
+
+ jbe .Loop_avx2
+ lea (%rsp),$Tbl
+
+.Ldone_avx2:
+ lea ($Tbl),%rsp
+ mov $_ivp,$ivp
+ mov $_rsp,%rsi
+ vmovdqu $iv,($ivp) # output IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps `$framesz+16*0`(%rsp),%xmm6
+ movaps `$framesz+16*1`(%rsp),%xmm7
+ movaps `$framesz+16*2`(%rsp),%xmm8
+ movaps `$framesz+16*3`(%rsp),%xmm9
+ movaps `$framesz+16*4`(%rsp),%xmm10
+ movaps `$framesz+16*5`(%rsp),%xmm11
+ movaps `$framesz+16*6`(%rsp),%xmm12
+ movaps `$framesz+16*7`(%rsp),%xmm13
+ movaps `$framesz+16*8`(%rsp),%xmm14
+ movaps `$framesz+16*9`(%rsp),%xmm15
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_avx2:
+ ret
+.size ${func}_avx2,.-${func}_avx2
+___
+}}
+}}
+{{
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my ($rounds,$Tbl)=("%r11d","%rbx");
+
+my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
+my @rndkey=("%xmm4","%xmm5");
+my $r=0;
+my $sn=0;
+
+my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
+my @MSG=map("%xmm$_",(10..13));
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ movups `16*$n`($in0),$in # load input
+ xorps $rndkey0,$in
+___
+ $code.=<<___ if ($n);
+ movups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ xorps $in,$iv
+ movups `32+16*$k-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Laesenclast$sn
+ movups `32+16*($k+0)-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+1)-112`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+ je .Laesenclast$sn
+ movups `32+16*($k+2)-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+3)-112`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+.Laesenclast$sn:
+ aesenclast $rndkey[0],$iv
+ movups 16-112($key),$rndkey[1] # forward reference
+ nop
+___
+ } else {
+ $code.=<<___;
+ movups `32+16*$k-112`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
+
+if ($shaext) {
+my $Tbl="%rax";
+
+$code.=<<___;
+.type ${func}_shaext,\@function,6
+.align 32
+${func}_shaext:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+___
+$code.=<<___ if ($win64);
+ lea `-8-10*16`(%rsp),%rsp
+ movaps %xmm6,-8-10*16(%rax)
+ movaps %xmm7,-8-9*16(%rax)
+ movaps %xmm8,-8-8*16(%rax)
+ movaps %xmm9,-8-7*16(%rax)
+ movaps %xmm10,-8-6*16(%rax)
+ movaps %xmm11,-8-5*16(%rax)
+ movaps %xmm12,-8-4*16(%rax)
+ movaps %xmm13,-8-3*16(%rax)
+ movaps %xmm14,-8-2*16(%rax)
+ movaps %xmm15,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+ lea K256+0x80(%rip),$Tbl
+ movdqu ($ctx),$ABEF # DCBA
+ movdqu 16($ctx),$CDGH # HGFE
+ movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
+
+ mov 240($key),$rounds
+ sub $in0,$out
+ movups ($key),$rndkey0 # $key[0]
+ movups ($ivp),$iv # load IV
+ movups 16($key),$rndkey[0] # forward reference
+ lea 112($key),$key # size optimization
+
+ pshufd \$0x1b,$ABEF,$Wi # ABCD
+ pshufd \$0xb1,$ABEF,$ABEF # CDAB
+ pshufd \$0x1b,$CDGH,$CDGH # EFGH
+ movdqa $TMP,$BSWAP # offload
+ palignr \$8,$CDGH,$ABEF # ABEF
+ punpcklqdq $Wi,$CDGH # CDGH
+
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ movdqu ($inp),@MSG[0]
+ movdqu 0x10($inp),@MSG[1]
+ movdqu 0x20($inp),@MSG[2]
+ pshufb $TMP,@MSG[0]
+ movdqu 0x30($inp),@MSG[3]
+
+ movdqa 0*32-0x80($Tbl),$Wi
+ paddd @MSG[0],$Wi
+ pshufb $TMP,@MSG[1]
+ movdqa $CDGH,$CDGH_SAVE # offload
+ movdqa $ABEF,$ABEF_SAVE # offload
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 0-3
+ pshufd \$0x0e,$Wi,$Wi
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 1*32-0x80($Tbl),$Wi
+ paddd @MSG[1],$Wi
+ pshufb $TMP,@MSG[2]
+ lea 0x40($inp),$inp
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 4-7
+ pshufd \$0x0e,$Wi,$Wi
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 2*32-0x80($Tbl),$Wi
+ paddd @MSG[2],$Wi
+ pshufb $TMP,@MSG[3]
+ sha256msg1 @MSG[1],@MSG[0]
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 8-11
+ pshufd \$0x0e,$Wi,$Wi
+ movdqa @MSG[3],$TMP
+ palignr \$4,@MSG[2],$TMP
+ paddd $TMP,@MSG[0]
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 3*32-0x80($Tbl),$Wi
+ paddd @MSG[3],$Wi
+ sha256msg2 @MSG[3],@MSG[0]
+ sha256msg1 @MSG[2],@MSG[1]
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 12-15
+ pshufd \$0x0e,$Wi,$Wi
+___
+ &$aesenc();
+$code.=<<___;
+ movdqa @MSG[0],$TMP
+ palignr \$4,@MSG[3],$TMP
+ paddd $TMP,@MSG[1]
+ sha256rnds2 $CDGH,$ABEF
+___
+for($i=4;$i<16-3;$i++) {
+ &$aesenc() if (($r%10)==0);
+$code.=<<___;
+ movdqa $i*32-0x80($Tbl),$Wi
+ paddd @MSG[0],$Wi
+ sha256msg2 @MSG[0],@MSG[1]
+ sha256msg1 @MSG[3],@MSG[2]
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 16-19...
+ pshufd \$0x0e,$Wi,$Wi
+ movdqa @MSG[1],$TMP
+ palignr \$4,@MSG[0],$TMP
+ paddd $TMP,@MSG[2]
+___
+ &$aesenc();
+ &$aesenc() if ($r==19);
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+___
+ push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ movdqa 13*32-0x80($Tbl),$Wi
+ paddd @MSG[0],$Wi
+ sha256msg2 @MSG[0],@MSG[1]
+ sha256msg1 @MSG[3],@MSG[2]
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 52-55
+ pshufd \$0x0e,$Wi,$Wi
+ movdqa @MSG[1],$TMP
+ palignr \$4,@MSG[0],$TMP
+ paddd $TMP,@MSG[2]
+___
+ &$aesenc();
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 14*32-0x80($Tbl),$Wi
+ paddd @MSG[1],$Wi
+ sha256msg2 @MSG[1],@MSG[2]
+ movdqa $BSWAP,$TMP
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 56-59
+ pshufd \$0x0e,$Wi,$Wi
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+
+ movdqa 15*32-0x80($Tbl),$Wi
+ paddd @MSG[2],$Wi
+___
+ &$aesenc();
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $ABEF,$CDGH # 60-63
+ pshufd \$0x0e,$Wi,$Wi
+___
+ &$aesenc();
+$code.=<<___;
+ sha256rnds2 $CDGH,$ABEF
+ #pxor $CDGH,$rndkey0 # black magic
+___
+ while ($r<40) { &$aesenc(); } # remaining aesenc's
+$code.=<<___;
+ #xorps $CDGH,$rndkey0 # black magic
+ paddd $CDGH_SAVE,$CDGH
+ paddd $ABEF_SAVE,$ABEF
+
+ dec $len
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+ jnz .Loop_shaext
+
+ pshufd \$0xb1,$CDGH,$CDGH # DCHG
+ pshufd \$0x1b,$ABEF,$TMP # FEBA
+ pshufd \$0xb1,$ABEF,$ABEF # BAFE
+ punpckhqdq $CDGH,$ABEF # DCBA
+ palignr \$8,$TMP,$CDGH # HGFE
+
+ movups $iv,($ivp) # write IV
+ movdqu $ABEF,($ctx)
+ movdqu $CDGH,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps 0*16(%rsp),%xmm6
+ movaps 1*16(%rsp),%xmm7
+ movaps 2*16(%rsp),%xmm8
+ movaps 3*16(%rsp),%xmm9
+ movaps 4*16(%rsp),%xmm10
+ movaps 5*16(%rsp),%xmm11
+ movaps 6*16(%rsp),%xmm12
+ movaps 7*16(%rsp),%xmm13
+ movaps 8*16(%rsp),%xmm14
+ movaps 9*16(%rsp),%xmm15
+ lea 8+10*16(%rsp),%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+ ret
+.size ${func}_shaext,.-${func}_shaext
+___
+}
+}}}}}
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64 && $avx) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HanderlData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+___
+$code.=<<___ if ($shaext);
+ lea aesni_cbc_sha256_enc_shaext(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lnot_in_shaext
+
+ lea (%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea 168(%rax),%rax # adjust stack pointer
+ jmp .Lin_prologue
+.Lnot_in_shaext:
+___
+$code.=<<___ if ($avx>1);
+ lea .Lavx2_shortcut(%rip),%r10
+ cmp %r10,%rbx # context->Rip<avx2_shortcut
+ jb .Lnot_in_avx2
+
+ and \$-256*$SZ,%rax
+ add \$`2*$SZ*($rounds-8)`,%rax
+.Lnot_in_avx2:
+___
+$code.=<<___;
+ mov %rax,%rsi # put aside Rsp
+ mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+ lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+ .rva .LSEH_begin_${func}_xop
+ .rva .LSEH_end_${func}_xop
+ .rva .LSEH_info_${func}_xop
+
+ .rva .LSEH_begin_${func}_avx
+ .rva .LSEH_end_${func}_avx
+ .rva .LSEH_info_${func}_avx
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_${func}_avx2
+ .rva .LSEH_end_${func}_avx2
+ .rva .LSEH_info_${func}_avx2
+___
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_${func}_shaext
+ .rva .LSEH_end_${func}_shaext
+ .rva .LSEH_info_${func}_shaext
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_${func}_xop:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
+
+.LSEH_info_${func}_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_${func}_avx2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
+___
+$code.=<<___ if ($shaext);
+.LSEH_info_${func}_shaext:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
+___
+}
+
+####################################################################
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ unshift @opcode,$rex|0x40 if($rex);
+}
+
+{
+ my %opcodelet = (
+ "sha256rnds2" => 0xcb,
+ "sha256msg1" => 0xcc,
+ "sha256msg2" => 0xcd );
+
+ sub sha256op38 {
+ my $instr = shift;
+
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x38);
+ rex(\@opcode,$2,$1);
+ push @opcode,$opcodelet{$instr};
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return $instr."\t".@_[0];
+ }
+ }
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aesni-x86.pl b/openssl-1.1.0h/crypto/aes/asm/aesni-x86.pl
new file mode 100644
index 0000000..ed1a47c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aesni-x86.pl
@@ -0,0 +1,3413 @@
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for Intel AES-NI extension. In
+# OpenSSL context it's used with Intel engine, but can also be used as
+# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
+# details].
+#
+# Performance.
+#
+# To start with see corresponding paragraph in aesni-x86_64.pl...
+# Instead of filling table similar to one found there I've chosen to
+# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
+# The simplified table below represents 32-bit performance relative
+# to 64-bit one in every given point. Ratios vary for different
+# encryption modes, therefore interval values.
+#
+# 16-byte 64-byte 256-byte 1-KB 8-KB
+# 53-67% 67-84% 91-94% 95-98% 97-99.5%
+#
+# Lower ratios for smaller block sizes are perfectly understandable,
+# because function call overhead is higher in 32-bit mode. Largest
+# 8-KB block performance is virtually same: 32-bit code is less than
+# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
+
+# January 2011
+#
+# See aesni-x86_64.pl for details. Unlike x86_64 version this module
+# interleaves at most 6 aes[enc|dec] instructions, because there are
+# not enough registers for 8x interleave [which should be optimal for
+# Sandy Bridge]. Actually, performance results for 6x interleave
+# factor presented in aesni-x86_64.pl (except for CTR) are for this
+# module.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
+
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt.
+
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+# CBC en-/decrypt CTR XTS ECB OCB
+# Westmere 3.77/1.37 1.37 1.52 1.27
+# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10
+# Haswell 4.44/0.80 0.97 1.03 0.72 0.76
+# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03
+# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23
+
+$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
+ # generates drop-in replacement for
+ # crypto/aes/asm/aes-586.pl:-)
+$inline=1; # inline _aesni_[en|de]crypt
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+$output = pop;
+open OUT,">$output";
+*STDOUT=*OUT;
+
+&asm_init($ARGV[0],$0);
+
+&external_label("OPENSSL_ia32cap_P");
+&static_label("key_const");
+
+if ($PREFIX eq "aesni") { $movekey=\&movups; }
+else { $movekey=\&movups; }
+
+$len="eax";
+$rounds="ecx";
+$key="edx";
+$inp="esi";
+$out="edi";
+$rounds_="ebx"; # backup copy for $rounds
+$key_="ebp"; # backup copy for $key
+
+$rndkey0="xmm0";
+$rndkey1="xmm1";
+$inout0="xmm2";
+$inout1="xmm3";
+$inout2="xmm4";
+$inout3="xmm5"; $in1="xmm5";
+$inout4="xmm6"; $in0="xmm6";
+$inout5="xmm7"; $ivec="xmm7";
+
+# AESNI extension
+sub aeskeygenassist
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
+}
+sub aescommon
+{ my($opcodelet,$dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
+}
+sub aesimc { aescommon(0xdb,@_); }
+sub aesenc { aescommon(0xdc,@_); }
+sub aesenclast { aescommon(0xdd,@_); }
+sub aesdec { aescommon(0xde,@_); }
+sub aesdeclast { aescommon(0xdf,@_); }
+
+# Inline version of internal aesni_[en|de]crypt1
+{ my $sn;
+sub aesni_inline_generate1
+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+ $sn++;
+
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($ivec,$rndkey0) if (defined($ivec));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout,$ivec) if (defined($ivec));
+ &xorps ($inout,$rndkey0) if (!defined($ivec));
+ &set_label("${p}1_loop_$sn");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &dec ($rounds);
+ &$movekey ($rndkey1,&QWP(0,$key));
+ &lea ($key,&DWP(16,$key));
+ &jnz (&label("${p}1_loop_$sn"));
+ eval"&aes${p}last ($inout,$rndkey1)";
+}}
+
+sub aesni_generate1 # fully unrolled loop
+{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+
+ &function_begin_B("_aesni_${p}rypt1");
+ &movups ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(0x10,$key));
+ &xorps ($inout,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0x20,$key));
+ &lea ($key,&DWP(0x30,$key));
+ &cmp ($rounds,11);
+ &jb (&label("${p}128"));
+ &lea ($key,&DWP(0x20,$key));
+ &je (&label("${p}192"));
+ &lea ($key,&DWP(0x20,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x40,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x30,$key));
+ &set_label("${p}192");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x20,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x10,$key));
+ &set_label("${p}128");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x10,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x20,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x30,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x40,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x50,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x60,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x70,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ eval"&aes${p}last ($inout,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt1");
+}
+
+# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("enc") if (!$inline);
+&function_begin_B("${PREFIX}_encrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &pxor ($rndkey0,$rndkey0); # clear register bank
+ &pxor ($rndkey1,$rndkey1);
+ &movups (&QWP(0,"eax"),$inout0);
+ &pxor ($inout0,$inout0);
+ &ret ();
+&function_end_B("${PREFIX}_encrypt");
+
+# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("dec") if(!$inline);
+&function_begin_B("${PREFIX}_decrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &pxor ($rndkey0,$rndkey0); # clear register bank
+ &pxor ($rndkey1,$rndkey1);
+ &movups (&QWP(0,"eax"),$inout0);
+ &pxor ($inout0,$inout0);
+ &ret ();
+&function_end_B("${PREFIX}_decrypt");
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why 3x subroutine were originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+# utilization, i.e. when subroutine's throughput is virtually same as
+# of non-interleaved subroutine [for number of input blocks up to 3].
+# This is why it originally made no sense to implement 2x subroutine.
+# But times change and it became appropriate to spend extra 192 bytes
+# on 2x subroutine on Atom Silvermont account. For processors that
+# can schedule aes[enc|dec] every cycle optimal interleave factor
+# equals to corresponding instructions latency. 8x is optimal for
+# * Bridge, but it's unfeasible to accommodate such implementation
+# in XMM registers addreassable in 32-bit mode and therefore maximum
+# of 6x is used instead...
+
+sub aesni_generate2
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt2");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shl ($rounds,4);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &$movekey ($rndkey0,&QWP(32,$key));
+ &lea ($key,&DWP(32,$key,$rounds));
+ &neg ($rounds);
+ &add ($rounds,16);
+
+ &set_label("${p}2_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &jnz (&label("${p}2_loop"));
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt2");
+}
+
+sub aesni_generate3
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt3");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shl ($rounds,4);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &$movekey ($rndkey0,&QWP(32,$key));
+ &lea ($key,&DWP(32,$key,$rounds));
+ &neg ($rounds);
+ &add ($rounds,16);
+
+ &set_label("${p}3_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ eval"&aes${p} ($inout2,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &jnz (&label("${p}3_loop"));
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt3");
+}
+
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4 block by ~30%. One can argue that one
+# should have implemented 5x as well, but improvement would be <20%,
+# so it's not worth it...
+sub aesni_generate4
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt4");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &shl ($rounds,4);
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($inout3,$rndkey0);
+ &$movekey ($rndkey0,&QWP(32,$key));
+ &lea ($key,&DWP(32,$key,$rounds));
+ &neg ($rounds);
+ &data_byte (0x0f,0x1f,0x40,0x00);
+ &add ($rounds,16);
+
+ &set_label("${p}4_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &jnz (&label("${p}4_loop"));
+
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ eval"&aes${p}last ($inout3,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt4");
+}
+
+sub aesni_generate6
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt6");
+ &static_label("_aesni_${p}rypt6_enter");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shl ($rounds,4);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0); # pxor does better here
+ &pxor ($inout2,$rndkey0);
+ eval"&aes${p} ($inout0,$rndkey1)";
+ &pxor ($inout3,$rndkey0);
+ &pxor ($inout4,$rndkey0);
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &lea ($key,&DWP(32,$key,$rounds));
+ &neg ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &pxor ($inout5,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key,$rounds));
+ &add ($rounds,16);
+ &jmp (&label("_aesni_${p}rypt6_inner"));
+
+ &set_label("${p}6_loop",16);
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &set_label("_aesni_${p}rypt6_inner");
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ &set_label("_aesni_${p}rypt6_enter");
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+ eval"&aes${p} ($inout4,$rndkey0)";
+ eval"&aes${p} ($inout5,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &jnz (&label("${p}6_loop"));
+
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ eval"&aes${p}last ($inout3,$rndkey0)";
+ eval"&aes${p}last ($inout4,$rndkey0)";
+ eval"&aes${p}last ($inout5,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt6");
+}
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+
+if ($PREFIX eq "aesni") {
+######################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+# size_t length, const AES_KEY *key,
+# int enc);
+&function_begin("aesni_ecb_encrypt");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &and ($len,-16);
+ &jz (&label("ecb_ret"));
+ &mov ($rounds,&DWP(240,$key));
+ &test ($rounds_,$rounds_);
+ &jz (&label("ecb_decrypt"));
+
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &cmp ($len,0x60);
+ &jb (&label("ecb_enc_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_enc_loop6_enter"));
+
+&set_label("ecb_enc_loop6",16);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_enc_loop6_enter");
+
+ &call ("_aesni_encrypt6");
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_enc_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+&set_label("ecb_enc_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &cmp ($len,0x20);
+ &jb (&label("ecb_enc_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_enc_two"));
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_enc_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+ &je (&label("ecb_enc_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_encrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_two",16);
+ &call ("_aesni_encrypt2");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_three",16);
+ &call ("_aesni_encrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_four",16);
+ &call ("_aesni_encrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &jmp (&label("ecb_ret"));
+######################################################################
+&set_label("ecb_decrypt",16);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &cmp ($len,0x60);
+ &jb (&label("ecb_dec_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_dec_loop6_enter"));
+
+&set_label("ecb_dec_loop6",16);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_dec_loop6_enter");
+
+ &call ("_aesni_decrypt6");
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_dec_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+&set_label("ecb_dec_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &cmp ($len,0x20);
+ &jb (&label("ecb_dec_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_dec_two"));
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_dec_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+ &je (&label("ecb_dec_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_decrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_two",16);
+ &call ("_aesni_decrypt2");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_three",16);
+ &call ("_aesni_decrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_four",16);
+ &call ("_aesni_decrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ecb_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
+&function_end("aesni_ecb_encrypt");
+
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize CMAC value
+# (see engine/eng_aesni.c for details)
+#
+{ my $cmac=$inout1;
+&function_begin("aesni_ccm64_encrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($rounds,&wparam(5));
+ &mov ($key_,"esp");
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_);
+
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
+ &mov ($rounds,&DWP(240,$key));
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds_,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds_);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &shl ($rounds,4);
+ &mov ($rounds_,16);
+ &lea ($key_,&DWP(0,$key));
+ &movdqa ($inout3,&QWP(0,"esp"));
+ &movdqa ($inout0,$ivec);
+ &lea ($key,&DWP(32,$key,$rounds));
+ &sub ($rounds_,$rounds);
+ &pshufb ($ivec,$inout3);
+
+&set_label("ccm64_enc_outer");
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &mov ($rounds,$rounds_);
+ &movups ($in0,&QWP(0,$inp));
+
+ &xorps ($inout0,$rndkey0);
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &xorps ($rndkey0,$in0);
+ &xorps ($cmac,$rndkey0); # cmac^=inp
+ &$movekey ($rndkey0,&QWP(32,$key_));
+
+&set_label("ccm64_enc2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
+ &aesenc ($inout0,$rndkey0);
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &jnz (&label("ccm64_enc2_loop"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &paddq ($ivec,&QWP(16,"esp"));
+ &dec ($len);
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
+
+ &lea ($inp,&DWP(16,$inp));
+ &xorps ($in0,$inout0); # inp^=E(ivec)
+ &movdqa ($inout0,$ivec);
+ &movups (&QWP(0,$out),$in0); # save output
+ &pshufb ($inout0,$inout3);
+ &lea ($out,&DWP(16,$out));
+ &jnz (&label("ccm64_enc_outer"));
+
+ &mov ("esp",&DWP(48,"esp"));
+ &mov ($out,&wparam(5));
+ &movups (&QWP(0,$out),$cmac);
+
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
+&function_end("aesni_ccm64_encrypt_blocks");
+
+&function_begin("aesni_ccm64_decrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($rounds,&wparam(5));
+ &mov ($key_,"esp");
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_);
+
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
+ &mov ($rounds,&DWP(240,$key));
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds_,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds_);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
+ &movdqa ($inout0,$ivec);
+
+ &mov ($key_,$key);
+ &mov ($rounds_,$rounds);
+
+ &pshufb ($ivec,$inout3);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &shl ($rounds_,4);
+ &mov ($rounds,16);
+ &movups ($in0,&QWP(0,$inp)); # load inp
+ &paddq ($ivec,&QWP(16,"esp"));
+ &lea ($inp,&QWP(16,$inp));
+ &sub ($rounds,$rounds_);
+ &lea ($key,&DWP(32,$key_,$rounds_));
+ &mov ($rounds_,$rounds);
+ &jmp (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_outer",16);
+ &xorps ($in0,$inout0); # inp ^= E(ivec)
+ &movdqa ($inout0,$ivec);
+ &movups (&QWP(0,$out),$in0); # save output
+ &lea ($out,&DWP(16,$out));
+ &pshufb ($inout0,$inout3);
+
+ &sub ($len,1);
+ &jz (&label("ccm64_dec_break"));
+
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &mov ($rounds,$rounds_);
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &xorps ($in0,$rndkey0);
+ &xorps ($inout0,$rndkey0);
+ &xorps ($cmac,$in0); # cmac^=out
+ &$movekey ($rndkey0,&QWP(32,$key_));
+
+&set_label("ccm64_dec2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
+ &aesenc ($inout0,$rndkey0);
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &jnz (&label("ccm64_dec2_loop"));
+ &movups ($in0,&QWP(0,$inp)); # load inp
+ &paddq ($ivec,&QWP(16,"esp"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
+ &lea ($inp,&QWP(16,$inp));
+ &jmp (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_break",16);
+ &mov ($rounds,&DWP(240,$key_));
+ &mov ($key,$key_);
+ if ($inline)
+ { &aesni_inline_generate1("enc",$cmac,$in0); }
+ else
+ { &call ("_aesni_encrypt1",$cmac); }
+
+ &mov ("esp",&DWP(48,"esp"));
+ &mov ($out,&wparam(5));
+ &movups (&QWP(0,$out),$cmac);
+
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
+&function_end("aesni_ccm64_decrypt_blocks");
+}
+
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see crypto/modes/ctr128.c for details)
+#
+# stack layout:
+# 0 pshufb mask
+# 16 vector addend: 0,6,6,6
+# 32 counter-less ivec
+# 48 1st triplet of counter vector
+# 64 2nd triplet of counter vector
+# 80 saved %esp
+
+&function_begin("aesni_ctr32_encrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($key_,"esp");
+ &sub ("esp",88);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(80,"esp"),$key_);
+
+ &cmp ($len,1);
+ &je (&label("ctr32_one_shortcut"));
+
+ &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds,6);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds);
+ &mov (&DWP(20,"esp"),$rounds);
+ &mov (&DWP(24,"esp"),$rounds);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
+ &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
+
+ &mov ($rounds,&DWP(240,$key)); # key->rounds
+
+ # compose 2 vectors of 3x32-bit counters
+ &bswap ($rounds_);
+ &pxor ($rndkey0,$rndkey0);
+ &pxor ($rndkey1,$rndkey1);
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
+ &pinsrd ($rndkey0,$rounds_,0);
+ &lea ($key_,&DWP(3,$rounds_));
+ &pinsrd ($rndkey1,$key_,0);
+ &inc ($rounds_);
+ &pinsrd ($rndkey0,$rounds_,1);
+ &inc ($key_);
+ &pinsrd ($rndkey1,$key_,1);
+ &inc ($rounds_);
+ &pinsrd ($rndkey0,$rounds_,2);
+ &inc ($key_);
+ &pinsrd ($rndkey1,$key_,2);
+ &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+ &movdqu ($inout4,&QWP(0,$key)); # key[0]
+ &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+
+ &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
+ &pshufd ($inout1,$rndkey0,2<<6);
+ &cmp ($len,6);
+ &jb (&label("ctr32_tail"));
+ &pxor ($inout5,$inout4); # counter-less ivec^key[0]
+ &shl ($rounds,4);
+ &mov ($rounds_,16);
+ &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
+ &mov ($key_,$key); # backup $key
+ &sub ($rounds_,$rounds); # backup twisted $rounds
+ &lea ($key,&DWP(32,$key,$rounds));
+ &sub ($len,6);
+ &jmp (&label("ctr32_loop6"));
+
+&set_label("ctr32_loop6",16);
+ # inlining _aesni_encrypt6's prologue gives ~6% improvement...
+ &pshufd ($inout2,$rndkey0,1<<6);
+ &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
+ &pshufd ($inout3,$rndkey1,3<<6);
+ &pxor ($inout0,$rndkey0); # merge counter-less ivec
+ &pshufd ($inout4,$rndkey1,2<<6);
+ &pxor ($inout1,$rndkey0);
+ &pshufd ($inout5,$rndkey1,1<<6);
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &pxor ($inout2,$rndkey0);
+ &pxor ($inout3,$rndkey0);
+ &aesenc ($inout0,$rndkey1);
+ &pxor ($inout4,$rndkey0);
+ &pxor ($inout5,$rndkey0);
+ &aesenc ($inout1,$rndkey1);
+ &$movekey ($rndkey0,&QWP(32,$key_));
+ &mov ($rounds,$rounds_);
+ &aesenc ($inout2,$rndkey1);
+ &aesenc ($inout3,$rndkey1);
+ &aesenc ($inout4,$rndkey1);
+ &aesenc ($inout5,$rndkey1);
+
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
+ &xorps ($inout2,$rndkey1);
+ &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+
+ &paddd ($rndkey1,$rndkey0); # 2nd triplet increment
+ &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
+
+ &movups ($inout1,&QWP(0x30,$inp));
+ &movups ($inout2,&QWP(0x40,$inp));
+ &xorps ($inout3,$inout1);
+ &movups ($inout1,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+ &xorps ($inout4,$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &xorps ($inout5,$inout1);
+ &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+ &movups (&QWP(0x40,$out),$inout4);
+ &pshufd ($inout0,$rndkey0,3<<6);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+
+ &pshufd ($inout1,$rndkey0,2<<6);
+ &sub ($len,6);
+ &jnc (&label("ctr32_loop6"));
+
+ &add ($len,6);
+ &jz (&label("ctr32_ret"));
+ &movdqu ($inout5,&QWP(0,$key_));
+ &mov ($key,$key_);
+ &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
+ &mov ($rounds,&DWP(240,$key_)); # restore $rounds
+
+&set_label("ctr32_tail");
+ &por ($inout0,$inout5);
+ &cmp ($len,2);
+ &jb (&label("ctr32_one"));
+
+ &pshufd ($inout2,$rndkey0,1<<6);
+ &por ($inout1,$inout5);
+ &je (&label("ctr32_two"));
+
+ &pshufd ($inout3,$rndkey1,3<<6);
+ &por ($inout2,$inout5);
+ &cmp ($len,4);
+ &jb (&label("ctr32_three"));
+
+ &pshufd ($inout4,$rndkey1,2<<6);
+ &por ($inout3,$inout5);
+ &je (&label("ctr32_four"));
+
+ &por ($inout4,$inout5);
+ &call ("_aesni_encrypt6");
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout2,$rndkey1);
+ &movups ($rndkey1,&QWP(0x40,$inp));
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout4,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_one_shortcut",16);
+ &movups ($inout0,&QWP(0,$rounds_)); # load ivec
+ &mov ($rounds,&DWP(240,$key));
+
+&set_label("ctr32_one");
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups ($in0,&QWP(0,$inp));
+ &xorps ($in0,$inout0);
+ &movups (&QWP(0,$out),$in0);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_two",16);
+ &call ("_aesni_encrypt2");
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,$inp));
+ &xorps ($inout0,$inout3);
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_three",16);
+ &call ("_aesni_encrypt3");
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,$inp));
+ &xorps ($inout0,$inout3);
+ &movups ($inout5,&QWP(0x20,$inp));
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_four",16);
+ &call ("_aesni_encrypt4");
+ &movups ($inout4,&QWP(0,$inp));
+ &movups ($inout5,&QWP(0x10,$inp));
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout0,$inout4);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout1,$inout5);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ctr32_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
+ &pxor ("xmm5","xmm5");
+ &movdqa (&QWP(48,"esp"),"xmm0");
+ &pxor ("xmm6","xmm6");
+ &movdqa (&QWP(64,"esp"),"xmm0");
+ &pxor ("xmm7","xmm7");
+ &mov ("esp",&DWP(80,"esp"));
+&function_end("aesni_ctr32_encrypt_blocks");
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2
+# const unsigned char iv[16]);
+#
+{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
+
+&function_begin("aesni_xts_encrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &and ("esp",-16); # align stack
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($len,16*6);
+ &jc (&label("xts_enc_short"));
+
+ &shl ($rounds,4);
+ &mov ($rounds_,16);
+ &sub ($rounds_,$rounds);
+ &lea ($key,&DWP(32,$key,$rounds));
+ &jmp (&label("xts_enc_loop6"));
+
+&set_label("xts_enc_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+ # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &mov ($rounds,$rounds_); # restore $rounds
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesenc ($inout0,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesenc ($inout1,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &$movekey ($rndkey0,&QWP(32,$key_));
+ &aesenc ($inout2,$rndkey1);
+ &aesenc ($inout3,$rndkey1);
+ &aesenc ($inout4,$rndkey1);
+ &aesenc ($inout5,$rndkey1);
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_enc_loop6"));
+
+ &mov ($rounds,&DWP(240,$key_)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_enc_short");
+ &add ($len,16*6);
+ &jz (&label("xts_enc_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_enc_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_enc_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_enc_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_enc_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($inout0,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_encrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+
+ &call ("_aesni_encrypt2");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_encrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+ &movdqa ($inout3,$tweak);
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_enc_steal"));
+
+&set_label("xts_enc_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($inout3,$twtmp,0x13);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+&set_label("xts_enc_steal");
+ &movz ($rounds,&BP(0,$inp));
+ &movz ($key,&BP(-16,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(-16,$out),&LB($rounds));
+ &mov (&BP(0,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_enc_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(-16,$out)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(-16,$out),$inout0); # write output
+
+&set_label("xts_enc_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
+ &pxor ("xmm3","xmm3");
+ &movdqa (&QWP(16*1,"esp"),"xmm0");
+ &pxor ("xmm4","xmm4");
+ &movdqa (&QWP(16*2,"esp"),"xmm0");
+ &pxor ("xmm5","xmm5");
+ &movdqa (&QWP(16*3,"esp"),"xmm0");
+ &pxor ("xmm6","xmm6");
+ &movdqa (&QWP(16*4,"esp"),"xmm0");
+ &pxor ("xmm7","xmm7");
+ &movdqa (&QWP(16*5,"esp"),"xmm0");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_encrypt");
+
+&function_begin("aesni_xts_decrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &and ("esp",-16); # align stack
+
+ &xor ($rounds_,$rounds_); # if(len%16) len-=16;
+ &test ($len,15);
+ &setnz (&LB($rounds_));
+ &shl ($rounds_,4);
+ &sub ($len,$rounds_);
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &sub ($len,16*6);
+ &jc (&label("xts_dec_short"));
+
+ &shl ($rounds,4);
+ &mov ($rounds_,16);
+ &sub ($rounds_,$rounds);
+ &lea ($key,&DWP(32,$key,$rounds));
+ &jmp (&label("xts_dec_loop6"));
+
+&set_label("xts_dec_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+ # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &mov ($rounds,$rounds_);
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesdec ($inout0,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesdec ($inout1,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &$movekey ($rndkey0,&QWP(32,$key_));
+ &aesdec ($inout2,$rndkey1);
+ &aesdec ($inout3,$rndkey1);
+ &aesdec ($inout4,$rndkey1);
+ &aesdec ($inout5,$rndkey1);
+ &call (&label("_aesni_decrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_dec_loop6"));
+
+ &mov ($rounds,&DWP(240,$key_)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_dec_short");
+ &add ($len,16*6);
+ &jz (&label("xts_dec_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_dec_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_dec_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_dec_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_dec_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($inout0,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_decrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+
+ &call ("_aesni_decrypt2");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_decrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_decrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_dec_only_one_more"));
+
+&set_label("xts_dec_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(16*6,"esp"));
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+
+&set_label("xts_dec_only_one_more");
+ &pshufd ($inout3,$twtmp,0x13);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,$twmask); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_steal");
+ &movz ($rounds,&BP(16,$inp));
+ &movz ($key,&BP(0,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(0,$out),&LB($rounds));
+ &mov (&BP(16,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_dec_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$out)); # load input
+ &xorps ($inout0,$inout4); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout4); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_ret");
+ &pxor ("xmm0","xmm0"); # clear register bank
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
+ &pxor ("xmm3","xmm3");
+ &movdqa (&QWP(16*1,"esp"),"xmm0");
+ &pxor ("xmm4","xmm4");
+ &movdqa (&QWP(16*2,"esp"),"xmm0");
+ &pxor ("xmm5","xmm5");
+ &movdqa (&QWP(16*3,"esp"),"xmm0");
+ &pxor ("xmm6","xmm6");
+ &movdqa (&QWP(16*4,"esp"),"xmm0");
+ &pxor ("xmm7","xmm7");
+ &movdqa (&QWP(16*5,"esp"),"xmm0");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_decrypt");
+}
+
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+# const AES_KEY *key, unsigned int start_block_num,
+# unsigned char offset_i[16], const unsigned char L_[][16],
+# unsigned char checksum[16]);
+#
+{
+# offsets within stack frame
+my $checksum = 16*6;
+my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
+
+# reassigned registers
+my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
+# $l_, $blocks, $inp, $key are permanently allocated in registers;
+# remaining non-volatile ones are offloaded to stack, which even
+# stay invariant after written to stack.
+
+&function_begin("aesni_ocb_encrypt");
+ &mov ($rounds,&wparam(5)); # &offset_i
+ &mov ($rounds_,&wparam(7)); # &checksum
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
+ &mov ($block,&wparam(4)); # start_block_num
+ &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
+ &mov ($l_,&wparam(6)); # L_
+
+ &mov ($rounds,"esp");
+ &sub ("esp",$esp_off+4); # alloca
+ &and ("esp",-16); # align stack
+
+ &sub ($out,$inp);
+ &shl ($len,4);
+ &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
+ &mov (&DWP($out_off,"esp"),$out);
+ &mov (&DWP($end_off,"esp"),$len);
+ &mov (&DWP($esp_off,"esp"),$rounds);
+
+ &mov ($rounds,&DWP(240,$key));
+
+ &test ($block,1);
+ &jnz (&label("odd"));
+
+ &bsf ($i3,$block);
+ &add ($block,1);
+ &shl ($i3,4);
+ &movdqu ($inout5,&QWP(0,$l_,$i3));
+ &mov ($i3,$key); # put aside key
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16,$inp));
+
+ &pxor ($inout5,$rndkey0); # ^ last offset_i
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$inout5); # ^ offset_i
+
+ &movdqa ($inout4,$rndkey1);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &xorps ($inout0,$inout5); # ^ offset_i
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movdqa ($rndkey1,$inout4); # pass the checksum
+
+ &movups (&QWP(-16,$out,$inp),$inout0); # store output
+
+ &mov ($rounds,&DWP(240,$i3));
+ &mov ($key,$i3); # restore key
+ &mov ($len,&DWP($end_off,"esp"));
+
+&set_label("odd");
+ &shl ($rounds,4);
+ &mov ($out,16);
+ &sub ($out,$rounds); # twisted rounds
+ &mov (&DWP($key_off,"esp"),$key);
+ &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
+ &mov (&DWP($rounds_off,"esp"),$out);
+
+ &cmp ($inp,$len);
+ &ja (&label("short"));
+ &jmp (&label("grandloop"));
+
+&set_label("grandloop",32);
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &lea ($i5,&DWP(5,$block));
+ &add ($block,6);
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &bsf ($i5,$i5);
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &shl ($i5,4);
+ &movdqu ($inout0,&QWP(0,$l_));
+ &movdqu ($inout1,&QWP(0,$l_,$i1));
+ &mov ($rounds,&DWP($rounds_off,"esp"));
+ &movdqa ($inout2,$inout0);
+ &movdqu ($inout3,&QWP(0,$l_,$i3));
+ &movdqa ($inout4,$inout0);
+ &movdqu ($inout5,&QWP(0,$l_,$i5));
+
+ &pxor ($inout0,$rndkey0); # ^ last offset_i
+ &pxor ($inout1,$inout0);
+ &movdqa (&QWP(16*0,"esp"),$inout0);
+ &pxor ($inout2,$inout1);
+ &movdqa (&QWP(16*1,"esp"),$inout1);
+ &pxor ($inout3,$inout2);
+ &movdqa (&QWP(16*2,"esp"),$inout2);
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*3,"esp"),$inout3);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*4,"esp"),$inout4);
+ &movdqa (&QWP(16*5,"esp"),$inout5);
+
+ &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &movdqu ($inout5,&QWP(16*5,$inp));
+ &lea ($inp,&DWP(16*6,$inp));
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$rndkey0); # ^ roundkey[0]
+ &pxor ($rndkey1,$inout1);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($rndkey1,$inout2);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($rndkey1,$inout3);
+ &pxor ($inout3,$rndkey0);
+ &pxor ($rndkey1,$inout4);
+ &pxor ($inout4,$rndkey0);
+ &pxor ($rndkey1,$inout5);
+ &pxor ($inout5,$rndkey0);
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &pxor ($inout5,&QWP(16*5,"esp"));
+
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($inout1,$rndkey1);
+ &aesenc ($inout2,$rndkey1);
+ &aesenc ($inout3,$rndkey1);
+ &aesenc ($inout4,$rndkey1);
+ &aesenc ($inout5,$rndkey1);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &mov ($len,&DWP($end_off,"esp"));
+ &call ("_aesni_encrypt6_enter");
+
+ &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &pxor ($inout5,$rndkey0);
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+
+ &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
+ &movdqu (&QWP(-16*5,$out,$inp),$inout1);
+ &movdqu (&QWP(-16*4,$out,$inp),$inout2);
+ &movdqu (&QWP(-16*3,$out,$inp),$inout3);
+ &movdqu (&QWP(-16*2,$out,$inp),$inout4);
+ &movdqu (&QWP(-16*1,$out,$inp),$inout5);
+ &cmp ($inp,$len); # done yet?
+ &jb (&label("grandloop"));
+
+&set_label("short");
+ &add ($len,16*6);
+ &sub ($len,$inp);
+ &jz (&label("done"));
+
+ &cmp ($len,16*2);
+ &jb (&label("one"));
+ &je (&label("two"));
+
+ &cmp ($len,16*4);
+ &jb (&label("three"));
+ &je (&label("four"));
+
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &movdqu ($inout0,&QWP(0,$l_));
+ &movdqu ($inout1,&QWP(0,$l_,$i1));
+ &mov ($rounds,&DWP($rounds_off,"esp"));
+ &movdqa ($inout2,$inout0);
+ &movdqu ($inout3,&QWP(0,$l_,$i3));
+ &movdqa ($inout4,$inout0);
+
+ &pxor ($inout0,$rndkey0); # ^ last offset_i
+ &pxor ($inout1,$inout0);
+ &movdqa (&QWP(16*0,"esp"),$inout0);
+ &pxor ($inout2,$inout1);
+ &movdqa (&QWP(16*1,"esp"),$inout1);
+ &pxor ($inout3,$inout2);
+ &movdqa (&QWP(16*2,"esp"),$inout2);
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*3,"esp"),$inout3);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*4,"esp"),$inout4);
+
+ &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout5,$inout5);
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$rndkey0); # ^ roundkey[0]
+ &pxor ($rndkey1,$inout1);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($rndkey1,$inout2);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($rndkey1,$inout3);
+ &pxor ($inout3,$rndkey0);
+ &pxor ($rndkey1,$inout4);
+ &pxor ($inout4,$rndkey0);
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($inout1,$rndkey1);
+ &aesenc ($inout2,$rndkey1);
+ &aesenc ($inout3,$rndkey1);
+ &aesenc ($inout4,$rndkey1);
+ &aesenc ($inout5,$rndkey1);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_encrypt6_enter");
+
+ &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,$rndkey0);
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+
+ &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
+ &movdqu (&QWP(16*1,$out,$inp),$inout1);
+ &movdqu (&QWP(16*2,$out,$inp),$inout2);
+ &movdqu (&QWP(16*3,$out,$inp),$inout3);
+ &movdqu (&QWP(16*4,$out,$inp),$inout4);
+
+ &jmp (&label("done"));
+
+&set_label("one",16);
+ &movdqu ($inout5,&QWP(0,$l_));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &mov ($rounds,&DWP(240,$key));
+
+ &pxor ($inout5,$rndkey0); # ^ last offset_i
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$inout5); # ^ offset_i
+
+ &movdqa ($inout4,$rndkey1);
+ &mov ($out,&DWP($out_off,"esp"));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &xorps ($inout0,$inout5); # ^ offset_i
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movdqa ($rndkey1,$inout4); # pass the checksum
+ &movups (&QWP(0,$out,$inp),$inout0);
+
+ &jmp (&label("done"));
+
+&set_label("two",16);
+ &lea ($i1,&DWP(1,$block));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &bsf ($i1,$i1);
+ &shl ($i1,4);
+ &movdqu ($inout4,&QWP(0,$l_));
+ &movdqu ($inout5,&QWP(0,$l_,$i1));
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &pxor ($inout4,$rndkey0); # ^ last offset_i
+ &pxor ($inout5,$inout4);
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$inout4); # ^ offset_i
+ &pxor ($rndkey1,$inout1);
+ &pxor ($inout1,$inout5);
+
+ &movdqa ($inout3,$rndkey1)
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_encrypt2");
+
+ &xorps ($inout0,$inout4); # ^ offset_i
+ &xorps ($inout1,$inout5);
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movdqa ($rndkey1,$inout3); # pass the checksum
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+
+ &jmp (&label("done"));
+
+&set_label("three",16);
+ &lea ($i1,&DWP(1,$block));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &bsf ($i1,$i1);
+ &shl ($i1,4);
+ &movdqu ($inout3,&QWP(0,$l_));
+ &movdqu ($inout4,&QWP(0,$l_,$i1));
+ &movdqa ($inout5,$inout3);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &pxor ($inout3,$rndkey0); # ^ last offset_i
+ &pxor ($inout4,$inout3);
+ &pxor ($inout5,$inout4);
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,$inout3); # ^ offset_i
+ &pxor ($rndkey1,$inout1);
+ &pxor ($inout1,$inout4);
+ &pxor ($rndkey1,$inout2);
+ &pxor ($inout2,$inout5);
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # ^ offset_i
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+ &movups (&QWP(16*2,$out,$inp),$inout2);
+
+ &jmp (&label("done"));
+
+&set_label("four",16);
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &movdqu ($inout2,&QWP(0,$l_));
+ &movdqu ($inout3,&QWP(0,$l_,$i1));
+ &movdqa ($inout4,$inout2);
+ &movdqu ($inout5,&QWP(0,$l_,$i3));
+
+ &pxor ($inout2,$rndkey0); # ^ last offset_i
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &pxor ($inout3,$inout2);
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*0,"esp"),$inout2);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*1,"esp"),$inout3);
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($rndkey1,$inout1);
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($rndkey1,$inout2);
+ &pxor ($inout2,$inout4);
+ &pxor ($rndkey1,$inout3);
+ &pxor ($inout3,$inout5);
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1)
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_encrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout4);
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &xorps ($inout3,$inout5);
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movups (&QWP(16*2,$out,$inp),$inout2);
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+ &movups (&QWP(16*3,$out,$inp),$inout3);
+
+&set_label("done");
+ &mov ($key,&DWP($esp_off,"esp"));
+ &pxor ($inout0,$inout0); # clear register bank
+ &pxor ($inout1,$inout1);
+ &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
+ &pxor ($inout2,$inout2);
+ &movdqa (&QWP(16*1,"esp"),$inout0);
+ &pxor ($inout3,$inout3);
+ &movdqa (&QWP(16*2,"esp"),$inout0);
+ &pxor ($inout4,$inout4);
+ &movdqa (&QWP(16*3,"esp"),$inout0);
+ &pxor ($inout5,$inout5);
+ &movdqa (&QWP(16*4,"esp"),$inout0);
+ &movdqa (&QWP(16*5,"esp"),$inout0);
+ &movdqa (&QWP(16*6,"esp"),$inout0);
+
+ &lea ("esp",&DWP(0,$key));
+ &mov ($rounds,&wparam(5)); # &offset_i
+ &mov ($rounds_,&wparam(7)); # &checksum
+ &movdqu (&QWP(0,$rounds),$rndkey0);
+ &pxor ($rndkey0,$rndkey0);
+ &movdqu (&QWP(0,$rounds_),$rndkey1);
+ &pxor ($rndkey1,$rndkey1);
+&function_end("aesni_ocb_encrypt");
+
+&function_begin("aesni_ocb_decrypt");
+ &mov ($rounds,&wparam(5)); # &offset_i
+ &mov ($rounds_,&wparam(7)); # &checksum
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
+ &mov ($block,&wparam(4)); # start_block_num
+ &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
+ &mov ($l_,&wparam(6)); # L_
+
+ &mov ($rounds,"esp");
+ &sub ("esp",$esp_off+4); # alloca
+ &and ("esp",-16); # align stack
+
+ &sub ($out,$inp);
+ &shl ($len,4);
+ &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
+ &mov (&DWP($out_off,"esp"),$out);
+ &mov (&DWP($end_off,"esp"),$len);
+ &mov (&DWP($esp_off,"esp"),$rounds);
+
+ &mov ($rounds,&DWP(240,$key));
+
+ &test ($block,1);
+ &jnz (&label("odd"));
+
+ &bsf ($i3,$block);
+ &add ($block,1);
+ &shl ($i3,4);
+ &movdqu ($inout5,&QWP(0,$l_,$i3));
+ &mov ($i3,$key); # put aside key
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16,$inp));
+
+ &pxor ($inout5,$rndkey0); # ^ last offset_i
+ &pxor ($inout0,$inout5); # ^ offset_i
+
+ &movdqa ($inout4,$rndkey1);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+
+ &xorps ($inout0,$inout5); # ^ offset_i
+ &movaps ($rndkey1,$inout4); # pass the checksum
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &xorps ($rndkey1,$inout0); # checksum
+ &movups (&QWP(-16,$out,$inp),$inout0); # store output
+
+ &mov ($rounds,&DWP(240,$i3));
+ &mov ($key,$i3); # restore key
+ &mov ($len,&DWP($end_off,"esp"));
+
+&set_label("odd");
+ &shl ($rounds,4);
+ &mov ($out,16);
+ &sub ($out,$rounds); # twisted rounds
+ &mov (&DWP($key_off,"esp"),$key);
+ &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
+ &mov (&DWP($rounds_off,"esp"),$out);
+
+ &cmp ($inp,$len);
+ &ja (&label("short"));
+ &jmp (&label("grandloop"));
+
+&set_label("grandloop",32);
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &lea ($i5,&DWP(5,$block));
+ &add ($block,6);
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &bsf ($i5,$i5);
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &shl ($i5,4);
+ &movdqu ($inout0,&QWP(0,$l_));
+ &movdqu ($inout1,&QWP(0,$l_,$i1));
+ &mov ($rounds,&DWP($rounds_off,"esp"));
+ &movdqa ($inout2,$inout0);
+ &movdqu ($inout3,&QWP(0,$l_,$i3));
+ &movdqa ($inout4,$inout0);
+ &movdqu ($inout5,&QWP(0,$l_,$i5));
+
+ &pxor ($inout0,$rndkey0); # ^ last offset_i
+ &pxor ($inout1,$inout0);
+ &movdqa (&QWP(16*0,"esp"),$inout0);
+ &pxor ($inout2,$inout1);
+ &movdqa (&QWP(16*1,"esp"),$inout1);
+ &pxor ($inout3,$inout2);
+ &movdqa (&QWP(16*2,"esp"),$inout2);
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*3,"esp"),$inout3);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*4,"esp"),$inout4);
+ &movdqa (&QWP(16*5,"esp"),$inout5);
+
+ &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &movdqu ($inout5,&QWP(16*5,$inp));
+ &lea ($inp,&DWP(16*6,$inp));
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+ &pxor ($inout0,$rndkey0); # ^ roundkey[0]
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($inout3,$rndkey0);
+ &pxor ($inout4,$rndkey0);
+ &pxor ($inout5,$rndkey0);
+
+ &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &pxor ($inout5,&QWP(16*5,"esp"));
+
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &aesdec ($inout0,$rndkey1);
+ &aesdec ($inout1,$rndkey1);
+ &aesdec ($inout2,$rndkey1);
+ &aesdec ($inout3,$rndkey1);
+ &aesdec ($inout4,$rndkey1);
+ &aesdec ($inout5,$rndkey1);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &mov ($len,&DWP($end_off,"esp"));
+ &call ("_aesni_decrypt6_enter");
+
+ &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &pxor ($inout5,$rndkey0);
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
+ &pxor ($rndkey1,$inout1);
+ &movdqu (&QWP(-16*5,$out,$inp),$inout1);
+ &pxor ($rndkey1,$inout2);
+ &movdqu (&QWP(-16*4,$out,$inp),$inout2);
+ &pxor ($rndkey1,$inout3);
+ &movdqu (&QWP(-16*3,$out,$inp),$inout3);
+ &pxor ($rndkey1,$inout4);
+ &movdqu (&QWP(-16*2,$out,$inp),$inout4);
+ &pxor ($rndkey1,$inout5);
+ &movdqu (&QWP(-16*1,$out,$inp),$inout5);
+ &cmp ($inp,$len); # done yet?
+ &jb (&label("grandloop"));
+
+&set_label("short");
+ &add ($len,16*6);
+ &sub ($len,$inp);
+ &jz (&label("done"));
+
+ &cmp ($len,16*2);
+ &jb (&label("one"));
+ &je (&label("two"));
+
+ &cmp ($len,16*4);
+ &jb (&label("three"));
+ &je (&label("four"));
+
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &movdqu ($inout0,&QWP(0,$l_));
+ &movdqu ($inout1,&QWP(0,$l_,$i1));
+ &mov ($rounds,&DWP($rounds_off,"esp"));
+ &movdqa ($inout2,$inout0);
+ &movdqu ($inout3,&QWP(0,$l_,$i3));
+ &movdqa ($inout4,$inout0);
+
+ &pxor ($inout0,$rndkey0); # ^ last offset_i
+ &pxor ($inout1,$inout0);
+ &movdqa (&QWP(16*0,"esp"),$inout0);
+ &pxor ($inout2,$inout1);
+ &movdqa (&QWP(16*1,"esp"),$inout1);
+ &pxor ($inout3,$inout2);
+ &movdqa (&QWP(16*2,"esp"),$inout2);
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*3,"esp"),$inout3);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*4,"esp"),$inout4);
+
+ &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout5,$inout5);
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+ &pxor ($inout0,$rndkey0); # ^ roundkey[0]
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($inout3,$rndkey0);
+ &pxor ($inout4,$rndkey0);
+
+ &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,&QWP(16*4,"esp"));
+
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &aesdec ($inout0,$rndkey1);
+ &aesdec ($inout1,$rndkey1);
+ &aesdec ($inout2,$rndkey1);
+ &aesdec ($inout3,$rndkey1);
+ &aesdec ($inout4,$rndkey1);
+ &aesdec ($inout5,$rndkey1);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_decrypt6_enter");
+
+ &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &pxor ($inout4,$rndkey0);
+
+ &pxor ($rndkey1,$inout0); # checksum
+ &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
+ &pxor ($rndkey1,$inout1);
+ &movdqu (&QWP(16*1,$out,$inp),$inout1);
+ &pxor ($rndkey1,$inout2);
+ &movdqu (&QWP(16*2,$out,$inp),$inout2);
+ &pxor ($rndkey1,$inout3);
+ &movdqu (&QWP(16*3,$out,$inp),$inout3);
+ &pxor ($rndkey1,$inout4);
+ &movdqu (&QWP(16*4,$out,$inp),$inout4);
+
+ &jmp (&label("done"));
+
+&set_label("one",16);
+ &movdqu ($inout5,&QWP(0,$l_));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &mov ($rounds,&DWP(240,$key));
+
+ &pxor ($inout5,$rndkey0); # ^ last offset_i
+ &pxor ($inout0,$inout5); # ^ offset_i
+
+ &movdqa ($inout4,$rndkey1);
+ &mov ($out,&DWP($out_off,"esp"));
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+
+ &xorps ($inout0,$inout5); # ^ offset_i
+ &movaps ($rndkey1,$inout4); # pass the checksum
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &xorps ($rndkey1,$inout0); # checksum
+ &movups (&QWP(0,$out,$inp),$inout0);
+
+ &jmp (&label("done"));
+
+&set_label("two",16);
+ &lea ($i1,&DWP(1,$block));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &bsf ($i1,$i1);
+ &shl ($i1,4);
+ &movdqu ($inout4,&QWP(0,$l_));
+ &movdqu ($inout5,&QWP(0,$l_,$i1));
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &movdqa ($inout3,$rndkey1);
+ &pxor ($inout4,$rndkey0); # ^ last offset_i
+ &pxor ($inout5,$inout4);
+
+ &pxor ($inout0,$inout4); # ^ offset_i
+ &pxor ($inout1,$inout5);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_decrypt2");
+
+ &xorps ($inout0,$inout4); # ^ offset_i
+ &xorps ($inout1,$inout5);
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &xorps ($inout3,$inout0); # checksum
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &xorps ($inout3,$inout1);
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+ &movaps ($rndkey1,$inout3); # pass the checksum
+
+ &jmp (&label("done"));
+
+&set_label("three",16);
+ &lea ($i1,&DWP(1,$block));
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &bsf ($i1,$i1);
+ &shl ($i1,4);
+ &movdqu ($inout3,&QWP(0,$l_));
+ &movdqu ($inout4,&QWP(0,$l_,$i1));
+ &movdqa ($inout5,$inout3);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+ &pxor ($inout3,$rndkey0); # ^ last offset_i
+ &pxor ($inout4,$inout3);
+ &pxor ($inout5,$inout4);
+
+ &pxor ($inout0,$inout3); # ^ offset_i
+ &pxor ($inout1,$inout4);
+ &pxor ($inout2,$inout5);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_decrypt3");
+
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+ &xorps ($inout0,$inout3); # ^ offset_i
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &pxor ($rndkey1,$inout0); # checksum
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+ &pxor ($rndkey1,$inout1);
+ &movups (&QWP(16*2,$out,$inp),$inout2);
+ &pxor ($rndkey1,$inout2);
+
+ &jmp (&label("done"));
+
+&set_label("four",16);
+ &lea ($i1,&DWP(1,$block));
+ &lea ($i3,&DWP(3,$block));
+ &bsf ($i1,$i1);
+ &bsf ($i3,$i3);
+ &mov ($key,&DWP($key_off,"esp")); # restore key
+ &shl ($i1,4);
+ &shl ($i3,4);
+ &movdqu ($inout2,&QWP(0,$l_));
+ &movdqu ($inout3,&QWP(0,$l_,$i1));
+ &movdqa ($inout4,$inout2);
+ &movdqu ($inout5,&QWP(0,$l_,$i3));
+
+ &pxor ($inout2,$rndkey0); # ^ last offset_i
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &pxor ($inout3,$inout2);
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &pxor ($inout4,$inout3);
+ &movdqa (&QWP(16*0,"esp"),$inout2);
+ &pxor ($inout5,$inout4);
+ &movdqa (&QWP(16*1,"esp"),$inout3);
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &mov ($rounds,&DWP(240,$key));
+
+ &movdqa (&QWP($checksum,"esp"),$rndkey1);
+ &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &pxor ($inout2,$inout4);
+ &pxor ($inout3,$inout5);
+
+ &mov ($out,&DWP($out_off,"esp"));
+ &call ("_aesni_decrypt4");
+
+ &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+ &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout4);
+ &movups (&QWP(16*0,$out,$inp),$inout0); # store output
+ &pxor ($rndkey1,$inout0); # checksum
+ &xorps ($inout3,$inout5);
+ &movups (&QWP(16*1,$out,$inp),$inout1);
+ &pxor ($rndkey1,$inout1);
+ &movdqa ($rndkey0,$inout5); # pass last offset_i
+ &movups (&QWP(16*2,$out,$inp),$inout2);
+ &pxor ($rndkey1,$inout2);
+ &movups (&QWP(16*3,$out,$inp),$inout3);
+ &pxor ($rndkey1,$inout3);
+
+&set_label("done");
+ &mov ($key,&DWP($esp_off,"esp"));
+ &pxor ($inout0,$inout0); # clear register bank
+ &pxor ($inout1,$inout1);
+ &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
+ &pxor ($inout2,$inout2);
+ &movdqa (&QWP(16*1,"esp"),$inout0);
+ &pxor ($inout3,$inout3);
+ &movdqa (&QWP(16*2,"esp"),$inout0);
+ &pxor ($inout4,$inout4);
+ &movdqa (&QWP(16*3,"esp"),$inout0);
+ &pxor ($inout5,$inout5);
+ &movdqa (&QWP(16*4,"esp"),$inout0);
+ &movdqa (&QWP(16*5,"esp"),$inout0);
+ &movdqa (&QWP(16*6,"esp"),$inout0);
+
+ &lea ("esp",&DWP(0,$key));
+ &mov ($rounds,&wparam(5)); # &offset_i
+ &mov ($rounds_,&wparam(7)); # &checksum
+ &movdqu (&QWP(0,$rounds),$rndkey0);
+ &pxor ($rndkey0,$rndkey0);
+ &movdqu (&QWP(0,$rounds_),$rndkey1);
+ &pxor ($rndkey1,$rndkey1);
+&function_end("aesni_ocb_decrypt");
+}
+}
+
+######################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+&function_begin("${PREFIX}_cbc_encrypt");
+ &mov ($inp,&wparam(0));
+ &mov ($rounds_,"esp");
+ &mov ($out,&wparam(1));
+ &sub ($rounds_,24);
+ &mov ($len,&wparam(2));
+ &and ($rounds_,-16);
+ &mov ($key,&wparam(3));
+ &mov ($key_,&wparam(4));
+ &test ($len,$len);
+ &jz (&label("cbc_abort"));
+
+ &cmp (&wparam(5),0);
+ &xchg ($rounds_,"esp"); # alloca
+ &movups ($ivec,&QWP(0,$key_)); # load IV
+ &mov ($rounds,&DWP(240,$key));
+ &mov ($key_,$key); # backup $key
+ &mov (&DWP(16,"esp"),$rounds_); # save original %esp
+ &mov ($rounds_,$rounds); # backup $rounds
+ &je (&label("cbc_decrypt"));
+
+ &movaps ($inout0,$ivec);
+ &cmp ($len,16);
+ &jb (&label("cbc_enc_tail"));
+ &sub ($len,16);
+ &jmp (&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+ &movups ($ivec,&QWP(0,$inp)); # input actually
+ &lea ($inp,&DWP(16,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc",$inout0,$ivec); }
+ else
+ { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
+ &mov ($rounds,$rounds_); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &movups (&QWP(0,$out),$inout0); # store output
+ &lea ($out,&DWP(16,$out));
+ &sub ($len,16);
+ &jnc (&label("cbc_enc_loop"));
+ &add ($len,16);
+ &jnz (&label("cbc_enc_tail"));
+ &movaps ($ivec,$inout0);
+ &pxor ($inout0,$inout0);
+ &jmp (&label("cbc_ret"));
+
+&set_label("cbc_enc_tail");
+ &mov ("ecx",$len); # zaps $rounds
+ &data_word(0xA4F3F689); # rep movsb
+ &mov ("ecx",16); # zero tail
+ &sub ("ecx",$len);
+ &xor ("eax","eax"); # zaps $len
+ &data_word(0xAAF3F689); # rep stosb
+ &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
+ &mov ($rounds,$rounds_); # restore $rounds
+ &mov ($inp,$out); # $inp and $out are the same
+ &mov ($key,$key_); # restore $key
+ &jmp (&label("cbc_enc_loop"));
+######################################################################
+&set_label("cbc_decrypt",16);
+ &cmp ($len,0x50);
+ &jbe (&label("cbc_dec_tail"));
+ &movaps (&QWP(0,"esp"),$ivec); # save IV
+ &sub ($len,0x50);
+ &jmp (&label("cbc_dec_loop6_enter"));
+
+&set_label("cbc_dec_loop6",16);
+ &movaps (&QWP(0,"esp"),$rndkey0); # save IV
+ &movups (&QWP(0,$out),$inout5);
+ &lea ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_loop6_enter");
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+
+ &call ("_aesni_decrypt6");
+
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,&QWP(0,"esp")); # ^=IV
+ &xorps ($inout1,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout2,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout3,$rndkey1);
+ &movups ($rndkey1,&QWP(0x40,$inp));
+ &xorps ($inout4,$rndkey0);
+ &movups ($rndkey0,&QWP(0x50,$inp)); # IV
+ &xorps ($inout5,$rndkey1);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &lea ($inp,&DWP(0x60,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &mov ($rounds,$rounds_); # restore $rounds
+ &movups (&QWP(0x30,$out),$inout3);
+ &mov ($key,$key_); # restore $key
+ &movups (&QWP(0x40,$out),$inout4);
+ &lea ($out,&DWP(0x50,$out));
+ &sub ($len,0x60);
+ &ja (&label("cbc_dec_loop6"));
+
+ &movaps ($inout0,$inout5);
+ &movaps ($ivec,$rndkey0);
+ &add ($len,0x50);
+ &jle (&label("cbc_dec_clear_tail_collected"));
+ &movups (&QWP(0,$out),$inout0);
+ &lea ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &movaps ($in0,$inout0);
+ &cmp ($len,0x10);
+ &jbe (&label("cbc_dec_one"));
+
+ &movups ($inout1,&QWP(0x10,$inp));
+ &movaps ($in1,$inout1);
+ &cmp ($len,0x20);
+ &jbe (&label("cbc_dec_two"));
+
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x30);
+ &jbe (&label("cbc_dec_three"));
+
+ &movups ($inout3,&QWP(0x30,$inp));
+ &cmp ($len,0x40);
+ &jbe (&label("cbc_dec_four"));
+
+ &movups ($inout4,&QWP(0x40,$inp));
+ &movaps (&QWP(0,"esp"),$ivec); # save IV
+ &movups ($inout0,&QWP(0,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_decrypt6");
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,&QWP(0,"esp")); # ^= IV
+ &xorps ($inout1,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout2,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout3,$rndkey1);
+ &movups ($ivec,&QWP(0x40,$inp)); # IV
+ &xorps ($inout4,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &pxor ($inout1,$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &pxor ($inout2,$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &pxor ($inout3,$inout3);
+ &lea ($out,&DWP(0x40,$out));
+ &movaps ($inout0,$inout4);
+ &pxor ($inout4,$inout4);
+ &sub ($len,0x50);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$ivec);
+ &movaps ($ivec,$in0);
+ &sub ($len,0x10);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_two",16);
+ &call ("_aesni_decrypt2");
+ &xorps ($inout0,$ivec);
+ &xorps ($inout1,$in0);
+ &movups (&QWP(0,$out),$inout0);
+ &movaps ($inout0,$inout1);
+ &pxor ($inout1,$inout1);
+ &lea ($out,&DWP(0x10,$out));
+ &movaps ($ivec,$in1);
+ &sub ($len,0x20);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_three",16);
+ &call ("_aesni_decrypt3");
+ &xorps ($inout0,$ivec);
+ &xorps ($inout1,$in0);
+ &xorps ($inout2,$in1);
+ &movups (&QWP(0,$out),$inout0);
+ &movaps ($inout0,$inout2);
+ &pxor ($inout2,$inout2);
+ &movups (&QWP(0x10,$out),$inout1);
+ &pxor ($inout1,$inout1);
+ &lea ($out,&DWP(0x20,$out));
+ &movups ($ivec,&QWP(0x20,$inp));
+ &sub ($len,0x30);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_four",16);
+ &call ("_aesni_decrypt4");
+ &movups ($rndkey1,&QWP(0x10,$inp));
+ &movups ($rndkey0,&QWP(0x20,$inp));
+ &xorps ($inout0,$ivec);
+ &movups ($ivec,&QWP(0x30,$inp));
+ &xorps ($inout1,$in0);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &pxor ($inout1,$inout1);
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0x20,$out),$inout2);
+ &pxor ($inout2,$inout2);
+ &lea ($out,&DWP(0x30,$out));
+ &movaps ($inout0,$inout3);
+ &pxor ($inout3,$inout3);
+ &sub ($len,0x40);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_clear_tail_collected",16);
+ &pxor ($inout1,$inout1);
+ &pxor ($inout2,$inout2);
+ &pxor ($inout3,$inout3);
+ &pxor ($inout4,$inout4);
+&set_label("cbc_dec_tail_collected");
+ &and ($len,15);
+ &jnz (&label("cbc_dec_tail_partial"));
+ &movups (&QWP(0,$out),$inout0);
+ &pxor ($rndkey0,$rndkey0);
+ &jmp (&label("cbc_ret"));
+
+&set_label("cbc_dec_tail_partial",16);
+ &movaps (&QWP(0,"esp"),$inout0);
+ &pxor ($rndkey0,$rndkey0);
+ &mov ("ecx",16);
+ &mov ($inp,"esp");
+ &sub ("ecx",$len);
+ &data_word(0xA4F3F689); # rep movsb
+ &movdqa (&QWP(0,"esp"),$inout0);
+
+&set_label("cbc_ret");
+ &mov ("esp",&DWP(16,"esp")); # pull original %esp
+ &mov ($key_,&wparam(4));
+ &pxor ($inout0,$inout0);
+ &pxor ($rndkey1,$rndkey1);
+ &movups (&QWP(0,$key_),$ivec); # output IV
+ &pxor ($ivec,$ivec);
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+######################################################################
+# Mechanical port from aesni-x86_64.pl.
+#
+# _aesni_set_encrypt_key is private interface,
+# input:
+# "eax" const unsigned char *userKey
+# $rounds int bits
+# $key AES_KEY *key
+# output:
+# "eax" return code
+# $round rounds
+
+&function_begin_B("_aesni_set_encrypt_key");
+ &push ("ebp");
+ &push ("ebx");
+ &test ("eax","eax");
+ &jz (&label("bad_pointer"));
+ &test ($key,$key);
+ &jz (&label("bad_pointer"));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop("ebx");
+ &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
+
+ &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
+ &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
+ &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
+ &mov ("ebp",&DWP(4,"ebp"));
+ &lea ($key,&DWP(16,$key));
+ &and ("ebp",1<<28|1<<11); # AVX and XOP bits
+ &cmp ($rounds,256);
+ &je (&label("14rounds"));
+ &cmp ($rounds,192);
+ &je (&label("12rounds"));
+ &cmp ($rounds,128);
+ &jne (&label("bad_keybits"));
+
+&set_label("10rounds",16);
+ &cmp ("ebp",1<<28);
+ &je (&label("10rounds_alt"));
+
+ &mov ($rounds,9);
+ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
+ &aeskeygenassist("xmm1","xmm0",0x01); # round 1
+ &call (&label("key_128_cold"));
+ &aeskeygenassist("xmm1","xmm0",0x2); # round 2
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x04); # round 3
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x08); # round 4
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x10); # round 5
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x20); # round 6
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x40); # round 7
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x80); # round 8
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x36); # round 10
+ &call (&label("key_128"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(80,$key),$rounds);
+
+ &jmp (&label("good_key"));
+
+&set_label("key_128",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_128_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm1","xmm1",0b11111111); # critical path
+ &xorps ("xmm0","xmm1");
+ &ret();
+
+&set_label("10rounds_alt",16);
+ &movdqa ("xmm5",&QWP(0x00,"ebx"));
+ &mov ($rounds,8);
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &movdqa ("xmm2","xmm0");
+ &movdqu (&QWP(-16,$key),"xmm0");
+
+&set_label("loop_key128");
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+ &pslld ("xmm4",1);
+ &lea ($key,&DWP(16,$key));
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(-16,$key),"xmm0");
+ &movdqa ("xmm2","xmm0");
+
+ &dec ($rounds);
+ &jnz (&label("loop_key128"));
+
+ &movdqa ("xmm4",&QWP(0x30,"ebx"));
+
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+ &pslld ("xmm4",1);
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(0,$key),"xmm0");
+
+ &movdqa ("xmm2","xmm0");
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(16,$key),"xmm0");
+
+ &mov ($rounds,9);
+ &mov (&DWP(96,$key),$rounds);
+
+ &jmp (&label("good_key"));
+
+&set_label("12rounds",16);
+ &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
+ &cmp ("ebp",1<<28);
+ &je (&label("12rounds_alt"));
+
+ &mov ($rounds,11);
+ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
+ &call (&label("key_192a_cold"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
+ &call (&label("key_192b"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(48,$key),$rounds);
+
+ &jmp (&label("good_key"));
+
+&set_label("key_192a",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_192a_cold",16);
+ &movaps ("xmm5","xmm2");
+&set_label("key_192b_warm");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &movdqa ("xmm3","xmm2");
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &pslldq ("xmm3",4);
+ &xorps ("xmm0","xmm4");
+ &pshufd ("xmm1","xmm1",0b01010101); # critical path
+ &pxor ("xmm2","xmm3");
+ &pxor ("xmm0","xmm1");
+ &pshufd ("xmm3","xmm0",0b11111111);
+ &pxor ("xmm2","xmm3");
+ &ret();
+
+&set_label("key_192b",16);
+ &movaps ("xmm3","xmm0");
+ &shufps ("xmm5","xmm0",0b01000100);
+ &$movekey (&QWP(0,$key),"xmm5");
+ &shufps ("xmm3","xmm2",0b01001110);
+ &$movekey (&QWP(16,$key),"xmm3");
+ &lea ($key,&DWP(32,$key));
+ &jmp (&label("key_192b_warm"));
+
+&set_label("12rounds_alt",16);
+ &movdqa ("xmm5",&QWP(0x10,"ebx"));
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &mov ($rounds,8);
+ &movdqu (&QWP(-16,$key),"xmm0");
+
+&set_label("loop_key192");
+ &movq (&QWP(0,$key),"xmm2");
+ &movdqa ("xmm1","xmm2");
+ &pshufb ("xmm2","xmm5");
+ &aesenclast ("xmm2","xmm4");
+ &pslld ("xmm4",1);
+ &lea ($key,&DWP(24,$key));
+
+ &movdqa ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm0","xmm3");
+
+ &pshufd ("xmm3","xmm0",0xff);
+ &pxor ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm3","xmm1");
+
+ &pxor ("xmm0","xmm2");
+ &pxor ("xmm2","xmm3");
+ &movdqu (&QWP(-16,$key),"xmm0");
+
+ &dec ($rounds);
+ &jnz (&label("loop_key192"));
+
+ &mov ($rounds,11);
+ &mov (&DWP(32,$key),$rounds);
+
+ &jmp (&label("good_key"));
+
+&set_label("14rounds",16);
+ &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
+ &lea ($key,&DWP(16,$key));
+ &cmp ("ebp",1<<28);
+ &je (&label("14rounds_alt"));
+
+ &mov ($rounds,13);
+ &$movekey (&QWP(-32,$key),"xmm0"); # round 0
+ &$movekey (&QWP(-16,$key),"xmm2"); # round 1
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 2
+ &call (&label("key_256a_cold"));
+ &aeskeygenassist("xmm1","xmm0",0x01); # round 3
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 4
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x02); # round 5
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 6
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x04); # round 7
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 8
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x08); # round 9
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 10
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x10); # round 11
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 12
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x20); # round 13
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 14
+ &call (&label("key_256a"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(16,$key),$rounds);
+ &xor ("eax","eax");
+
+ &jmp (&label("good_key"));
+
+&set_label("key_256a",16);
+ &$movekey (&QWP(0,$key),"xmm2");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_256a_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm1","xmm1",0b11111111); # critical path
+ &xorps ("xmm0","xmm1");
+ &ret();
+
+&set_label("key_256b",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+
+ &shufps ("xmm4","xmm2",0b00010000);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm4","xmm2",0b10001100);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm1","xmm1",0b10101010); # critical path
+ &xorps ("xmm2","xmm1");
+ &ret();
+
+&set_label("14rounds_alt",16);
+ &movdqa ("xmm5",&QWP(0x00,"ebx"));
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &mov ($rounds,7);
+ &movdqu (&QWP(-32,$key),"xmm0");
+ &movdqa ("xmm1","xmm2");
+ &movdqu (&QWP(-16,$key),"xmm2");
+
+&set_label("loop_key256");
+ &pshufb ("xmm2","xmm5");
+ &aesenclast ("xmm2","xmm4");
+
+ &movdqa ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm3","xmm0");
+ &pslldq ("xmm0",4);
+ &pxor ("xmm0","xmm3");
+ &pslld ("xmm4",1);
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(0,$key),"xmm0");
+
+ &dec ($rounds);
+ &jz (&label("done_key256"));
+
+ &pshufd ("xmm2","xmm0",0xff);
+ &pxor ("xmm3","xmm3");
+ &aesenclast ("xmm2","xmm3");
+
+ &movdqa ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm3","xmm1");
+ &pslldq ("xmm1",4);
+ &pxor ("xmm1","xmm3");
+
+ &pxor ("xmm2","xmm1");
+ &movdqu (&QWP(16,$key),"xmm2");
+ &lea ($key,&DWP(32,$key));
+ &movdqa ("xmm1","xmm2");
+ &jmp (&label("loop_key256"));
+
+&set_label("done_key256");
+ &mov ($rounds,13);
+ &mov (&DWP(16,$key),$rounds);
+
+&set_label("good_key");
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &xor ("eax","eax");
+ &pop ("ebx");
+ &pop ("ebp");
+ &ret ();
+
+&set_label("bad_pointer",4);
+ &mov ("eax",-1);
+ &pop ("ebx");
+ &pop ("ebp");
+ &ret ();
+&set_label("bad_keybits",4);
+ &pxor ("xmm0","xmm0");
+ &mov ("eax",-2);
+ &pop ("ebx");
+ &pop ("ebp");
+ &ret ();
+&function_end_B("_aesni_set_encrypt_key");
+
+# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key");
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &call ("_aesni_set_encrypt_key");
+ &ret ();
+&function_end_B("${PREFIX}_set_encrypt_key");
+
+# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_decrypt_key");
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &call ("_aesni_set_encrypt_key");
+ &mov ($key,&wparam(2));
+ &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
+ &test ("eax","eax");
+ &jnz (&label("dec_key_ret"));
+ &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
+
+ &$movekey ("xmm0",&QWP(0,$key)); # just swap
+ &$movekey ("xmm1",&QWP(0,"eax"));
+ &$movekey (&QWP(0,"eax"),"xmm0");
+ &$movekey (&QWP(0,$key),"xmm1");
+ &lea ($key,&DWP(16,$key));
+ &lea ("eax",&DWP(-16,"eax"));
+
+&set_label("dec_key_inverse");
+ &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
+ &$movekey ("xmm1",&QWP(0,"eax"));
+ &aesimc ("xmm0","xmm0");
+ &aesimc ("xmm1","xmm1");
+ &lea ($key,&DWP(16,$key));
+ &lea ("eax",&DWP(-16,"eax"));
+ &$movekey (&QWP(16,"eax"),"xmm0");
+ &$movekey (&QWP(-16,$key),"xmm1");
+ &cmp ("eax",$key);
+ &ja (&label("dec_key_inverse"));
+
+ &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
+ &aesimc ("xmm0","xmm0");
+ &$movekey (&QWP(0,$key),"xmm0");
+
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &xor ("eax","eax"); # return success
+&set_label("dec_key_ret");
+ &ret ();
+&function_end_B("${PREFIX}_set_decrypt_key");
+
+&set_label("key_const",64);
+&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
+&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
+&data_word(1,1,1,1);
+&data_word(0x1b,0x1b,0x1b,0x1b);
+&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aesni-x86_64.pl b/openssl-1.1.0h/crypto/aes/asm/aesni-x86_64.pl
new file mode 100644
index 0000000..98ca179
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aesni-x86_64.pl
@@ -0,0 +1,5060 @@
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for Intel AES-NI extension. In
+# OpenSSL context it's used with Intel engine, but can also be used as
+# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
+# details].
+#
+# Performance.
+#
+# Given aes(enc|dec) instructions' latency asymptotic performance for
+# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
+# processed with 128-bit key. And given their throughput asymptotic
+# performance for parallelizable modes is 1.25 cycles per byte. Being
+# asymptotic limit it's not something you commonly achieve in reality,
+# but how close does one get? Below are results collected for
+# different modes and block sized. Pairs of numbers are for en-/
+# decryption.
+#
+# 16-byte 64-byte 256-byte 1-KB 8-KB
+# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
+# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
+# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
+# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
+# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
+# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
+#
+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
+# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
+# The results were collected with specially crafted speed.c benchmark
+# in order to compare them with results reported in "Intel Advanced
+# Encryption Standard (AES) New Instruction Set" White Paper Revision
+# 3.0 dated May 2010. All above results are consistently better. This
+# module also provides better performance for block sizes smaller than
+# 128 bytes in points *not* represented in the above table.
+#
+# Looking at the results for 8-KB buffer.
+#
+# CFB and OFB results are far from the limit, because implementation
+# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
+# single-block aesni_encrypt, which is not the most optimal way to go.
+# CBC encrypt result is unexpectedly high and there is no documented
+# explanation for it. Seemingly there is a small penalty for feeding
+# the result back to AES unit the way it's done in CBC mode. There is
+# nothing one can do and the result appears optimal. CCM result is
+# identical to CBC, because CBC-MAC is essentially CBC encrypt without
+# saving output. CCM CTR "stays invisible," because it's neatly
+# interleaved wih CBC-MAC. This provides ~30% improvement over
+# "straghtforward" CCM implementation with CTR and CBC-MAC performed
+# disjointly. Parallelizable modes practically achieve the theoretical
+# limit.
+#
+# Looking at how results vary with buffer size.
+#
+# Curves are practically saturated at 1-KB buffer size. In most cases
+# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
+# CTR curve doesn't follow this pattern and is "slowest" changing one
+# with "256-byte" result being 87% of "8-KB." This is because overhead
+# in CTR mode is most computationally intensive. Small-block CCM
+# decrypt is slower than encrypt, because first CTR and last CBC-MAC
+# iterations can't be interleaved.
+#
+# Results for 192- and 256-bit keys.
+#
+# EVP-free results were observed to scale perfectly with number of
+# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
+# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
+# are a tad smaller, because the above mentioned penalty biases all
+# results by same constant value. In similar way function call
+# overhead affects small-block performance, as well as OFB and CFB
+# results. Differences are not large, most common coefficients are
+# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
+# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
+
+# January 2011
+#
+# While Westmere processor features 6 cycles latency for aes[enc|dec]
+# instructions, which can be scheduled every second cycle, Sandy
+# Bridge spends 8 cycles per instruction, but it can schedule them
+# every cycle. This means that code targeting Westmere would perform
+# suboptimally on Sandy Bridge. Therefore this update.
+#
+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
+# optimized. Relative improvement might appear modest, 8% on Westmere,
+# but in absolute terms it's 3.77 cycles per byte encrypted with
+# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
+# should be compared to asymptotic limits of 3.75 for Westmere and
+# 5.00 for Sandy Bridge. Actually, the fact that they get this close
+# to asymptotic limits is quite amazing. Indeed, the limit is
+# calculated as latency times number of rounds, 10 for 128-bit key,
+# and divided by 16, the number of bytes in block, or in other words
+# it accounts *solely* for aesenc instructions. But there are extra
+# instructions, and numbers so close to the asymptotic limits mean
+# that it's as if it takes as little as *one* additional cycle to
+# execute all of them. How is it possible? It is possible thanks to
+# out-of-order execution logic, which manages to overlap post-
+# processing of previous block, things like saving the output, with
+# actual encryption of current block, as well as pre-processing of
+# current block, things like fetching input and xor-ing it with
+# 0-round element of the key schedule, with actual encryption of
+# previous block. Keep this in mind...
+#
+# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
+# performance is achieved by interleaving instructions working on
+# independent blocks. In which case asymptotic limit for such modes
+# can be obtained by dividing above mentioned numbers by AES
+# instructions' interleave factor. Westmere can execute at most 3
+# instructions at a time, meaning that optimal interleave factor is 3,
+# and that's where the "magic" number of 1.25 come from. "Optimal
+# interleave factor" means that increase of interleave factor does
+# not improve performance. The formula has proven to reflect reality
+# pretty well on Westmere... Sandy Bridge on the other hand can
+# execute up to 8 AES instructions at a time, so how does varying
+# interleave factor affect the performance? Here is table for ECB
+# (numbers are cycles per byte processed with 128-bit key):
+#
+# instruction interleave factor 3x 6x 8x
+# theoretical asymptotic limit 1.67 0.83 0.625
+# measured performance for 8KB block 1.05 0.86 0.84
+#
+# "as if" interleave factor 4.7x 5.8x 6.0x
+#
+# Further data for other parallelizable modes:
+#
+# CBC decrypt 1.16 0.93 0.74
+# CTR 1.14 0.91 0.74
+#
+# Well, given 3x column it's probably inappropriate to call the limit
+# asymptotic, if it can be surpassed, isn't it? What happens there?
+# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
+# magic is responsible for this. Processor overlaps not only the
+# additional instructions with AES ones, but even AES instuctions
+# processing adjacent triplets of independent blocks. In the 6x case
+# additional instructions still claim disproportionally small amount
+# of additional cycles, but in 8x case number of instructions must be
+# a tad too high for out-of-order logic to cope with, and AES unit
+# remains underutilized... As you can see 8x interleave is hardly
+# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
+# utilizies 6x interleave because of limited register bank capacity.
+#
+# Higher interleave factors do have negative impact on Westmere
+# performance. While for ECB mode it's negligible ~1.5%, other
+# parallelizables perform ~5% worse, which is outweighed by ~25%
+# improvement on Sandy Bridge. To balance regression on Westmere
+# CTR mode was implemented with 6x aesenc interleave factor.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
+# in CTR mode AES instruction interleave factor was chosen to be 6x.
+
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
+# chosen to be 6x.
+
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+# CBC en-/decrypt CTR XTS ECB OCB
+# Westmere 3.77/1.25 1.25 1.25 1.26
+# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
+# Haswell 4.44/0.63 0.63 0.73 0.63 0.70
+# Skylake 2.62/0.63 0.63 0.63 0.63
+# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
+# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
+# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
+#
+# (*) Atom Silvermont ECB result is suboptimal because of penalties
+# incurred by operations on %xmm8-15. As ECB is not considered
+# critical, nothing was done to mitigate the problem.
+
+$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
+ # generates drop-in replacement for
+ # crypto/aes/asm/aes-x86_64.pl:-)
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
+@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+$code=".text\n";
+$code.=".extern OPENSSL_ia32cap_P\n";
+
+$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
+# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
+$inp="%rdi";
+$out="%rsi";
+$len="%rdx";
+$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
+$ivp="%r8"; # cbc, ctr, ...
+
+$rnds_="%r10d"; # backup copy for $rounds
+$key_="%r11"; # backup copy for $key
+
+# %xmm register layout
+$rndkey0="%xmm0"; $rndkey1="%xmm1";
+$inout0="%xmm2"; $inout1="%xmm3";
+$inout2="%xmm4"; $inout3="%xmm5";
+$inout4="%xmm6"; $inout5="%xmm7";
+$inout6="%xmm8"; $inout7="%xmm9";
+
+$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
+$in0="%xmm8"; $iv="%xmm9";
+
+# Inline version of internal aesni_[en|de]crypt1.
+#
+# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
+# cycles which take care of loop variables...
+{ my $sn;
+sub aesni_generate1 {
+my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+++$sn;
+$code.=<<___;
+ $movkey ($key),$rndkey0
+ $movkey 16($key),$rndkey1
+___
+$code.=<<___ if (defined($ivec));
+ xorps $rndkey0,$ivec
+ lea 32($key),$key
+ xorps $ivec,$inout
+___
+$code.=<<___ if (!defined($ivec));
+ lea 32($key),$key
+ xorps $rndkey0,$inout
+___
+$code.=<<___;
+.Loop_${p}1_$sn:
+ aes${p} $rndkey1,$inout
+ dec $rounds
+ $movkey ($key),$rndkey1
+ lea 16($key),$key
+ jnz .Loop_${p}1_$sn # loop body is 16 bytes
+ aes${p}last $rndkey1,$inout
+___
+}}
+# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+#
+{ my ($inp,$out,$key) = @_4args;
+
+$code.=<<___;
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_encrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movups $inout0,($out) # output
+ pxor $inout0,$inout0
+ ret
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_decrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movups $inout0,($out) # output
+ pxor $inout0,$inout0
+ ret
+.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
+___
+}
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why 3x subroutine were originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+# utilization, i.e. when subroutine's throughput is virtually same as
+# of non-interleaved subroutine [for number of input blocks up to 3].
+# This is why it originally made no sense to implement 2x subroutine.
+# But times change and it became appropriate to spend extra 192 bytes
+# on 2x subroutine on Atom Silvermont account. For processors that
+# can schedule aes[enc|dec] every cycle optimal interleave factor
+# equals to corresponding instructions latency. 8x is optimal for
+# * Bridge and "super-optimal" for other Intel CPUs...
+
+sub aesni_generate2 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-1] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt2,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt2:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
+
+.L${dir}_loop2:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop2
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ ret
+.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
+___
+}
+sub aesni_generate3 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-2] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt3,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt3:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
+
+.L${dir}_loop3:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop3
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ ret
+.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+___
+}
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4 block by ~30%. One can argue that one
+# should have implemented 5x as well, but improvement would be <20%,
+# so it's not worth it...
+sub aesni_generate4 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-3] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt4,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt4:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ xorps $rndkey0,$inout3
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ .byte 0x0f,0x1f,0x00
+ add \$16,%rax
+
+.L${dir}_loop4:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop4
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ ret
+.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+___
+}
+sub aesni_generate6 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-5] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt6,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt6:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout5
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
+ jmp .L${dir}_loop6_enter
+.align 16
+.L${dir}_loop6:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+.L${dir}_loop6_enter:
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop6
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ ret
+.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+___
+}
+sub aesni_generate8 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-7] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt8,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt8:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ pxor $rndkey0,$inout2
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout5
+ pxor $rndkey0,$inout6
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout7
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
+ jmp .L${dir}_loop8_inner
+.align 16
+.L${dir}_loop8:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+.L${dir}_loop8_inner:
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+.L${dir}_loop8_enter:
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ aes${dir} $rndkey0,$inout6
+ aes${dir} $rndkey0,$inout7
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop8
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ aes${dir}last $rndkey0,$inout6
+ aes${dir}last $rndkey0,$inout7
+ ret
+.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+___
+}
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+&aesni_generate8("enc") if ($PREFIX eq "aesni");
+&aesni_generate8("dec");
+
+if ($PREFIX eq "aesni") {
+########################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+# size_t length, const AES_KEY *key,
+# int enc);
+$code.=<<___;
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,\@function,5
+.align 16
+aesni_ecb_encrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # offload $inout4..7
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+ and \$-16,$len # if ($len<16)
+ jz .Lecb_ret # return
+
+ mov 240($key),$rounds # key->rounds
+ $movkey ($key),$rndkey0
+ mov $key,$key_ # backup $key
+ mov $rounds,$rnds_ # backup $rounds
+ test %r8d,%r8d # 5th argument
+ jz .Lecb_decrypt
+#--------------------------- ECB ENCRYPT ------------------------------#
+ cmp \$0x80,$len # if ($len<8*16)
+ jb .Lecb_enc_tail # short input
+
+ movdqu ($inp),$inout0 # load 8 input blocks
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp # $inp+=8*16
+ sub \$0x80,$len # $len-=8*16 (can be zero)
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups $inout0,($out) # store 8 output blocks
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0 # load 8 input blocks
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp # $inp+=8*16
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ sub \$0x80,$len
+ jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
+
+ movups $inout0,($out) # store 8 output blocks
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+ add \$0x80,$len # restore real remaining $len
+ jz .Lecb_ret # done if ($len==0)
+
+.Lecb_enc_tail: # $len is less than 8*16
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_enc_one
+ movups 0x10($inp),$inout1
+ je .Lecb_enc_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_enc_three
+ movups 0x30($inp),$inout3
+ je .Lecb_enc_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_enc_five
+ movups 0x50($inp),$inout5
+ je .Lecb_enc_six
+ movdqu 0x60($inp),$inout6
+ xorps $inout7,$inout7
+ call _aesni_encrypt8
+ movups $inout0,($out) # store 7 output blocks
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # store one output block
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ call _aesni_encrypt2
+ movups $inout0,($out) # store 2 output blocks
+ movups $inout1,0x10($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups $inout0,($out) # store 3 output blocks
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups $inout0,($out) # store 4 output blocks
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps $inout5,$inout5
+ call _aesni_encrypt6
+ movups $inout0,($out) # store 5 output blocks
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups $inout0,($out) # store 6 output blocks
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ jmp .Lecb_ret
+ #--------------------------- ECB DECRYPT ------------------------------#
+.align 16
+.Lecb_decrypt:
+ cmp \$0x80,$len # if ($len<8*16)
+ jb .Lecb_dec_tail # short input
+
+ movdqu ($inp),$inout0 # load 8 input blocks
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp # $inp+=8*16
+ sub \$0x80,$len # $len-=8*16 (can be zero)
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups $inout0,($out) # store 8 output blocks
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0 # load 8 input blocks
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp # $inp+=8*16
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ $movkey ($key_),$rndkey0
+ sub \$0x80,$len
+ jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
+
+ movups $inout0,($out) # store 8 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movups $inout3,0x30($out)
+ pxor $inout3,$inout3
+ movups $inout4,0x40($out)
+ pxor $inout4,$inout4
+ movups $inout5,0x50($out)
+ pxor $inout5,$inout5
+ movups $inout6,0x60($out)
+ pxor $inout6,$inout6
+ movups $inout7,0x70($out)
+ pxor $inout7,$inout7
+ lea 0x80($out),$out # $out+=8*16
+ add \$0x80,$len # restore real remaining $len
+ jz .Lecb_ret # done if ($len==0)
+
+.Lecb_dec_tail:
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_dec_one
+ movups 0x10($inp),$inout1
+ je .Lecb_dec_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_dec_three
+ movups 0x30($inp),$inout3
+ je .Lecb_dec_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_dec_five
+ movups 0x50($inp),$inout5
+ je .Lecb_dec_six
+ movups 0x60($inp),$inout6
+ $movkey ($key),$rndkey0
+ xorps $inout7,$inout7
+ call _aesni_decrypt8
+ movups $inout0,($out) # store 7 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movups $inout3,0x30($out)
+ pxor $inout3,$inout3
+ movups $inout4,0x40($out)
+ pxor $inout4,$inout4
+ movups $inout5,0x50($out)
+ pxor $inout5,$inout5
+ movups $inout6,0x60($out)
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # store one output block
+ pxor $inout0,$inout0 # clear register bank
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ call _aesni_decrypt2
+ movups $inout0,($out) # store 2 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups $inout0,($out) # store 3 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups $inout0,($out) # store 4 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movups $inout3,0x30($out)
+ pxor $inout3,$inout3
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ movups $inout0,($out) # store 5 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movups $inout3,0x30($out)
+ pxor $inout3,$inout3
+ movups $inout4,0x40($out)
+ pxor $inout4,$inout4
+ pxor $inout5,$inout5
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups $inout0,($out) # store 6 output blocks
+ pxor $inout0,$inout0 # clear register bank
+ movups $inout1,0x10($out)
+ pxor $inout1,$inout1
+ movups $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movups $inout3,0x30($out)
+ pxor $inout3,$inout3
+ movups $inout4,0x40($out)
+ pxor $inout4,$inout4
+ movups $inout5,0x50($out)
+ pxor $inout5,$inout5
+
+.Lecb_ret:
+ xorps $rndkey0,$rndkey0 # %xmm0
+ pxor $rndkey1,$rndkey1
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+.Lecb_enc_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
+___
+
+{
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize CMAC value
+# (see engine/eng_aesni.c for details)
+#
+{
+my $cmac="%r9"; # 6th argument
+
+my $increment="%xmm9";
+my $iv="%xmm6";
+my $bswap_mask="%xmm7";
+
+$code.=<<___;
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in0
+ movaps %xmm9,0x30(%rsp) # $increment
+.Lccm64_enc_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movdqu ($ivp),$iv
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ shl \$4,$rounds
+ mov \$16,$rnds_
+ lea 0($key),$key_
+ movdqu ($cmac),$inout1
+ movdqa $iv,$inout0
+ lea 32($key,$rounds),$key # end of key schedule
+ pshufb $bswap_mask,$iv
+ sub %rax,%r10 # twisted $rounds
+ jmp .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+ $movkey ($key_),$rndkey0
+ mov %r10,%rax
+ movups ($inp),$in0 # load inp
+
+ xorps $rndkey0,$inout0 # counter
+ $movkey 16($key_),$rndkey1
+ xorps $in0,$rndkey0
+ xorps $rndkey0,$inout1 # cmac^=inp
+ $movkey 32($key_),$rndkey0
+
+.Lccm64_enc2_loop:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Lccm64_enc2_loop
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ paddq $increment,$iv
+ dec $len # $len-- ($len is in blocks)
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+
+ lea 16($inp),$inp
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ pshufb $bswap_mask,$inout0
+ lea 16($out),$out # $out+=16
+ jnz .Lccm64_enc_outer # loop if ($len!=0)
+
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+.Lccm64_enc_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+___
+######################################################################
+$code.=<<___;
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_decrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in8
+ movaps %xmm9,0x30(%rsp) # $increment
+.Lccm64_dec_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movups ($ivp),$iv
+ movdqu ($cmac),$inout1
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ movaps $iv,$inout0
+ mov $rounds,$rnds_
+ mov $key,$key_
+ pshufb $bswap_mask,$iv
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ shl \$4,$rnds_
+ mov \$16,$rounds
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ lea 16($inp),$inp # $inp+=16
+ sub %r10,%rax # twisted $rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ mov %rax,%r10
+ jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ lea 16($out),$out # $out+=16
+ pshufb $bswap_mask,$inout0
+
+ sub \$1,$len # $len-- ($len is in blocks)
+ jz .Lccm64_dec_break # if ($len==0) break
+
+ $movkey ($key_),$rndkey0
+ mov %r10,%rax
+ $movkey 16($key_),$rndkey1
+ xorps $rndkey0,$in0
+ xorps $rndkey0,$inout0
+ xorps $in0,$inout1 # cmac^=out
+ $movkey 32($key_),$rndkey0
+ jmp .Lccm64_dec2_loop
+.align 16
+.Lccm64_dec2_loop:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Lccm64_dec2_loop
+ movups ($inp),$in0 # load input
+ paddq $increment,$iv
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+ lea 16($inp),$inp # $inp+=16
+ jmp .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+ #xorps $in0,$inout1 # cmac^=out
+ mov 240($key_),$rounds
+___
+ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+.Lccm64_dec_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+___
+}
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see crypto/modes/ctr128.c for details)
+#
+# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
+# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
+# Keywords are full unroll and modulo-schedule counter calculations
+# with zero-round key xor.
+{
+my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
+my ($key0,$ctr)=("${key_}d","${ivp}d");
+my $frame_size = 0x80 + ($win64?160:0);
+
+$code.=<<___;
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,\@function,5
+.align 16
+aesni_ctr32_encrypt_blocks:
+ cmp \$1,$len
+ jne .Lctr32_bulk
+
+ # handle single block without allocating stack frame,
+ # useful when handling edges
+ movups ($ivp),$inout0
+ movups ($inp),$inout1
+ mov 240($key),%edx # key->rounds
+___
+ &aesni_generate1("enc",$key,"%edx");
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ xorps $inout1,$inout0
+ pxor $inout1,$inout1
+ movups $inout0,($out)
+ xorps $inout0,$inout0
+ jmp .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8(%rax) # offload everything
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
+.Lctr32_body:
+___
+$code.=<<___;
+ lea -8(%rax),%rbp
+
+ # 8 16-byte words on top of stack are counter values
+ # xor-ed with zero-round key
+
+ movdqu ($ivp),$inout0
+ movdqu ($key),$rndkey0
+ mov 12($ivp),$ctr # counter LSB
+ pxor $rndkey0,$inout0
+ mov 12($key),$key0 # 0-round key LSB
+ movdqa $inout0,0x00(%rsp) # populate counter block
+ bswap $ctr
+ movdqa $inout0,$inout1
+ movdqa $inout0,$inout2
+ movdqa $inout0,$inout3
+ movdqa $inout0,0x40(%rsp)
+ movdqa $inout0,0x50(%rsp)
+ movdqa $inout0,0x60(%rsp)
+ mov %rdx,%r10 # about to borrow %rdx
+ movdqa $inout0,0x70(%rsp)
+
+ lea 1($ctr),%rax
+ lea 2($ctr),%rdx
+ bswap %eax
+ bswap %edx
+ xor $key0,%eax
+ xor $key0,%edx
+ pinsrd \$3,%eax,$inout1
+ lea 3($ctr),%rax
+ movdqa $inout1,0x10(%rsp)
+ pinsrd \$3,%edx,$inout2
+ bswap %eax
+ mov %r10,%rdx # restore %rdx
+ lea 4($ctr),%r10
+ movdqa $inout2,0x20(%rsp)
+ xor $key0,%eax
+ bswap %r10d
+ pinsrd \$3,%eax,$inout3
+ xor $key0,%r10d
+ movdqa $inout3,0x30(%rsp)
+ lea 5($ctr),%r9
+ mov %r10d,0x40+12(%rsp)
+ bswap %r9d
+ lea 6($ctr),%r10
+ mov 240($key),$rounds # key->rounds
+ xor $key0,%r9d
+ bswap %r10d
+ mov %r9d,0x50+12(%rsp)
+ xor $key0,%r10d
+ lea 7($ctr),%r9
+ mov %r10d,0x60+12(%rsp)
+ bswap %r9d
+ mov OPENSSL_ia32cap_P+4(%rip),%r10d
+ xor $key0,%r9d
+ and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
+ mov %r9d,0x70+12(%rsp)
+
+ $movkey 0x10($key),$rndkey1
+
+ movdqa 0x40(%rsp),$inout4
+ movdqa 0x50(%rsp),$inout5
+
+ cmp \$8,$len # $len is in blocks
+ jb .Lctr32_tail # short input if ($len<8)
+
+ sub \$6,$len # $len is biased by -6
+ cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
+ je .Lctr32_6x # [which denotes Atom Silvermont]
+
+ lea 0x80($key),$key # size optimization
+ sub \$2,$len # $len is biased by -8
+ jmp .Lctr32_loop8
+
+.align 16
+.Lctr32_6x:
+ shl \$4,$rounds
+ mov \$48,$rnds_
+ bswap $key0
+ lea 32($key,$rounds),$key # end of key schedule
+ sub %rax,%r10 # twisted $rounds
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ add \$6,$ctr # next counter value
+ $movkey -48($key,$rnds_),$rndkey0
+ aesenc $rndkey1,$inout0
+ mov $ctr,%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout1
+ movbe %eax,`0x00+12`(%rsp) # store next counter value
+ lea 1($ctr),%eax
+ aesenc $rndkey1,$inout2
+ xor $key0,%eax
+ movbe %eax,`0x10+12`(%rsp)
+ aesenc $rndkey1,$inout3
+ lea 2($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout4
+ movbe %eax,`0x20+12`(%rsp)
+ lea 3($ctr),%eax
+ aesenc $rndkey1,$inout5
+ $movkey -32($key,$rnds_),$rndkey1
+ xor $key0,%eax
+
+ aesenc $rndkey0,$inout0
+ movbe %eax,`0x30+12`(%rsp)
+ lea 4($ctr),%eax
+ aesenc $rndkey0,$inout1
+ xor $key0,%eax
+ movbe %eax,`0x40+12`(%rsp)
+ aesenc $rndkey0,$inout2
+ lea 5($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey0,$inout3
+ movbe %eax,`0x50+12`(%rsp)
+ mov %r10,%rax # mov $rnds_,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,$rnds_),$rndkey0
+
+ call .Lenc_loop6
+
+ movdqu ($inp),$inout6 # load 6 input blocks
+ movdqu 0x10($inp),$inout7
+ movdqu 0x20($inp),$in0
+ movdqu 0x30($inp),$in1
+ movdqu 0x40($inp),$in2
+ movdqu 0x50($inp),$in3
+ lea 0x60($inp),$inp # $inp+=6*16
+ $movkey -64($key,$rnds_),$rndkey1
+ pxor $inout0,$inout6 # inp^=E(ctr)
+ movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
+ pxor $inout1,$inout7
+ movaps 0x10(%rsp),$inout1
+ pxor $inout2,$in0
+ movaps 0x20(%rsp),$inout2
+ pxor $inout3,$in1
+ movaps 0x30(%rsp),$inout3
+ pxor $inout4,$in2
+ movaps 0x40(%rsp),$inout4
+ pxor $inout5,$in3
+ movaps 0x50(%rsp),$inout5
+ movdqu $inout6,($out) # store 6 output blocks
+ movdqu $inout7,0x10($out)
+ movdqu $in0,0x20($out)
+ movdqu $in1,0x30($out)
+ movdqu $in2,0x40($out)
+ movdqu $in3,0x50($out)
+ lea 0x60($out),$out # $out+=6*16
+
+ sub \$6,$len
+ jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
+
+ add \$6,$len # restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+
+ lea -48($rnds_),$rounds
+ lea -80($key,$rnds_),$key # restore $key
+ neg $rounds
+ shr \$4,$rounds # restore $rounds
+ jmp .Lctr32_tail
+
+.align 32
+.Lctr32_loop8:
+ add \$8,$ctr # next counter value
+ movdqa 0x60(%rsp),$inout6
+ aesenc $rndkey1,$inout0
+ mov $ctr,%r9d
+ movdqa 0x70(%rsp),$inout7
+ aesenc $rndkey1,$inout1
+ bswap %r9d
+ $movkey 0x20-0x80($key),$rndkey0
+ aesenc $rndkey1,$inout2
+ xor $key0,%r9d
+ nop
+ aesenc $rndkey1,$inout3
+ mov %r9d,0x00+12(%rsp) # store next counter value
+ lea 1($ctr),%r9
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0x30-0x80($key),$rndkey1
+___
+for($i=2;$i<8;$i++) {
+my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
+$code.=<<___;
+ bswap %r9d
+ aesenc $rndkeyx,$inout0
+ aesenc $rndkeyx,$inout1
+ xor $key0,%r9d
+ .byte 0x66,0x90
+ aesenc $rndkeyx,$inout2
+ aesenc $rndkeyx,$inout3
+ mov %r9d,`0x10*($i-1)`+12(%rsp)
+ lea $i($ctr),%r9
+ aesenc $rndkeyx,$inout4
+ aesenc $rndkeyx,$inout5
+ aesenc $rndkeyx,$inout6
+ aesenc $rndkeyx,$inout7
+ $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
+___
+}
+$code.=<<___;
+ bswap %r9d
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ xor $key0,%r9d
+ movdqu 0x00($inp),$in0 # start loading input
+ aesenc $rndkey0,$inout3
+ mov %r9d,0x70+12(%rsp)
+ cmp \$11,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xa0-0x80($key),$rndkey0
+
+ jb .Lctr32_enc_done
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xb0-0x80($key),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xc0-0x80($key),$rndkey0
+ je .Lctr32_enc_done
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xd0-0x80($key),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xe0-0x80($key),$rndkey0
+ jmp .Lctr32_enc_done
+
+.align 16
+.Lctr32_enc_done:
+ movdqu 0x10($inp),$in1
+ pxor $rndkey0,$in0 # input^=round[last]
+ movdqu 0x20($inp),$in2
+ pxor $rndkey0,$in1
+ movdqu 0x30($inp),$in3
+ pxor $rndkey0,$in2
+ movdqu 0x40($inp),$in4
+ pxor $rndkey0,$in3
+ movdqu 0x50($inp),$in5
+ pxor $rndkey0,$in4
+ pxor $rndkey0,$in5
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
+ lea 0x80($inp),$inp # $inp+=8*16
+
+ aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
+ pxor $rndkey0,$rndkey1 # borrowed $rndkey
+ movdqu 0x70-0x80($inp),$in0
+ aesenclast $in1,$inout1
+ pxor $rndkey0,$in0
+ movdqa 0x00(%rsp),$in1 # load next counter block
+ aesenclast $in2,$inout2
+ aesenclast $in3,$inout3
+ movdqa 0x10(%rsp),$in2
+ movdqa 0x20(%rsp),$in3
+ aesenclast $in4,$inout4
+ aesenclast $in5,$inout5
+ movdqa 0x30(%rsp),$in4
+ movdqa 0x40(%rsp),$in5
+ aesenclast $rndkey1,$inout6
+ movdqa 0x50(%rsp),$rndkey0
+ $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
+ aesenclast $in0,$inout7
+
+ movups $inout0,($out) # store 8 output blocks
+ movdqa $in1,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in2,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in3,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in4,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in5,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey0,$inout5
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+
+ sub \$8,$len
+ jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
+
+ add \$8,$len # restore real remainig $len
+ jz .Lctr32_done # done if ($len==0)
+ lea -0x80($key),$key
+
+.Lctr32_tail:
+ # note that at this point $inout0..5 are populated with
+ # counter values xor-ed with 0-round key
+ lea 16($key),$key
+ cmp \$4,$len
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
+
+ # if ($len>4) compute 7 E(counter)
+ shl \$4,$rounds
+ movdqa 0x60(%rsp),$inout6
+ pxor $inout7,$inout7
+
+ $movkey 16($key),$rndkey0
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
+ neg %rax
+ aesenc $rndkey1,$inout2
+ add \$16,%rax # prepare for .Lenc_loop8_enter
+ movups ($inp),$in0
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ movups 0x10($inp),$in1 # pre-load input
+ movups 0x20($inp),$in2
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+
+ call .Lenc_loop8_enter
+
+ movdqu 0x30($inp),$in3
+ pxor $in0,$inout0
+ movdqu 0x40($inp),$in0
+ pxor $in1,$inout1
+ movdqu $inout0,($out) # store output
+ pxor $in2,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in3,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in0,$inout4
+ movdqu $inout3,0x30($out)
+ movdqu $inout4,0x40($out)
+ cmp \$6,$len
+ jb .Lctr32_done # $len was 5, stop store
+
+ movups 0x50($inp),$in1
+ xorps $in1,$inout5
+ movups $inout5,0x50($out)
+ je .Lctr32_done # $len was 6, stop store
+
+ movups 0x60($inp),$in2
+ xorps $in2,$inout6
+ movups $inout6,0x60($out)
+ jmp .Lctr32_done # $len was 7, stop store
+
+.align 32
+.Lctr32_loop4:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop4
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ movups ($inp),$in0 # load input
+ movups 0x10($inp),$in1
+ aesenclast $rndkey1,$inout2
+ aesenclast $rndkey1,$inout3
+ movups 0x20($inp),$in2
+ movups 0x30($inp),$in3
+
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ pxor $in2,$inout2
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout3
+ movdqu $inout3,0x30($out)
+ jmp .Lctr32_done # $len was 4, stop store
+
+.align 32
+.Lctr32_loop3:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop3
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ aesenclast $rndkey1,$inout2
+
+ movups ($inp),$in0 # load input
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ cmp \$2,$len
+ jb .Lctr32_done # $len was 1, stop store
+
+ movups 0x10($inp),$in1
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ je .Lctr32_done # $len was 2, stop store
+
+ movups 0x20($inp),$in2
+ xorps $in2,$inout2
+ movups $inout2,0x20($out) # $len was 3, stop store
+
+.Lctr32_done:
+ xorps %xmm0,%xmm0 # clear regiser bank
+ xor $key0,$key0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,0x70(%rsp)
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps -0xa0(%rbp),%xmm6
+ movaps %xmm0,-0xa0(%rbp) # clear stack
+ movaps -0x90(%rbp),%xmm7
+ movaps %xmm0,-0x90(%rbp)
+ movaps -0x80(%rbp),%xmm8
+ movaps %xmm0,-0x80(%rbp)
+ movaps -0x70(%rbp),%xmm9
+ movaps %xmm0,-0x70(%rbp)
+ movaps -0x60(%rbp),%xmm10
+ movaps %xmm0,-0x60(%rbp)
+ movaps -0x50(%rbp),%xmm11
+ movaps %xmm0,-0x50(%rbp)
+ movaps -0x40(%rbp),%xmm12
+ movaps %xmm0,-0x40(%rbp)
+ movaps -0x30(%rbp),%xmm13
+ movaps %xmm0,-0x30(%rbp)
+ movaps -0x20(%rbp),%xmm14
+ movaps %xmm0,-0x20(%rbp)
+ movaps -0x10(%rbp),%xmm15
+ movaps %xmm0,-0x10(%rbp)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+ movaps %xmm0,0x70(%rsp)
+___
+$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
+.Lctr32_epilogue:
+ ret
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
+}
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2
+# const unsigned char iv[16]);
+#
+{
+my @tweak=map("%xmm$_",(10..15));
+my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
+my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
+my $frame_size = 0x70 + ($win64?160:0);
+
+$code.=<<___;
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,\@function,6
+.align 16
+aesni_xts_encrypt:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8(%rax) # offload everything
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ lea -8(%rax),%rbp
+ movups ($ivp),$inout0 # load clear-text tweak
+ mov 240(%r8),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,$inout0);
+$code.=<<___;
+ $movkey ($key),$rndkey0 # zero round key
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+
+ movdqa .Lxts_magic(%rip),$twmask
+ movdqa $inout0,@tweak[5]
+ pshufd \$0x5f,$inout0,$twres
+ pxor $rndkey0,$rndkey1
+___
+ # alternative tweak calculation algorithm is based on suggestions
+ # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
+ # and should help in the future...
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ movdqa @tweak[5],@tweak[$i]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
+___
+ }
+$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
+ sub \$16*6,$len
+ jc .Lxts_enc_short # if $len-=6*16 borrowed
+
+ mov \$16+96,$rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ sub %r10,%rax # twisted $rounds
+ $movkey 16($key_),$rndkey1
+ mov %rax,%r10 # backup twisted $rounds
+ lea .Lxts_magic(%rip),%r8
+ jmp .Lxts_enc_grandloop
+
+.align 32
+.Lxts_enc_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqa $rndkey0,$twmask
+ movdqu `16*1`($inp),$inout1
+ pxor @tweak[0],$inout0 # input^=tweak^round[0]
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[1],$inout1
+ aesenc $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[2],$inout2
+ aesenc $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[3],$inout3
+ aesenc $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
+ pxor @tweak[4],$inout4
+ aesenc $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
+
+ pxor $twres,@tweak[0] # calclulate tweaks^round[last]
+ aesenc $rndkey1,$inout4
+ pxor $twres,@tweak[1]
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
+ aesenc $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $twres,@tweak[2]
+
+ aesenc $rndkey0,$inout0
+ pxor $twres,@tweak[3]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesenc $rndkey0,$inout1
+ pxor $twres,@tweak[4]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ movdqa $twmask,`16*5`(%rsp)
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_enc_loop6
+.align 32
+.Lxts_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey -64($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -80($key,%rax),$rndkey0
+ jnz .Lxts_enc_loop6
+
+ movdqa (%r8),$twmask # start calculating next tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
+ aesenc $rndkey1,$inout1
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[0],@tweak[1] # copy round[0]
+ aesenc $rndkey1,$inout5
+ $movkey -64($key),$rndkey1
+
+ movdqa $twres,$twtmp
+ aesenc $rndkey0,$inout0
+ paddd $twres,$twres
+ pxor @tweak[5],@tweak[0]
+ aesenc $rndkey0,$inout1
+ psrad \$31,$twtmp
+ paddq @tweak[5],@tweak[5]
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ pand $twmask,$twtmp
+ movaps @tweak[1],@tweak[2]
+ aesenc $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movdqa $twres,$twtmp
+ aesenc $rndkey0,$inout5
+ $movkey -48($key),$rndkey0
+
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
+ aesenc $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ movdqa @tweak[3],`16*3`(%rsp)
+ pxor $twtmp,@tweak[5]
+ aesenc $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
+ movdqa $twres,$twtmp
+ aesenc $rndkey1,$inout5
+ $movkey -32($key),$rndkey1
+
+ paddd $twres,$twres
+ aesenc $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesenc $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[3],@tweak[4]
+ aesenc $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesenc $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ aesenclast `16*0`(%rsp),$inout0
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ aesenclast `16*1`(%rsp),$inout1
+ aesenclast `16*2`(%rsp),$inout2
+ pand $twmask,$twres
+ mov %r10,%rax # restore $rounds
+ aesenclast `16*3`(%rsp),$inout3
+ aesenclast `16*4`(%rsp),$inout4
+ aesenclast `16*5`(%rsp),$inout5
+ pxor $twres,@tweak[5]
+
+ lea `16*6`($out),$out # $out+=6*16
+ movups $inout0,`-16*6`($out) # store 6 output blocks
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
+ sub \$16*6,$len
+ jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
+
+ mov \$16+96,$rounds
+ sub $rnds_,$rounds
+ mov $key_,$key # restore $key
+ shr \$4,$rounds # restore original value
+
+.Lxts_enc_short:
+ # at the point @tweak[0..5] are populated with tweak values
+ mov $rounds,$rnds_ # backup $rounds
+ pxor $rndkey0,@tweak[0]
+ add \$16*6,$len # restore real remaining $len
+ jz .Lxts_enc_done # done if ($len==0)
+
+ pxor $rndkey0,@tweak[1]
+ cmp \$0x20,$len
+ jb .Lxts_enc_one # $len is 1*16
+ pxor $rndkey0,@tweak[2]
+ je .Lxts_enc_two # $len is 2*16
+
+ pxor $rndkey0,@tweak[3]
+ cmp \$0x40,$len
+ jb .Lxts_enc_three # $len is 3*16
+ pxor $rndkey0,@tweak[4]
+ je .Lxts_enc_four # $len is 4*16
+
+ movdqu ($inp),$inout0 # $len is 5*16
+ movdqu 16*1($inp),$inout1
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp # $inp+=5*16
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+ pxor $inout5,$inout5
+
+ call _aesni_encrypt6
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[5],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out) # store 5 output blocks
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out # $out+=5*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp # inp+=1*16
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out) # store one output block
+ lea 16*1($out),$out # $out+=1*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp # $inp+=2*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_encrypt2
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movups $inout0,($out) # store 2 output blocks
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out # $out+=2*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp # $inp+=3*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_encrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movups $inout0,($out) # store 3 output blocks
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out # $out+=3*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp # $inp+=4*16
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_encrypt4
+
+ pxor @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ pxor @tweak[1],$inout1
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out) # store 4 output blocks
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ lea 16*4($out),$out # $out+=4*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+ and \$15,$len_ # see if $len%16 is 0
+ jz .Lxts_enc_ret
+ mov $len_,$len
+
+.Lxts_enc_steal:
+ movzb ($inp),%eax # borrow $rounds ...
+ movzb -16($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,-16($out)
+ mov %cl,0($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_enc_steal
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups -16($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,-16($out)
+
+.Lxts_enc_ret:
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps -0xa0(%rbp),%xmm6
+ movaps %xmm0,-0xa0(%rbp) # clear stack
+ movaps -0x90(%rbp),%xmm7
+ movaps %xmm0,-0x90(%rbp)
+ movaps -0x80(%rbp),%xmm8
+ movaps %xmm0,-0x80(%rbp)
+ movaps -0x70(%rbp),%xmm9
+ movaps %xmm0,-0x70(%rbp)
+ movaps -0x60(%rbp),%xmm10
+ movaps %xmm0,-0x60(%rbp)
+ movaps -0x50(%rbp),%xmm11
+ movaps %xmm0,-0x50(%rbp)
+ movaps -0x40(%rbp),%xmm12
+ movaps %xmm0,-0x40(%rbp)
+ movaps -0x30(%rbp),%xmm13
+ movaps %xmm0,-0x30(%rbp)
+ movaps -0x20(%rbp),%xmm14
+ movaps %xmm0,-0x20(%rbp)
+ movaps -0x10(%rbp),%xmm15
+ movaps %xmm0,-0x10(%rbp)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+___
+$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
+.Lxts_enc_epilogue:
+ ret
+.size aesni_xts_encrypt,.-aesni_xts_encrypt
+___
+
+$code.=<<___;
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,\@function,6
+.align 16
+aesni_xts_decrypt:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8(%rax) # offload everything
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ lea -8(%rax),%rbp
+ movups ($ivp),$inout0 # load clear-text tweak
+ mov 240($key2),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,$inout0);
+$code.=<<___;
+ xor %eax,%eax # if ($len%16) len-=16;
+ test \$15,$len
+ setnz %al
+ shl \$4,%rax
+ sub %rax,$len
+
+ $movkey ($key),$rndkey0 # zero round key
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+
+ movdqa .Lxts_magic(%rip),$twmask
+ movdqa $inout0,@tweak[5]
+ pshufd \$0x5f,$inout0,$twres
+ pxor $rndkey0,$rndkey1
+___
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ movdqa @tweak[5],@tweak[$i]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
+___
+ }
+$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
+ sub \$16*6,$len
+ jc .Lxts_dec_short # if $len-=6*16 borrowed
+
+ mov \$16+96,$rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ sub %r10,%rax # twisted $rounds
+ $movkey 16($key_),$rndkey1
+ mov %rax,%r10 # backup twisted $rounds
+ lea .Lxts_magic(%rip),%r8
+ jmp .Lxts_dec_grandloop
+
+.align 32
+.Lxts_dec_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqa $rndkey0,$twmask
+ movdqu `16*1`($inp),$inout1
+ pxor @tweak[0],$inout0 # intput^=tweak^round[0]
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[1],$inout1
+ aesdec $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[2],$inout2
+ aesdec $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[3],$inout3
+ aesdec $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
+ pxor @tweak[4],$inout4
+ aesdec $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
+
+ pxor $twres,@tweak[0] # calclulate tweaks^round[last]
+ aesdec $rndkey1,$inout4
+ pxor $twres,@tweak[1]
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
+ aesdec $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $twres,@tweak[2]
+
+ aesdec $rndkey0,$inout0
+ pxor $twres,@tweak[3]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesdec $rndkey0,$inout1
+ pxor $twres,@tweak[4]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ movdqa $twmask,`16*5`(%rsp)
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_dec_loop6
+.align 32
+.Lxts_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey -64($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey -80($key,%rax),$rndkey0
+ jnz .Lxts_dec_loop6
+
+ movdqa (%r8),$twmask # start calculating next tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
+ aesdec $rndkey1,$inout1
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[0],@tweak[1] # copy round[0]
+ aesdec $rndkey1,$inout5
+ $movkey -64($key),$rndkey1
+
+ movdqa $twres,$twtmp
+ aesdec $rndkey0,$inout0
+ paddd $twres,$twres
+ pxor @tweak[5],@tweak[0]
+ aesdec $rndkey0,$inout1
+ psrad \$31,$twtmp
+ paddq @tweak[5],@tweak[5]
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ pand $twmask,$twtmp
+ movaps @tweak[1],@tweak[2]
+ aesdec $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movdqa $twres,$twtmp
+ aesdec $rndkey0,$inout5
+ $movkey -48($key),$rndkey0
+
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
+ aesdec $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ movdqa @tweak[3],`16*3`(%rsp)
+ pxor $twtmp,@tweak[5]
+ aesdec $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
+ movdqa $twres,$twtmp
+ aesdec $rndkey1,$inout5
+ $movkey -32($key),$rndkey1
+
+ paddd $twres,$twres
+ aesdec $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesdec $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[3],@tweak[4]
+ aesdec $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesdec $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ aesdeclast `16*0`(%rsp),$inout0
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ aesdeclast `16*1`(%rsp),$inout1
+ aesdeclast `16*2`(%rsp),$inout2
+ pand $twmask,$twres
+ mov %r10,%rax # restore $rounds
+ aesdeclast `16*3`(%rsp),$inout3
+ aesdeclast `16*4`(%rsp),$inout4
+ aesdeclast `16*5`(%rsp),$inout5
+ pxor $twres,@tweak[5]
+
+ lea `16*6`($out),$out # $out+=6*16
+ movups $inout0,`-16*6`($out) # store 6 output blocks
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
+ sub \$16*6,$len
+ jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
+
+ mov \$16+96,$rounds
+ sub $rnds_,$rounds
+ mov $key_,$key # restore $key
+ shr \$4,$rounds # restore original value
+
+.Lxts_dec_short:
+ # at the point @tweak[0..5] are populated with tweak values
+ mov $rounds,$rnds_ # backup $rounds
+ pxor $rndkey0,@tweak[0]
+ pxor $rndkey0,@tweak[1]
+ add \$16*6,$len # restore real remaining $len
+ jz .Lxts_dec_done # done if ($len==0)
+
+ pxor $rndkey0,@tweak[2]
+ cmp \$0x20,$len
+ jb .Lxts_dec_one # $len is 1*16
+ pxor $rndkey0,@tweak[3]
+ je .Lxts_dec_two # $len is 2*16
+
+ pxor $rndkey0,@tweak[4]
+ cmp \$0x40,$len
+ jb .Lxts_dec_three # $len is 3*16
+ je .Lxts_dec_four # $len is 4*16
+
+ movdqu ($inp),$inout0 # $len is 5*16
+ movdqu 16*1($inp),$inout1
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp # $inp+=5*16
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+
+ call _aesni_decrypt6
+
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out) # store 5 output blocks
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ pxor $twtmp,$twtmp
+ movdqu $inout3,16*3($out)
+ pcmpgtd @tweak[5],$twtmp
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out # $out+=5*16
+ pshufd \$0x13,$twtmp,@tweak[1] # $twres
+ and \$15,$len_
+ jz .Lxts_dec_ret
+
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,@tweak[1] # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ jmp .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp # $inp+=1*16
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out) # store one output block
+ movdqa @tweak[2],@tweak[1]
+ lea 16*1($out),$out # $out+=1*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp # $inp+=2*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_decrypt2
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[3],@tweak[1]
+ movups $inout0,($out) # store 2 output blocks
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out # $out+=2*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp # $inp+=3*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_decrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[4],@tweak[1]
+ xorps @tweak[2],$inout2
+ movups $inout0,($out) # store 3 output blocks
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out # $out+=3*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp # $inp+=4*16
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_decrypt4
+
+ pxor @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ pxor @tweak[1],$inout1
+ movdqa @tweak[5],@tweak[1]
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out) # store 4 output blocks
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ lea 16*4($out),$out # $out+=4*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+ and \$15,$len_ # see if $len%16 is 0
+ jz .Lxts_dec_ret
+.Lxts_dec_done2:
+ mov $len_,$len
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($inp),$inout0
+ xorps @tweak[1],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[1],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_steal:
+ movzb 16($inp),%eax # borrow $rounds ...
+ movzb ($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,($out)
+ mov %cl,16($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_dec_steal
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_ret:
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps -0xa0(%rbp),%xmm6
+ movaps %xmm0,-0xa0(%rbp) # clear stack
+ movaps -0x90(%rbp),%xmm7
+ movaps %xmm0,-0x90(%rbp)
+ movaps -0x80(%rbp),%xmm8
+ movaps %xmm0,-0x80(%rbp)
+ movaps -0x70(%rbp),%xmm9
+ movaps %xmm0,-0x70(%rbp)
+ movaps -0x60(%rbp),%xmm10
+ movaps %xmm0,-0x60(%rbp)
+ movaps -0x50(%rbp),%xmm11
+ movaps %xmm0,-0x50(%rbp)
+ movaps -0x40(%rbp),%xmm12
+ movaps %xmm0,-0x40(%rbp)
+ movaps -0x30(%rbp),%xmm13
+ movaps %xmm0,-0x30(%rbp)
+ movaps -0x20(%rbp),%xmm14
+ movaps %xmm0,-0x20(%rbp)
+ movaps -0x10(%rbp),%xmm15
+ movaps %xmm0,-0x10(%rbp)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+___
+$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
+.Lxts_dec_epilogue:
+ ret
+.size aesni_xts_decrypt,.-aesni_xts_decrypt
+___
+}
+
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+# const AES_KEY *key, unsigned int start_block_num,
+# unsigned char offset_i[16], const unsigned char L_[][16],
+# unsigned char checksum[16]);
+#
+{
+my @offset=map("%xmm$_",(10..15));
+my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
+my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
+my ($L_p,$checksum_p) = ("%rbx","%rbp");
+my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
+my $seventh_arg = $win64 ? 56 : 8;
+my $blocks = $len;
+
+$code.=<<___;
+.globl aesni_ocb_encrypt
+.type aesni_ocb_encrypt,\@function,6
+.align 32
+aesni_ocb_encrypt:
+ lea (%rsp),%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_enc_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+ mov $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+ mov 240($key),$rnds_
+ mov $key,$key_
+ shl \$4,$rnds_
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds
+ lea 32($key_,$rnds_),$key
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_enc_odd
+
+ bsf $block_num,$i1
+ add \$1,$block_num
+ shl \$4,$i1
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,($out)
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_enc_done
+
+.Locb_enc_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_enc_short
+ jmp .Locb_enc_grandloop
+
+.align 32
+.Locb_enc_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_encrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_enc_grandloop
+
+.Locb_enc_short:
+ add \$6,$blocks
+ jz .Locb_enc_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_enc_one
+ movdqu `16*1`($inp),$inout1
+ je .Locb_enc_two
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_enc_three
+ movdqu `16*3`($inp),$inout3
+ je .Locb_enc_four
+
+ movdqu `16*4`($inp),$inout4
+ pxor $inout5,$inout5
+
+ call __ocb_encrypt6
+
+ movdqa @offset[4],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,`16*0`($out)
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_two:
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+
+ call __ocb_encrypt4
+
+ movdqa @offset[1],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_three:
+ pxor $inout3,$inout3
+
+ call __ocb_encrypt4
+
+ movdqa @offset[2],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_four:
+ call __ocb_encrypt4
+
+ movdqa @offset[3],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+
+.Locb_enc_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax
+.Locb_enc_pop:
+ lea 0xa0(%rsp),%rsp
+___
+$code.=<<___;
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+.Locb_enc_epilogue:
+ ret
+.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type __ocb_encrypt6,\@abi-omnipotent
+.align 32
+__ocb_encrypt6:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ movdqa @offset[0],@offset[4]
+ pxor @offset[5],@offset[0]
+ movdqu ($L_p,$i5),@offset[5]
+ pxor @offset[0],@offset[1]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor $inout4,$checksum
+ pxor @offset[4],$inout4
+ pxor $inout5,$checksum
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ aesenc $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesenc $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,@offset[5]
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_enc_loop6
+
+.align 32
+.Locb_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop6
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+ shl \$4,$i5
+
+ aesenclast @offset[0],$inout0
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ aesenclast @offset[4],$inout4
+ aesenclast @offset[5],$inout5
+ ret
+.size __ocb_encrypt6,.-__ocb_encrypt6
+
+.type __ocb_encrypt4,\@abi-omnipotent
+.align 32
+__ocb_encrypt4:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ pxor @offset[5],@offset[0]
+ pxor @offset[0],@offset[1]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_enc_loop4
+
+.align 32
+.Locb_enc_loop4:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop4
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast @offset[0],$inout0
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ ret
+.size __ocb_encrypt4,.-__ocb_encrypt4
+
+.type __ocb_encrypt1,\@abi-omnipotent
+.align 32
+__ocb_encrypt1:
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0
+
+ aesenc $rndkey1,$inout0
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesenc $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_enc_loop1
+
+.align 32
+.Locb_enc_loop1:
+ aesenc $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop1
+
+ aesenc $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast $inout5,$inout0
+ ret
+.size __ocb_encrypt1,.-__ocb_encrypt1
+
+.globl aesni_ocb_decrypt
+.type aesni_ocb_decrypt,\@function,6
+.align 32
+aesni_ocb_decrypt:
+ lea (%rsp),%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_dec_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+ mov $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+ mov 240($key),$rnds_
+ mov $key,$key_
+ shl \$4,$rnds_
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds
+ lea 32($key_,$rnds_),$key
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_dec_odd
+
+ bsf $block_num,$i1
+ add \$1,$block_num
+ shl \$4,$i1
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,($out)
+ xorps $inout0,$checksum # accumulate checksum
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_dec_done
+
+.Locb_dec_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_dec_short
+ jmp .Locb_dec_grandloop
+
+.align 32
+.Locb_dec_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_decrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+ movups $inout5,`16*5`($out)
+ pxor $inout5,$checksum
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_dec_grandloop
+
+.Locb_dec_short:
+ add \$6,$blocks
+ jz .Locb_dec_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_dec_one
+ movdqu `16*1`($inp),$inout1
+ je .Locb_dec_two
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_dec_three
+ movdqu `16*3`($inp),$inout3
+ je .Locb_dec_four
+
+ movdqu `16*4`($inp),$inout4
+ pxor $inout5,$inout5
+
+ call __ocb_decrypt6
+
+ movdqa @offset[4],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_two:
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+
+ call __ocb_decrypt4
+
+ movdqa @offset[1],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_three:
+ pxor $inout3,$inout3
+
+ call __ocb_decrypt4
+
+ movdqa @offset[2],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ xorps $inout2,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_four:
+ call __ocb_decrypt4
+
+ movdqa @offset[3],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+
+.Locb_dec_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax
+.Locb_dec_pop:
+ lea 0xa0(%rsp),%rsp
+___
+$code.=<<___;
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+.Locb_dec_epilogue:
+ ret
+.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type __ocb_decrypt6,\@abi-omnipotent
+.align 32
+__ocb_decrypt6:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ movdqa @offset[0],@offset[4]
+ pxor @offset[5],@offset[0]
+ movdqu ($L_p,$i5),@offset[5]
+ pxor @offset[0],@offset[1]
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor @offset[4],$inout4
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesdec $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,@offset[5]
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_dec_loop6
+
+.align 32
+.Locb_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop6
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+ shl \$4,$i5
+
+ aesdeclast @offset[0],$inout0
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ aesdeclast @offset[4],$inout4
+ aesdeclast @offset[5],$inout5
+ ret
+.size __ocb_decrypt6,.-__ocb_decrypt6
+
+.type __ocb_decrypt4,\@abi-omnipotent
+.align 32
+__ocb_decrypt4:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ pxor @offset[5],@offset[0]
+ pxor @offset[0],@offset[1]
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop4
+
+.align 32
+.Locb_dec_loop4:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop4
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast @offset[0],$inout0
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ ret
+.size __ocb_decrypt4,.-__ocb_decrypt4
+
+.type __ocb_decrypt1,\@abi-omnipotent
+.align 32
+__ocb_decrypt1:
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0
+
+ aesdec $rndkey1,$inout0
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesdec $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop1
+
+.align 32
+.Locb_dec_loop1:
+ aesdec $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop1
+
+ aesdec $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast $inout5,$inout0
+ ret
+.size __ocb_decrypt1,.-__ocb_decrypt1
+___
+} }}
+
+########################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+{
+my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
+my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
+my $inp_=$key_;
+
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+ test $len,$len # check length
+ jz .Lcbc_ret
+
+ mov 240($key),$rnds_ # key->rounds
+ mov $key,$key_ # backup $key
+ test %r9d,%r9d # 6th argument
+ jz .Lcbc_decrypt
+#--------------------------- CBC ENCRYPT ------------------------------#
+ movups ($ivp),$inout0 # load iv as initial state
+ mov $rnds_,$rounds
+ cmp \$16,$len
+ jb .Lcbc_enc_tail
+ sub \$16,$len
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups ($inp),$inout1 # load input
+ lea 16($inp),$inp
+ #xorps $inout1,$inout0
+___
+ &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
+$code.=<<___;
+ mov $rnds_,$rounds # restore $rounds
+ mov $key_,$key # restore $key
+ movups $inout0,0($out) # store output
+ lea 16($out),$out
+ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ add \$16,$len
+ jnz .Lcbc_enc_tail
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movups $inout0,($ivp)
+ pxor $inout0,$inout0
+ pxor $inout1,$inout1
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
+ mov $len,%rcx # zaps $key
+ xchg $inp,$out # $inp is %rsi and $out is %rdi now
+ .long 0x9066A4F3 # rep movsb
+ mov \$16,%ecx # zero tail
+ sub $len,%rcx
+ xor %eax,%eax
+ .long 0x9066AAF3 # rep stosb
+ lea -16(%rdi),%rdi # rewind $out by 1 block
+ mov $rnds_,$rounds # restore $rounds
+ mov %rdi,%rsi # $inp and $out are the same
+ mov $key_,$key # restore $key
+ xor $len,$len # len=16
+ jmp .Lcbc_enc_loop # one more spin
+ #--------------------------- CBC DECRYPT ------------------------------#
+.align 16
+.Lcbc_decrypt:
+ cmp \$16,$len
+ jne .Lcbc_decrypt_bulk
+
+ # handle single block without allocating stack frame,
+ # useful in ciphertext stealing mode
+ movdqu ($inp),$inout0 # load input
+ movdqu ($ivp),$inout1 # load iv
+ movdqa $inout0,$inout2 # future iv
+___
+ &aesni_generate1("dec",$key,$rnds_);
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movdqu $inout2,($ivp) # store iv
+ xorps $inout1,$inout0 # ^=iv
+ pxor $inout1,$inout1
+ movups $inout0,($out) # store output
+ pxor $inout0,$inout0
+ jmp .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lcbc_decrypt_body:
+___
+$code.=<<___;
+ lea -8(%rax),%rbp
+ movups ($ivp),$iv
+ mov $rnds_,$rounds
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_tail
+
+ $movkey ($key),$rndkey0
+ movdqu 0x00($inp),$inout0 # load input
+ movdqu 0x10($inp),$inout1
+ movdqa $inout0,$in0
+ movdqu 0x20($inp),$inout2
+ movdqa $inout1,$in1
+ movdqu 0x30($inp),$inout3
+ movdqa $inout2,$in2
+ movdqu 0x40($inp),$inout4
+ movdqa $inout3,$in3
+ movdqu 0x50($inp),$inout5
+ movdqa $inout4,$in4
+ mov OPENSSL_ia32cap_P+4(%rip),%r9d
+ cmp \$0x70,$len
+ jbe .Lcbc_dec_six_or_seven
+
+ and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
+ sub \$0x50,$len # $len is biased by -5*16
+ cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
+ je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
+ sub \$0x20,$len # $len is biased by -7*16
+ lea 0x70($key),$key # size optimization
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movups $inout7,($out)
+ lea 0x10($out),$out
+.Lcbc_dec_loop8_enter:
+ movdqu 0x60($inp),$inout6
+ pxor $rndkey0,$inout0
+ movdqu 0x70($inp),$inout7
+ pxor $rndkey0,$inout1
+ $movkey 0x10-0x70($key),$rndkey1
+ pxor $rndkey0,$inout2
+ xor $inp_,$inp_
+ cmp \$0x70,$len # is there at least 0x60 bytes ahead?
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ pxor $rndkey0,$inout5
+ pxor $rndkey0,$inout6
+
+ aesdec $rndkey1,$inout0
+ pxor $rndkey0,$inout7
+ $movkey 0x20-0x70($key),$rndkey0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ aesdec $rndkey1,$inout6
+ setnc ${inp_}b
+ shl \$7,$inp_
+ aesdec $rndkey1,$inout7
+ add $inp,$inp_
+ $movkey 0x30-0x70($key),$rndkey1
+___
+for($i=1;$i<12;$i++) {
+my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
+$code.=<<___ if ($i==7);
+ cmp \$11,$rounds
+___
+$code.=<<___;
+ aesdec $rndkeyx,$inout0
+ aesdec $rndkeyx,$inout1
+ aesdec $rndkeyx,$inout2
+ aesdec $rndkeyx,$inout3
+ aesdec $rndkeyx,$inout4
+ aesdec $rndkeyx,$inout5
+ aesdec $rndkeyx,$inout6
+ aesdec $rndkeyx,$inout7
+ $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
+___
+$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
+ nop
+___
+$code.=<<___ if ($i==7);
+ jb .Lcbc_dec_done
+___
+$code.=<<___ if ($i==9);
+ je .Lcbc_dec_done
+___
+$code.=<<___ if ($i==11);
+ jmp .Lcbc_dec_done
+___
+}
+$code.=<<___;
+.align 16
+.Lcbc_dec_done:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$iv
+ pxor $rndkey0,$in0
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$in1
+ pxor $rndkey0,$in2
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ pxor $rndkey0,$in3
+ pxor $rndkey0,$in4
+ aesdec $rndkey1,$inout6
+ aesdec $rndkey1,$inout7
+ movdqu 0x50($inp),$rndkey1
+
+ aesdeclast $iv,$inout0
+ movdqu 0x60($inp),$iv # borrow $iv
+ pxor $rndkey0,$rndkey1
+ aesdeclast $in0,$inout1
+ pxor $rndkey0,$iv
+ movdqu 0x70($inp),$rndkey0 # next IV
+ aesdeclast $in1,$inout2
+ lea 0x80($inp),$inp
+ movdqu 0x00($inp_),$in0
+ aesdeclast $in2,$inout3
+ aesdeclast $in3,$inout4
+ movdqu 0x10($inp_),$in1
+ movdqu 0x20($inp_),$in2
+ aesdeclast $in4,$inout5
+ aesdeclast $rndkey1,$inout6
+ movdqu 0x30($inp_),$in3
+ movdqu 0x40($inp_),$in4
+ aesdeclast $iv,$inout7
+ movdqa $rndkey0,$iv # return $iv
+ movdqu 0x50($inp_),$rndkey1
+ $movkey -0x70($key),$rndkey0
+
+ movups $inout0,($out) # store output
+ movdqa $in0,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in1,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in2,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in3,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in4,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey1,$inout5
+ movups $inout6,0x60($out)
+ lea 0x70($out),$out
+
+ sub \$0x80,$len
+ ja .Lcbc_dec_loop8
+
+ movaps $inout7,$inout0
+ lea -0x70($key),$key
+ add \$0x70,$len
+ jle .Lcbc_dec_clear_tail_collected
+ movups $inout7,($out)
+ lea 0x10($out),$out
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_tail
+
+ movaps $in0,$inout0
+.Lcbc_dec_six_or_seven:
+ cmp \$0x60,$len
+ ja .Lcbc_dec_seven
+
+ movaps $inout5,$inout6
+ call _aesni_decrypt6
+ pxor $iv,$inout0 # ^= IV
+ movaps $inout6,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ pxor $inout4,$inout4
+ lea 0x50($out),$out
+ movdqa $inout5,$inout0
+ pxor $inout5,$inout5
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_seven:
+ movups 0x60($inp),$inout6
+ xorps $inout7,$inout7
+ call _aesni_decrypt8
+ movups 0x50($inp),$inout7
+ pxor $iv,$inout0 # ^= IV
+ movups 0x60($inp),$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ pxor $inout4,$inout4
+ pxor $inout7,$inout6
+ movdqu $inout5,0x50($out)
+ pxor $inout5,$inout5
+ lea 0x60($out),$out
+ movdqa $inout6,$inout0
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_loop6:
+ movups $inout5,($out)
+ lea 0x10($out),$out
+ movdqu 0x00($inp),$inout0 # load input
+ movdqu 0x10($inp),$inout1
+ movdqa $inout0,$in0
+ movdqu 0x20($inp),$inout2
+ movdqa $inout1,$in1
+ movdqu 0x30($inp),$inout3
+ movdqa $inout2,$in2
+ movdqu 0x40($inp),$inout4
+ movdqa $inout3,$in3
+ movdqu 0x50($inp),$inout5
+ movdqa $inout4,$in4
+.Lcbc_dec_loop6_enter:
+ lea 0x60($inp),$inp
+ movdqa $inout5,$inout6
+
+ call _aesni_decrypt6
+
+ pxor $iv,$inout0 # ^= IV
+ movdqa $inout6,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout4
+ mov $key_,$key
+ movdqu $inout3,0x30($out)
+ pxor $in4,$inout5
+ mov $rnds_,$rounds
+ movdqu $inout4,0x40($out)
+ lea 0x50($out),$out
+ sub \$0x60,$len
+ ja .Lcbc_dec_loop6
+
+ movdqa $inout5,$inout0
+ add \$0x50,$len
+ jle .Lcbc_dec_clear_tail_collected
+ movups $inout5,($out)
+ lea 0x10($out),$out
+
+.Lcbc_dec_tail:
+ movups ($inp),$inout0
+ sub \$0x10,$len
+ jbe .Lcbc_dec_one # $len is 1*16 or less
+
+ movups 0x10($inp),$inout1
+ movaps $inout0,$in0
+ sub \$0x10,$len
+ jbe .Lcbc_dec_two # $len is 2*16 or less
+
+ movups 0x20($inp),$inout2
+ movaps $inout1,$in1
+ sub \$0x10,$len
+ jbe .Lcbc_dec_three # $len is 3*16 or less
+
+ movups 0x30($inp),$inout3
+ movaps $inout2,$in2
+ sub \$0x10,$len
+ jbe .Lcbc_dec_four # $len is 4*16 or less
+
+ movups 0x40($inp),$inout4 # $len is 5*16 or less
+ movaps $inout3,$in3
+ movaps $inout4,$in4
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ pxor $iv,$inout0
+ movaps $in4,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ lea 0x40($out),$out
+ movdqa $inout4,$inout0
+ pxor $inout4,$inout4
+ pxor $inout5,$inout5
+ sub \$0x10,$len
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_one:
+ movaps $inout0,$in0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps $iv,$inout0
+ movaps $in0,$iv
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ movaps $inout1,$in1
+ call _aesni_decrypt2
+ pxor $iv,$inout0
+ movaps $in1,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ movdqa $inout1,$inout0
+ pxor $inout1,$inout1 # clear register bank
+ lea 0x10($out),$out
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ movaps $inout2,$in2
+ call _aesni_decrypt3
+ pxor $iv,$inout0
+ movaps $in2,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ movdqa $inout2,$inout0
+ pxor $inout2,$inout2
+ lea 0x20($out),$out
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ movaps $inout3,$in3
+ call _aesni_decrypt4
+ pxor $iv,$inout0
+ movaps $in3,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movdqa $inout3,$inout0
+ pxor $inout3,$inout3
+ lea 0x30($out),$out
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_clear_tail_collected:
+ pxor $inout1,$inout1 # clear register bank
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+___
+$code.=<<___ if (!$win64);
+ pxor $inout4,$inout4 # %xmm6..9
+ pxor $inout5,$inout5
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
+___
+$code.=<<___;
+.Lcbc_dec_tail_collected:
+ movups $iv,($ivp)
+ and \$15,$len
+ jnz .Lcbc_dec_tail_partial
+ movups $inout0,($out)
+ pxor $inout0,$inout0
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps $inout0,(%rsp)
+ pxor $inout0,$inout0
+ mov \$16,%rcx
+ mov $out,%rdi
+ sub $len,%rcx
+ lea (%rsp),%rsi
+ .long 0x9066A4F3 # rep movsb
+ movdqa $inout0,(%rsp)
+
+.Lcbc_dec_ret:
+ xorps $rndkey0,$rndkey0 # %xmm0
+ pxor $rndkey1,$rndkey1
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps %xmm0,0x10(%rsp) # clear stack
+ movaps 0x20(%rsp),%xmm7
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm8
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm9
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm10
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm11
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm12
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm13
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm14
+ movaps %xmm0,0x90(%rsp)
+ movaps 0xa0(%rsp),%xmm15
+ movaps %xmm0,0xa0(%rsp)
+___
+$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
+.Lcbc_ret:
+ ret
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
+# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
+# int bits, AES_KEY *key)
+#
+# input: $inp user-supplied key
+# $bits $inp length in bits
+# $key pointer to key schedule
+# output: %eax 0 denoting success, -1 or -2 - failure (see C)
+# *$key key schedule
+#
+{ my ($inp,$bits,$key) = @_4args;
+ $bits =~ s/%r/%e/;
+
+$code.=<<___;
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_decrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ call __aesni_set_encrypt_key
+ shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
+ test %eax,%eax
+ jnz .Ldec_key_ret
+ lea 16($key,$bits),$inp # points at the end of key schedule
+
+ $movkey ($key),%xmm0 # just swap
+ $movkey ($inp),%xmm1
+ $movkey %xmm0,($inp)
+ $movkey %xmm1,($key)
+ lea 16($key),$key
+ lea -16($inp),$inp
+
+.Ldec_key_inverse:
+ $movkey ($key),%xmm0 # swap and inverse
+ $movkey ($inp),%xmm1
+ aesimc %xmm0,%xmm0
+ aesimc %xmm1,%xmm1
+ lea 16($key),$key
+ lea -16($inp),$inp
+ $movkey %xmm0,16($inp)
+ $movkey %xmm1,-16($key)
+ cmp $key,$inp
+ ja .Ldec_key_inverse
+
+ $movkey ($key),%xmm0 # inverse middle
+ aesimc %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ $movkey %xmm0,($inp)
+ pxor %xmm0,%xmm0
+.Ldec_key_ret:
+ add \$8,%rsp
+ ret
+.LSEH_end_set_decrypt_key:
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+___
+
+# This is based on submission by
+#
+# Huang Ying <ying.huang@intel.com>
+# Vinodh Gopal <vinodh.gopal@intel.com>
+# Kahraman Akdemir
+#
+# Aggressively optimized in respect to aeskeygenassist's critical path
+# and is contained in %xmm0-5 to meet Win64 ABI requirement.
+#
+# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
+# int bits, AES_KEY * const key);
+#
+# input: $inp user-supplied key
+# $bits $inp length in bits
+# $key pointer to key schedule
+# output: %eax 0 denoting success, -1 or -2 - failure (see C)
+# $bits rounds-1 (used in aesni_set_decrypt_key)
+# *$key key schedule
+# $key pointer to key schedule (used in
+# aesni_set_decrypt_key)
+#
+# Subroutine is frame-less, which means that only volatile registers
+# are used. Note that it's declared "abi-omnipotent", which means that
+# amount of volatile registers is smaller on Windows.
+#
+$code.=<<___;
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_encrypt_key:
+__aesni_set_encrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ mov \$-1,%rax
+ test $inp,$inp
+ jz .Lenc_key_ret
+ test $key,$key
+ jz .Lenc_key_ret
+
+ mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
+ movups ($inp),%xmm0 # pull first 128 bits of *userKey
+ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
+ and OPENSSL_ia32cap_P+4(%rip),%r10d
+ lea 16($key),%rax # %rax is used as modifiable copy of $key
+ cmp \$256,$bits
+ je .L14rounds
+ cmp \$192,$bits
+ je .L12rounds
+ cmp \$128,$bits
+ jne .Lbad_keybits
+
+.L10rounds:
+ mov \$9,$bits # 10 rounds for 128-bit key
+ cmp \$`1<<28`,%r10d # AVX, bit no XOP
+ je .L10rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
+ call .Lkey_expansion_128_cold
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
+ call .Lkey_expansion_128
+ $movkey %xmm0,(%rax)
+ mov $bits,80(%rax) # 240(%rdx)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ mov \$8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,($key)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+ lea 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ dec %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ mov $bits,96(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16($inp),%xmm2 # remaining 1/3 of *userKey
+ mov \$11,$bits # 12 rounds for 192
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L12rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
+ call .Lkey_expansion_192a_cold
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
+ call .Lkey_expansion_192b
+ $movkey %xmm0,(%rax)
+ mov $bits,48(%rax) # 240(%rdx)
+ xor %rax, %rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$8,%r10d
+ movdqu %xmm0,($key)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+ pslld \$1, %xmm4
+ lea 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd \$0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ dec %r10d
+ jnz .Loop_key192
+
+ mov $bits,32(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+ movups 16($inp),%xmm2 # remaning half of *userKey
+ mov \$13,$bits # 14 rounds for 256
+ lea 16(%rax),%rax
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L14rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ $movkey %xmm2,16($key) # round 1
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
+ call .Lkey_expansion_256a_cold
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
+ call .Lkey_expansion_256a
+ $movkey %xmm0,(%rax)
+ mov $bits,16(%rax) # 240(%rdx)
+ xor %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$7,%r10d
+ movdqu %xmm0,0($key)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16($key)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld \$1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ dec %r10d
+ jz .Ldone_key256
+
+ pshufd \$0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+ aesenclast %xmm3,%xmm2
+
+ movdqa %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ lea 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ mov $bits,16(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ mov \$-2,%rax
+.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ add \$8,%rsp
+ ret
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_192a:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2, %xmm5
+.Lkey_expansion_192b_warm:
+ shufps \$0b00010000,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ pslldq \$4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd \$0b01010101,%xmm1,%xmm1 # critical path
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd \$0b11111111,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps \$0b01000100,%xmm0,%xmm5
+ $movkey %xmm5,(%rax)
+ shufps \$0b01001110,%xmm2,%xmm3
+ $movkey %xmm3,16(%rax)
+ lea 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ $movkey %xmm2,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_256b:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+
+ shufps \$0b00010000,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10001100,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10101010,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm2
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+___
+}
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+ .long 6,6,6,0
+.Lincrement64:
+ .long 1,0,0,0
+.Lxts_magic:
+ .long 0x87,0,1,0
+.Lincrement1:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+ .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+ .long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+ .long 1,1,1,1
+.Lkey_rcon1b:
+ .long 0x1b,0x1b,0x1b,0x1b
+
+.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.type ecb_ccm64_se_handler,\@abi-omnipotent
+.align 16
+ecb_ccm64_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
+
+.type ctr_xts_se_handler,\@abi-omnipotent
+.align 16
+ctr_xts_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue lable
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 160($context),%rax # pull context->Rbp
+ lea -0xa0(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+ jmp .Lcommon_rbp_tail
+.size ctr_xts_se_handler,.-ctr_xts_se_handler
+
+.type ocb_se_handler,\@abi-omnipotent
+.align 16
+ocb_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue lable
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10
+ cmp %r10,%rbx # context->Rip>=pop label
+ jae .Locb_no_xmm
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea (%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xa0+0x28(%rax),%rax
+
+.Locb_no_xmm:
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+ jmp .Lcommon_seh_tail
+.size ocb_se_handler,.-ocb_se_handler
+___
+$code.=<<___;
+.type cbc_se_handler,\@abi-omnipotent
+.align 16
+cbc_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lcbc_decrypt_bulk(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lcommon_seh_tail
+
+ lea .Lcbc_decrypt_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<cbc_decrypt_body
+ jb .Lrestore_cbc_rax
+
+ lea .Lcbc_ret(%rip),%r10
+ cmp %r10,%rbx # context->Rip>="epilogue" label
+ jae .Lcommon_seh_tail
+
+ lea 16(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_rbp_tail:
+ mov 160($context),%rax # pull context->Rbp
+ mov (%rax),%rbp # restore saved %rbp
+ lea 8(%rax),%rax # adjust stack pointer
+ mov %rbp,160($context) # restore context->Rbp
+ jmp .Lcommon_seh_tail
+
+.Lrestore_cbc_rax:
+ mov 120($context),%rax
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size cbc_se_handler,.-cbc_se_handler
+
+.section .pdata
+.align 4
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+ .rva .LSEH_begin_aesni_ecb_encrypt
+ .rva .LSEH_end_aesni_ecb_encrypt
+ .rva .LSEH_info_ecb
+
+ .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_info_ccm64_enc
+
+ .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_info_ccm64_dec
+
+ .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_end_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_info_ctr32
+
+ .rva .LSEH_begin_aesni_xts_encrypt
+ .rva .LSEH_end_aesni_xts_encrypt
+ .rva .LSEH_info_xts_enc
+
+ .rva .LSEH_begin_aesni_xts_decrypt
+ .rva .LSEH_end_aesni_xts_decrypt
+ .rva .LSEH_info_xts_dec
+
+ .rva .LSEH_begin_aesni_ocb_encrypt
+ .rva .LSEH_end_aesni_ocb_encrypt
+ .rva .LSEH_info_ocb_enc
+
+ .rva .LSEH_begin_aesni_ocb_decrypt
+ .rva .LSEH_end_aesni_ocb_decrypt
+ .rva .LSEH_info_ocb_dec
+___
+$code.=<<___;
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_cbc
+
+ .rva ${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_set_decrypt_key
+ .rva .LSEH_info_key
+
+ .rva ${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_set_encrypt_key
+ .rva .LSEH_info_key
+.section .xdata
+.align 8
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.LSEH_info_ecb:
+ .byte 9,0,0,0
+ .rva ecb_ccm64_se_handler
+ .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
+.LSEH_info_ccm64_enc:
+ .byte 9,0,0,0
+ .rva ecb_ccm64_se_handler
+ .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
+.LSEH_info_ccm64_dec:
+ .byte 9,0,0,0
+ .rva ecb_ccm64_se_handler
+ .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
+.LSEH_info_ctr32:
+ .byte 9,0,0,0
+ .rva ctr_xts_se_handler
+ .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
+.LSEH_info_xts_enc:
+ .byte 9,0,0,0
+ .rva ctr_xts_se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.LSEH_info_xts_dec:
+ .byte 9,0,0,0
+ .rva ctr_xts_se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+.LSEH_info_ocb_enc:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
+ .rva .Locb_enc_pop
+ .long 0
+.LSEH_info_ocb_dec:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
+ .rva .Locb_dec_pop
+ .long 0
+___
+$code.=<<___;
+.LSEH_info_cbc:
+ .byte 9,0,0,0
+ .rva cbc_se_handler
+.LSEH_info_key:
+ .byte 0x01,0x04,0x01,0x00
+ .byte 0x04,0x02,0x00,0x00 # sub rsp,8
+___
+}
+
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ rex(\@opcode,$4,$3);
+ push @opcode,0x0f,0x3a,0xdf;
+ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
+ my $c=$2;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesimc" => 0xdb,
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ my $off = $2;
+ push @opcode,0x44 if ($3>=8);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
+ push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
+
+sub movbe {
+ ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
+$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aesp8-ppc.pl b/openssl-1.1.0h/crypto/aes/asm/aesp8-ppc.pl
new file mode 100755
index 0000000..b7e92f6
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aesp8-ppc.pl
@@ -0,0 +1,3805 @@
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for AES instructions as per PowerISA
+# specification version 2.07, first implemented by POWER8 processor.
+# The module is endian-agnostic in sense that it supports both big-
+# and little-endian cases. Data alignment in parallelizable modes is
+# handled with VSX loads and stores, which implies MSR.VSX flag being
+# set. It should also be noted that ISA specification doesn't prohibit
+# alignment exceptions for these instructions on page boundaries.
+# Initially alignment was handled in pure AltiVec/VMX way [when data
+# is aligned programmatically, which in turn guarantees exception-
+# free execution], but it turned to hamper performance when vcipher
+# instructions are interleaved. It's reckoned that eventual
+# misalignment penalties at page boundaries are in average lower
+# than additional overhead in pure AltiVec approach.
+#
+# May 2016
+#
+# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
+# systems were measured.
+#
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+# CBC en-/decrypt CTR XTS
+# POWER8[le] 3.96/0.72 0.74 1.1
+# POWER8[be] 3.75/0.65 0.66 1.0
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+ $UCMP ="cmpld";
+ $SHL ="sldi";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T =4;
+ $LRSAVE =$SIZE_T;
+ $STU ="stwu";
+ $POP ="lwz";
+ $PUSH ="stw";
+ $UCMP ="cmplw";
+ $SHL ="slwi";
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=8*$SIZE_T;
+$prefix="aes_p8";
+
+$sp="r1";
+$vrsave="r12";
+
+#########################################################################
+{{{ # Key setup procedures #
+my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
+my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
+my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
+
+$code.=<<___;
+.machine "any"
+
+.text
+
+.align 7
+rcon:
+.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev
+.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
+.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
+.long 0,0,0,0 ?asis
+Lconsts:
+ mflr r0
+ bcl 20,31,\$+4
+ mflr $ptr #vvvvv "distance between . and rcon
+ addi $ptr,$ptr,-0x48
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+
+.globl .${prefix}_set_encrypt_key
+.align 5
+.${prefix}_set_encrypt_key:
+Lset_encrypt_key:
+ mflr r11
+ $PUSH r11,$LRSAVE($sp)
+
+ li $ptr,-1
+ ${UCMP}i $inp,0
+ beq- Lenc_key_abort # if ($inp==0) return -1;
+ ${UCMP}i $out,0
+ beq- Lenc_key_abort # if ($out==0) return -1;
+ li $ptr,-2
+ cmpwi $bits,128
+ blt- Lenc_key_abort
+ cmpwi $bits,256
+ bgt- Lenc_key_abort
+ andi. r0,$bits,0x3f
+ bne- Lenc_key_abort
+
+ lis r0,0xfff0
+ mfspr $vrsave,256
+ mtspr 256,r0
+
+ bl Lconsts
+ mtlr r11
+
+ neg r9,$inp
+ lvx $in0,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ lvsr $key,0,r9 # borrow $key
+ li r8,0x20
+ cmpwi $bits,192
+ lvx $in1,0,$inp
+ le?vspltisb $mask,0x0f # borrow $mask
+ lvx $rcon,0,$ptr
+ le?vxor $key,$key,$mask # adjust for byte swap
+ lvx $mask,r8,$ptr
+ addi $ptr,$ptr,0x10
+ vperm $in0,$in0,$in1,$key # align [and byte swap in LE]
+ li $cnt,8
+ vxor $zero,$zero,$zero
+ mtctr $cnt
+
+ ?lvsr $outperm,0,$out
+ vspltisb $outmask,-1
+ lvx $outhead,0,$out
+ ?vperm $outmask,$zero,$outmask,$outperm
+
+ blt Loop128
+ addi $inp,$inp,8
+ beq L192
+ addi $inp,$inp,8
+ b L256
+
+.align 4
+Loop128:
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+ bdnz Loop128
+
+ lvx $rcon,0,$ptr # last two round keys
+
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+
+ vperm $key,$in0,$in0,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vxor $in0,$in0,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,0x50
+
+ li $rounds,10
+ b Ldone
+
+.align 4
+L192:
+ lvx $tmp,0,$inp
+ li $cnt,4
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $out,$out,16
+ vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
+ vspltisb $key,8 # borrow $key
+ mtctr $cnt
+ vsububm $mask,$mask,$key # adjust the mask
+
+Loop192:
+ vperm $key,$in1,$in1,$mask # roate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vcipherlast $key,$key,$rcon
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+
+ vsldoi $stage,$zero,$in1,8
+ vspltw $tmp,$in0,3
+ vxor $tmp,$tmp,$in1
+ vsldoi $in1,$zero,$in1,12 # >>32
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in1,$in1,$tmp
+ vxor $in0,$in0,$key
+ vxor $in1,$in1,$key
+ vsldoi $stage,$stage,$in0,8
+
+ vperm $key,$in1,$in1,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$stage,$stage,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vsldoi $stage,$in0,$in1,8
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vperm $outtail,$stage,$stage,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vspltw $tmp,$in0,3
+ vxor $tmp,$tmp,$in1
+ vsldoi $in1,$zero,$in1,12 # >>32
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in1,$in1,$tmp
+ vxor $in0,$in0,$key
+ vxor $in1,$in1,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,16
+ bdnz Loop192
+
+ li $rounds,12
+ addi $out,$out,0x20
+ b Ldone
+
+.align 4
+L256:
+ lvx $tmp,0,$inp
+ li $cnt,7
+ li $rounds,14
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $out,$out,16
+ vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
+ mtctr $cnt
+
+Loop256:
+ vperm $key,$in1,$in1,$mask # rotate-n-splat
+ vsldoi $tmp,$zero,$in0,12 # >>32
+ vperm $outtail,$in1,$in1,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ vcipherlast $key,$key,$rcon
+ stvx $stage,0,$out
+ addi $out,$out,16
+
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in0,$in0,$tmp
+ vadduwm $rcon,$rcon,$rcon
+ vxor $in0,$in0,$key
+ vperm $outtail,$in0,$in0,$outperm # rotate
+ vsel $stage,$outhead,$outtail,$outmask
+ vmr $outhead,$outtail
+ stvx $stage,0,$out
+ addi $inp,$out,15 # 15 is not typo
+ addi $out,$out,16
+ bdz Ldone
+
+ vspltw $key,$in0,3 # just splat
+ vsldoi $tmp,$zero,$in1,12 # >>32
+ vsbox $key,$key
+
+ vxor $in1,$in1,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in1,$in1,$tmp
+ vsldoi $tmp,$zero,$tmp,12 # >>32
+ vxor $in1,$in1,$tmp
+
+ vxor $in1,$in1,$key
+ b Loop256
+
+.align 4
+Ldone:
+ lvx $in1,0,$inp # redundant in aligned case
+ vsel $in1,$outhead,$in1,$outmask
+ stvx $in1,0,$inp
+ li $ptr,0
+ mtspr 256,$vrsave
+ stw $rounds,0($out)
+
+Lenc_key_abort:
+ mr r3,$ptr
+ blr
+ .long 0
+ .byte 0,12,0x14,1,0,0,3,0
+ .long 0
+.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
+
+.globl .${prefix}_set_decrypt_key
+.align 5
+.${prefix}_set_decrypt_key:
+ $STU $sp,-$FRAME($sp)
+ mflr r10
+ $PUSH r10,$FRAME+$LRSAVE($sp)
+ bl Lset_encrypt_key
+ mtlr r10
+
+ cmpwi r3,0
+ bne- Ldec_key_abort
+
+ slwi $cnt,$rounds,4
+ subi $inp,$out,240 # first round key
+ srwi $rounds,$rounds,1
+ add $out,$inp,$cnt # last round key
+ mtctr $rounds
+
+Ldeckey:
+ lwz r0, 0($inp)
+ lwz r6, 4($inp)
+ lwz r7, 8($inp)
+ lwz r8, 12($inp)
+ addi $inp,$inp,16
+ lwz r9, 0($out)
+ lwz r10,4($out)
+ lwz r11,8($out)
+ lwz r12,12($out)
+ stw r0, 0($out)
+ stw r6, 4($out)
+ stw r7, 8($out)
+ stw r8, 12($out)
+ subi $out,$out,16
+ stw r9, -16($inp)
+ stw r10,-12($inp)
+ stw r11,-8($inp)
+ stw r12,-4($inp)
+ bdnz Ldeckey
+
+ xor r3,r3,r3 # return value
+Ldec_key_abort:
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,0,3,0
+ .long 0
+.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
+___
+}}}
+#########################################################################
+{{{ # Single block en- and decrypt procedures #
+sub gen_block () {
+my $dir = shift;
+my $n = $dir eq "de" ? "n" : "";
+my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
+
+$code.=<<___;
+.globl .${prefix}_${dir}crypt
+.align 5
+.${prefix}_${dir}crypt:
+ lwz $rounds,240($key)
+ lis r0,0xfc00
+ mfspr $vrsave,256
+ li $idx,15 # 15 is not typo
+ mtspr 256,r0
+
+ lvx v0,0,$inp
+ neg r11,$out
+ lvx v1,$idx,$inp
+ lvsl v2,0,$inp # inpperm
+ le?vspltisb v4,0x0f
+ ?lvsl v3,0,r11 # outperm
+ le?vxor v2,v2,v4
+ li $idx,16
+ vperm v0,v0,v1,v2 # align [and byte swap in LE]
+ lvx v1,0,$key
+ ?lvsl v5,0,$key # keyperm
+ srwi $rounds,$rounds,1
+ lvx v2,$idx,$key
+ addi $idx,$idx,16
+ subi $rounds,$rounds,1
+ ?vperm v1,v1,v2,v5 # align round key
+
+ vxor v0,v0,v1
+ lvx v1,$idx,$key
+ addi $idx,$idx,16
+ mtctr $rounds
+
+Loop_${dir}c:
+ ?vperm v2,v2,v1,v5
+ v${n}cipher v0,v0,v2
+ lvx v2,$idx,$key
+ addi $idx,$idx,16
+ ?vperm v1,v1,v2,v5
+ v${n}cipher v0,v0,v1
+ lvx v1,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_${dir}c
+
+ ?vperm v2,v2,v1,v5
+ v${n}cipher v0,v0,v2
+ lvx v2,$idx,$key
+ ?vperm v1,v1,v2,v5
+ v${n}cipherlast v0,v0,v1
+
+ vspltisb v2,-1
+ vxor v1,v1,v1
+ li $idx,15 # 15 is not typo
+ ?vperm v2,v1,v2,v3 # outmask
+ le?vxor v3,v3,v4
+ lvx v1,0,$out # outhead
+ vperm v0,v0,v0,v3 # rotate [and byte swap in LE]
+ vsel v1,v1,v0,v2
+ lvx v4,$idx,$out
+ stvx v1,0,$out
+ vsel v0,v0,v4,v2
+ stvx v0,$idx,$out
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
+.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+#########################################################################
+{{{ # CBC en- and decrypt procedures #
+my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
+ map("v$_",(4..10));
+$code.=<<___;
+.globl .${prefix}_cbc_encrypt
+.align 5
+.${prefix}_cbc_encrypt:
+ ${UCMP}i $len,16
+ bltlr-
+
+ cmpwi $enc,0 # test direction
+ lis r0,0xffe0
+ mfspr $vrsave,256
+ mtspr 256,r0
+
+ li $idx,15
+ vxor $rndkey0,$rndkey0,$rndkey0
+ le?vspltisb $tmp,0x0f
+
+ lvx $ivec,0,$ivp # load [unaligned] iv
+ lvsl $inpperm,0,$ivp
+ lvx $inptail,$idx,$ivp
+ le?vxor $inpperm,$inpperm,$tmp
+ vperm $ivec,$ivec,$inptail,$inpperm
+
+ neg r11,$inp
+ ?lvsl $keyperm,0,$key # prepare for unaligned key
+ lwz $rounds,240($key)
+
+ lvsr $inpperm,0,r11 # prepare for unaligned load
+ lvx $inptail,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ le?vxor $inpperm,$inpperm,$tmp
+
+ ?lvsr $outperm,0,$out # prepare for unaligned store
+ vspltisb $outmask,-1
+ lvx $outhead,0,$out
+ ?vperm $outmask,$rndkey0,$outmask,$outperm
+ le?vxor $outperm,$outperm,$tmp
+
+ srwi $rounds,$rounds,1
+ li $idx,16
+ subi $rounds,$rounds,1
+ beq Lcbc_dec
+
+Lcbc_enc:
+ vmr $inout,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ mtctr $rounds
+ subi $len,$len,16 # len-=16
+
+ lvx $rndkey0,0,$key
+ vperm $inout,$inout,$inptail,$inpperm
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ vxor $inout,$inout,$ivec
+
+Loop_cbc_enc:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_cbc_enc
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipherlast $ivec,$inout,$rndkey0
+ ${UCMP}i $len,16
+
+ vperm $tmp,$ivec,$ivec,$outperm
+ vsel $inout,$outhead,$tmp,$outmask
+ vmr $outhead,$tmp
+ stvx $inout,0,$out
+ addi $out,$out,16
+ bge Lcbc_enc
+
+ b Lcbc_done
+
+.align 4
+Lcbc_dec:
+ ${UCMP}i $len,128
+ bge _aesp8_cbc_decrypt8x
+ vmr $tmp,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ mtctr $rounds
+ subi $len,$len,16 # len-=16
+
+ lvx $rndkey0,0,$key
+ vperm $tmp,$tmp,$inptail,$inpperm
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$tmp,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+
+Loop_cbc_dec:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vncipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_cbc_dec
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vncipherlast $inout,$inout,$rndkey0
+ ${UCMP}i $len,16
+
+ vxor $inout,$inout,$ivec
+ vmr $ivec,$tmp
+ vperm $tmp,$inout,$inout,$outperm
+ vsel $inout,$outhead,$tmp,$outmask
+ vmr $outhead,$tmp
+ stvx $inout,0,$out
+ addi $out,$out,16
+ bge Lcbc_dec
+
+Lcbc_done:
+ addi $out,$out,-1
+ lvx $inout,0,$out # redundant in aligned case
+ vsel $inout,$outhead,$inout,$outmask
+ stvx $inout,0,$out
+
+ neg $enc,$ivp # write [unaligned] iv
+ li $idx,15 # 15 is not typo
+ vxor $rndkey0,$rndkey0,$rndkey0
+ vspltisb $outmask,-1
+ le?vspltisb $tmp,0x0f
+ ?lvsl $outperm,0,$enc
+ ?vperm $outmask,$rndkey0,$outmask,$outperm
+ le?vxor $outperm,$outperm,$tmp
+ lvx $outhead,0,$ivp
+ vperm $ivec,$ivec,$ivec,$outperm
+ vsel $inout,$outhead,$ivec,$outmask
+ lvx $inptail,$idx,$ivp
+ stvx $inout,0,$ivp
+ vsel $inout,$ivec,$inptail,$outmask
+ stvx $inout,$idx,$ivp
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,6,0
+ .long 0
+___
+#########################################################################
+{{ # Optimized CBC decrypt procedure #
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+ $x00=0 if ($flavour =~ /osx/);
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
+my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
+ # v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
+
+$code.=<<___;
+.align 5
+_aesp8_cbc_decrypt8x:
+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ li r10,`$FRAME+8*16+15`
+ li r11,`$FRAME+8*16+31`
+ stvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li $x10,0x10
+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ li $x20,0x20
+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ li $x30,0x30
+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ li $x40,0x40
+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ li $x50,0x50
+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ li $x60,0x60
+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ li $x70,0x70
+ mtspr 256,r0
+
+ subi $rounds,$rounds,3 # -4 in total
+ subi $len,$len,128 # bias
+
+ lvx $rndkey0,$x00,$key # load key schedule
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ lvx v31,$x00,$key
+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
+ addi $key_,$sp,$FRAME+15
+ mtctr $rounds
+
+Load_cbc_dec_key:
+ ?vperm v24,v30,v31,$keyperm
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ stvx v24,$x00,$key_ # off-load round[1]
+ ?vperm v25,v31,v30,$keyperm
+ lvx v31,$x00,$key
+ stvx v25,$x10,$key_ # off-load round[2]
+ addi $key_,$key_,0x20
+ bdnz Load_cbc_dec_key
+
+ lvx v26,$x10,$key
+ ?vperm v24,v30,v31,$keyperm
+ lvx v27,$x20,$key
+ stvx v24,$x00,$key_ # off-load round[3]
+ ?vperm v25,v31,v26,$keyperm
+ lvx v28,$x30,$key
+ stvx v25,$x10,$key_ # off-load round[4]
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ ?vperm v26,v26,v27,$keyperm
+ lvx v29,$x40,$key
+ ?vperm v27,v27,v28,$keyperm
+ lvx v30,$x50,$key
+ ?vperm v28,v28,v29,$keyperm
+ lvx v31,$x60,$key
+ ?vperm v29,v29,v30,$keyperm
+ lvx $out0,$x70,$key # borrow $out0
+ ?vperm v30,v30,v31,$keyperm
+ lvx v24,$x00,$key_ # pre-load round[1]
+ ?vperm v31,v31,$out0,$keyperm
+ lvx v25,$x10,$key_ # pre-load round[2]
+
+ #lvx $inptail,0,$inp # "caller" already did this
+ #addi $inp,$inp,15 # 15 is not typo
+ subi $inp,$inp,15 # undo "caller"
+
+ le?li $idx,8
+ lvx_u $in0,$x00,$inp # load first 8 "words"
+ le?lvsl $inpperm,0,$idx
+ le?vspltisb $tmp,0x0f
+ lvx_u $in1,$x10,$inp
+ le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
+ lvx_u $in2,$x20,$inp
+ le?vperm $in0,$in0,$in0,$inpperm
+ lvx_u $in3,$x30,$inp
+ le?vperm $in1,$in1,$in1,$inpperm
+ lvx_u $in4,$x40,$inp
+ le?vperm $in2,$in2,$in2,$inpperm
+ vxor $out0,$in0,$rndkey0
+ lvx_u $in5,$x50,$inp
+ le?vperm $in3,$in3,$in3,$inpperm
+ vxor $out1,$in1,$rndkey0
+ lvx_u $in6,$x60,$inp
+ le?vperm $in4,$in4,$in4,$inpperm
+ vxor $out2,$in2,$rndkey0
+ lvx_u $in7,$x70,$inp
+ addi $inp,$inp,0x80
+ le?vperm $in5,$in5,$in5,$inpperm
+ vxor $out3,$in3,$rndkey0
+ le?vperm $in6,$in6,$in6,$inpperm
+ vxor $out4,$in4,$rndkey0
+ le?vperm $in7,$in7,$in7,$inpperm
+ vxor $out5,$in5,$rndkey0
+ vxor $out6,$in6,$rndkey0
+ vxor $out7,$in7,$rndkey0
+
+ mtctr $rounds
+ b Loop_cbc_dec8x
+.align 5
+Loop_cbc_dec8x:
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_cbc_dec8x
+
+ subic $len,$len,128 # $len-=128
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+
+ subfe. r0,r0,r0 # borrow?-1:0
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+
+ and r0,r0,$len
+ vncipher $out0,$out0,v26
+ vncipher $out1,$out1,v26
+ vncipher $out2,$out2,v26
+ vncipher $out3,$out3,v26
+ vncipher $out4,$out4,v26
+ vncipher $out5,$out5,v26
+ vncipher $out6,$out6,v26
+ vncipher $out7,$out7,v26
+
+ add $inp,$inp,r0 # $inp is adjusted in such
+ # way that at exit from the
+ # loop inX-in7 are loaded
+ # with last "words"
+ vncipher $out0,$out0,v27
+ vncipher $out1,$out1,v27
+ vncipher $out2,$out2,v27
+ vncipher $out3,$out3,v27
+ vncipher $out4,$out4,v27
+ vncipher $out5,$out5,v27
+ vncipher $out6,$out6,v27
+ vncipher $out7,$out7,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vncipher $out0,$out0,v28
+ vncipher $out1,$out1,v28
+ vncipher $out2,$out2,v28
+ vncipher $out3,$out3,v28
+ vncipher $out4,$out4,v28
+ vncipher $out5,$out5,v28
+ vncipher $out6,$out6,v28
+ vncipher $out7,$out7,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ vncipher $out0,$out0,v29
+ vncipher $out1,$out1,v29
+ vncipher $out2,$out2,v29
+ vncipher $out3,$out3,v29
+ vncipher $out4,$out4,v29
+ vncipher $out5,$out5,v29
+ vncipher $out6,$out6,v29
+ vncipher $out7,$out7,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+
+ vncipher $out0,$out0,v30
+ vxor $ivec,$ivec,v31 # xor with last round key
+ vncipher $out1,$out1,v30
+ vxor $in0,$in0,v31
+ vncipher $out2,$out2,v30
+ vxor $in1,$in1,v31
+ vncipher $out3,$out3,v30
+ vxor $in2,$in2,v31
+ vncipher $out4,$out4,v30
+ vxor $in3,$in3,v31
+ vncipher $out5,$out5,v30
+ vxor $in4,$in4,v31
+ vncipher $out6,$out6,v30
+ vxor $in5,$in5,v31
+ vncipher $out7,$out7,v30
+ vxor $in6,$in6,v31
+
+ vncipherlast $out0,$out0,$ivec
+ vncipherlast $out1,$out1,$in0
+ lvx_u $in0,$x00,$inp # load next input block
+ vncipherlast $out2,$out2,$in1
+ lvx_u $in1,$x10,$inp
+ vncipherlast $out3,$out3,$in2
+ le?vperm $in0,$in0,$in0,$inpperm
+ lvx_u $in2,$x20,$inp
+ vncipherlast $out4,$out4,$in3
+ le?vperm $in1,$in1,$in1,$inpperm
+ lvx_u $in3,$x30,$inp
+ vncipherlast $out5,$out5,$in4
+ le?vperm $in2,$in2,$in2,$inpperm
+ lvx_u $in4,$x40,$inp
+ vncipherlast $out6,$out6,$in5
+ le?vperm $in3,$in3,$in3,$inpperm
+ lvx_u $in5,$x50,$inp
+ vncipherlast $out7,$out7,$in6
+ le?vperm $in4,$in4,$in4,$inpperm
+ lvx_u $in6,$x60,$inp
+ vmr $ivec,$in7
+ le?vperm $in5,$in5,$in5,$inpperm
+ lvx_u $in7,$x70,$inp
+ addi $inp,$inp,0x80
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $in6,$in6,$in6,$inpperm
+ vxor $out0,$in0,$rndkey0
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $in7,$in7,$in7,$inpperm
+ vxor $out1,$in1,$rndkey0
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ vxor $out2,$in2,$rndkey0
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ vxor $out3,$in3,$rndkey0
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ vxor $out4,$in4,$rndkey0
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x50,$out
+ vxor $out5,$in5,$rndkey0
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x60,$out
+ vxor $out6,$in6,$rndkey0
+ stvx_u $out7,$x70,$out
+ addi $out,$out,0x80
+ vxor $out7,$in7,$rndkey0
+
+ mtctr $rounds
+ beq Loop_cbc_dec8x # did $len-=128 borrow?
+
+ addic. $len,$len,128
+ beq Lcbc_dec8x_done
+ nop
+ nop
+
+Loop_cbc_dec8x_tail: # up to 7 "words" tail...
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_cbc_dec8x_tail
+
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ vncipher $out6,$out6,v24
+ vncipher $out7,$out7,v24
+
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ vncipher $out6,$out6,v25
+ vncipher $out7,$out7,v25
+
+ vncipher $out1,$out1,v26
+ vncipher $out2,$out2,v26
+ vncipher $out3,$out3,v26
+ vncipher $out4,$out4,v26
+ vncipher $out5,$out5,v26
+ vncipher $out6,$out6,v26
+ vncipher $out7,$out7,v26
+
+ vncipher $out1,$out1,v27
+ vncipher $out2,$out2,v27
+ vncipher $out3,$out3,v27
+ vncipher $out4,$out4,v27
+ vncipher $out5,$out5,v27
+ vncipher $out6,$out6,v27
+ vncipher $out7,$out7,v27
+
+ vncipher $out1,$out1,v28
+ vncipher $out2,$out2,v28
+ vncipher $out3,$out3,v28
+ vncipher $out4,$out4,v28
+ vncipher $out5,$out5,v28
+ vncipher $out6,$out6,v28
+ vncipher $out7,$out7,v28
+
+ vncipher $out1,$out1,v29
+ vncipher $out2,$out2,v29
+ vncipher $out3,$out3,v29
+ vncipher $out4,$out4,v29
+ vncipher $out5,$out5,v29
+ vncipher $out6,$out6,v29
+ vncipher $out7,$out7,v29
+
+ vncipher $out1,$out1,v30
+ vxor $ivec,$ivec,v31 # last round key
+ vncipher $out2,$out2,v30
+ vxor $in1,$in1,v31
+ vncipher $out3,$out3,v30
+ vxor $in2,$in2,v31
+ vncipher $out4,$out4,v30
+ vxor $in3,$in3,v31
+ vncipher $out5,$out5,v30
+ vxor $in4,$in4,v31
+ vncipher $out6,$out6,v30
+ vxor $in5,$in5,v31
+ vncipher $out7,$out7,v30
+ vxor $in6,$in6,v31
+
+ cmplwi $len,32 # switch($len)
+ blt Lcbc_dec8x_one
+ nop
+ beq Lcbc_dec8x_two
+ cmplwi $len,64
+ blt Lcbc_dec8x_three
+ nop
+ beq Lcbc_dec8x_four
+ cmplwi $len,96
+ blt Lcbc_dec8x_five
+ nop
+ beq Lcbc_dec8x_six
+
+Lcbc_dec8x_seven:
+ vncipherlast $out1,$out1,$ivec
+ vncipherlast $out2,$out2,$in1
+ vncipherlast $out3,$out3,$in2
+ vncipherlast $out4,$out4,$in3
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out1,$out1,$out1,$inpperm
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x00,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x10,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x20,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x30,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x40,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x50,$out
+ stvx_u $out7,$x60,$out
+ addi $out,$out,0x70
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_six:
+ vncipherlast $out2,$out2,$ivec
+ vncipherlast $out3,$out3,$in2
+ vncipherlast $out4,$out4,$in3
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out2,$out2,$out2,$inpperm
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x00,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x10,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x20,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x30,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x40,$out
+ stvx_u $out7,$x50,$out
+ addi $out,$out,0x60
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_five:
+ vncipherlast $out3,$out3,$ivec
+ vncipherlast $out4,$out4,$in3
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out3,$out3,$out3,$inpperm
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x00,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x10,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x20,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x30,$out
+ stvx_u $out7,$x40,$out
+ addi $out,$out,0x50
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_four:
+ vncipherlast $out4,$out4,$ivec
+ vncipherlast $out5,$out5,$in4
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out4,$out4,$out4,$inpperm
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x00,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x10,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x20,$out
+ stvx_u $out7,$x30,$out
+ addi $out,$out,0x40
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_three:
+ vncipherlast $out5,$out5,$ivec
+ vncipherlast $out6,$out6,$in5
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out5,$out5,$out5,$inpperm
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x00,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x10,$out
+ stvx_u $out7,$x20,$out
+ addi $out,$out,0x30
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_two:
+ vncipherlast $out6,$out6,$ivec
+ vncipherlast $out7,$out7,$in6
+ vmr $ivec,$in7
+
+ le?vperm $out6,$out6,$out6,$inpperm
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x00,$out
+ stvx_u $out7,$x10,$out
+ addi $out,$out,0x20
+ b Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_one:
+ vncipherlast $out7,$out7,$ivec
+ vmr $ivec,$in7
+
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out7,0,$out
+ addi $out,$out,0x10
+
+Lcbc_dec8x_done:
+ le?vperm $ivec,$ivec,$ivec,$inpperm
+ stvx_u $ivec,0,$ivp # write [unaligned] iv
+
+ li r10,`$FRAME+15`
+ li r11,`$FRAME+31`
+ stvx $inpperm,r10,$sp # wipe copies of round keys
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,0x04,0,0x80,6,6,0
+ .long 0
+.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
+___
+}} }}}
+
+#########################################################################
+{{{ # CTR procedure[s] #
+my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
+ map("v$_",(4..11));
+my $dat=$tmp;
+
+$code.=<<___;
+.globl .${prefix}_ctr32_encrypt_blocks
+.align 5
+.${prefix}_ctr32_encrypt_blocks:
+ ${UCMP}i $len,1
+ bltlr-
+
+ lis r0,0xfff0
+ mfspr $vrsave,256
+ mtspr 256,r0
+
+ li $idx,15
+ vxor $rndkey0,$rndkey0,$rndkey0
+ le?vspltisb $tmp,0x0f
+
+ lvx $ivec,0,$ivp # load [unaligned] iv
+ lvsl $inpperm,0,$ivp
+ lvx $inptail,$idx,$ivp
+ vspltisb $one,1
+ le?vxor $inpperm,$inpperm,$tmp
+ vperm $ivec,$ivec,$inptail,$inpperm
+ vsldoi $one,$rndkey0,$one,1
+
+ neg r11,$inp
+ ?lvsl $keyperm,0,$key # prepare for unaligned key
+ lwz $rounds,240($key)
+
+ lvsr $inpperm,0,r11 # prepare for unaligned load
+ lvx $inptail,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ le?vxor $inpperm,$inpperm,$tmp
+
+ srwi $rounds,$rounds,1
+ li $idx,16
+ subi $rounds,$rounds,1
+
+ ${UCMP}i $len,8
+ bge _aesp8_ctr32_encrypt8x
+
+ ?lvsr $outperm,0,$out # prepare for unaligned store
+ vspltisb $outmask,-1
+ lvx $outhead,0,$out
+ ?vperm $outmask,$rndkey0,$outmask,$outperm
+ le?vxor $outperm,$outperm,$tmp
+
+ lvx $rndkey0,0,$key
+ mtctr $rounds
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$ivec,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ b Loop_ctr32_enc
+
+.align 5
+Loop_ctr32_enc:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ bdnz Loop_ctr32_enc
+
+ vadduwm $ivec,$ivec,$one
+ vmr $dat,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ subic. $len,$len,1 # blocks--
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key
+ vperm $dat,$dat,$inptail,$inpperm
+ li $idx,16
+ ?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm
+ lvx $rndkey0,0,$key
+ vxor $dat,$dat,$rndkey1 # last round key
+ vcipherlast $inout,$inout,$dat
+
+ lvx $rndkey1,$idx,$key
+ addi $idx,$idx,16
+ vperm $inout,$inout,$inout,$outperm
+ vsel $dat,$outhead,$inout,$outmask
+ mtctr $rounds
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vmr $outhead,$inout
+ vxor $inout,$ivec,$rndkey0
+ lvx $rndkey0,$idx,$key
+ addi $idx,$idx,16
+ stvx $dat,0,$out
+ addi $out,$out,16
+ bne Loop_ctr32_enc
+
+ addi $out,$out,-1
+ lvx $inout,0,$out # redundant in aligned case
+ vsel $inout,$outhead,$inout,$outmask
+ stvx $inout,0,$out
+
+ mtspr 256,$vrsave
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,6,0
+ .long 0
+___
+#########################################################################
+{{ # Optimized CTR procedure #
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+ $x00=0 if ($flavour =~ /osx/);
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
+my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
+ # v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
+my ($two,$three,$four)=($outhead,$outperm,$outmask);
+
+$code.=<<___;
+.align 5
+_aesp8_ctr32_encrypt8x:
+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ li r10,`$FRAME+8*16+15`
+ li r11,`$FRAME+8*16+31`
+ stvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li $x10,0x10
+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ li $x20,0x20
+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ li $x30,0x30
+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ li $x40,0x40
+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ li $x50,0x50
+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ li $x60,0x60
+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ li $x70,0x70
+ mtspr 256,r0
+
+ subi $rounds,$rounds,3 # -4 in total
+
+ lvx $rndkey0,$x00,$key # load key schedule
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ lvx v31,$x00,$key
+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
+ addi $key_,$sp,$FRAME+15
+ mtctr $rounds
+
+Load_ctr32_enc_key:
+ ?vperm v24,v30,v31,$keyperm
+ lvx v30,$x10,$key
+ addi $key,$key,0x20
+ stvx v24,$x00,$key_ # off-load round[1]
+ ?vperm v25,v31,v30,$keyperm
+ lvx v31,$x00,$key
+ stvx v25,$x10,$key_ # off-load round[2]
+ addi $key_,$key_,0x20
+ bdnz Load_ctr32_enc_key
+
+ lvx v26,$x10,$key
+ ?vperm v24,v30,v31,$keyperm
+ lvx v27,$x20,$key
+ stvx v24,$x00,$key_ # off-load round[3]
+ ?vperm v25,v31,v26,$keyperm
+ lvx v28,$x30,$key
+ stvx v25,$x10,$key_ # off-load round[4]
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ ?vperm v26,v26,v27,$keyperm
+ lvx v29,$x40,$key
+ ?vperm v27,v27,v28,$keyperm
+ lvx v30,$x50,$key
+ ?vperm v28,v28,v29,$keyperm
+ lvx v31,$x60,$key
+ ?vperm v29,v29,v30,$keyperm
+ lvx $out0,$x70,$key # borrow $out0
+ ?vperm v30,v30,v31,$keyperm
+ lvx v24,$x00,$key_ # pre-load round[1]
+ ?vperm v31,v31,$out0,$keyperm
+ lvx v25,$x10,$key_ # pre-load round[2]
+
+ vadduwm $two,$one,$one
+ subi $inp,$inp,15 # undo "caller"
+ $SHL $len,$len,4
+
+ vadduwm $out1,$ivec,$one # counter values ...
+ vadduwm $out2,$ivec,$two
+ vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
+ le?li $idx,8
+ vadduwm $out3,$out1,$two
+ vxor $out1,$out1,$rndkey0
+ le?lvsl $inpperm,0,$idx
+ vadduwm $out4,$out2,$two
+ vxor $out2,$out2,$rndkey0
+ le?vspltisb $tmp,0x0f
+ vadduwm $out5,$out3,$two
+ vxor $out3,$out3,$rndkey0
+ le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
+ vadduwm $out6,$out4,$two
+ vxor $out4,$out4,$rndkey0
+ vadduwm $out7,$out5,$two
+ vxor $out5,$out5,$rndkey0
+ vadduwm $ivec,$out6,$two # next counter value
+ vxor $out6,$out6,$rndkey0
+ vxor $out7,$out7,$rndkey0
+
+ mtctr $rounds
+ b Loop_ctr32_enc8x
+.align 5
+Loop_ctr32_enc8x:
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+ vcipher $out5,$out5,v24
+ vcipher $out6,$out6,v24
+ vcipher $out7,$out7,v24
+Loop_ctr32_enc8x_middle:
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ vcipher $out5,$out5,v25
+ vcipher $out6,$out6,v25
+ vcipher $out7,$out7,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_ctr32_enc8x
+
+ subic r11,$len,256 # $len-256, borrow $key_
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+ vcipher $out5,$out5,v24
+ vcipher $out6,$out6,v24
+ vcipher $out7,$out7,v24
+
+ subfe r0,r0,r0 # borrow?-1:0
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ vcipher $out5,$out5,v25
+ vcipher $out6,$out6,v25
+ vcipher $out7,$out7,v25
+
+ and r0,r0,r11
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vcipher $out0,$out0,v26
+ vcipher $out1,$out1,v26
+ vcipher $out2,$out2,v26
+ vcipher $out3,$out3,v26
+ vcipher $out4,$out4,v26
+ vcipher $out5,$out5,v26
+ vcipher $out6,$out6,v26
+ vcipher $out7,$out7,v26
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ subic $len,$len,129 # $len-=129
+ vcipher $out0,$out0,v27
+ addi $len,$len,1 # $len-=128 really
+ vcipher $out1,$out1,v27
+ vcipher $out2,$out2,v27
+ vcipher $out3,$out3,v27
+ vcipher $out4,$out4,v27
+ vcipher $out5,$out5,v27
+ vcipher $out6,$out6,v27
+ vcipher $out7,$out7,v27
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+
+ vcipher $out0,$out0,v28
+ lvx_u $in0,$x00,$inp # load input
+ vcipher $out1,$out1,v28
+ lvx_u $in1,$x10,$inp
+ vcipher $out2,$out2,v28
+ lvx_u $in2,$x20,$inp
+ vcipher $out3,$out3,v28
+ lvx_u $in3,$x30,$inp
+ vcipher $out4,$out4,v28
+ lvx_u $in4,$x40,$inp
+ vcipher $out5,$out5,v28
+ lvx_u $in5,$x50,$inp
+ vcipher $out6,$out6,v28
+ lvx_u $in6,$x60,$inp
+ vcipher $out7,$out7,v28
+ lvx_u $in7,$x70,$inp
+ addi $inp,$inp,0x80
+
+ vcipher $out0,$out0,v29
+ le?vperm $in0,$in0,$in0,$inpperm
+ vcipher $out1,$out1,v29
+ le?vperm $in1,$in1,$in1,$inpperm
+ vcipher $out2,$out2,v29
+ le?vperm $in2,$in2,$in2,$inpperm
+ vcipher $out3,$out3,v29
+ le?vperm $in3,$in3,$in3,$inpperm
+ vcipher $out4,$out4,v29
+ le?vperm $in4,$in4,$in4,$inpperm
+ vcipher $out5,$out5,v29
+ le?vperm $in5,$in5,$in5,$inpperm
+ vcipher $out6,$out6,v29
+ le?vperm $in6,$in6,$in6,$inpperm
+ vcipher $out7,$out7,v29
+ le?vperm $in7,$in7,$in7,$inpperm
+
+ add $inp,$inp,r0 # $inp is adjusted in such
+ # way that at exit from the
+ # loop inX-in7 are loaded
+ # with last "words"
+ subfe. r0,r0,r0 # borrow?-1:0
+ vcipher $out0,$out0,v30
+ vxor $in0,$in0,v31 # xor with last round key
+ vcipher $out1,$out1,v30
+ vxor $in1,$in1,v31
+ vcipher $out2,$out2,v30
+ vxor $in2,$in2,v31
+ vcipher $out3,$out3,v30
+ vxor $in3,$in3,v31
+ vcipher $out4,$out4,v30
+ vxor $in4,$in4,v31
+ vcipher $out5,$out5,v30
+ vxor $in5,$in5,v31
+ vcipher $out6,$out6,v30
+ vxor $in6,$in6,v31
+ vcipher $out7,$out7,v30
+ vxor $in7,$in7,v31
+
+ bne Lctr32_enc8x_break # did $len-129 borrow?
+
+ vcipherlast $in0,$out0,$in0
+ vcipherlast $in1,$out1,$in1
+ vadduwm $out1,$ivec,$one # counter values ...
+ vcipherlast $in2,$out2,$in2
+ vadduwm $out2,$ivec,$two
+ vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
+ vcipherlast $in3,$out3,$in3
+ vadduwm $out3,$out1,$two
+ vxor $out1,$out1,$rndkey0
+ vcipherlast $in4,$out4,$in4
+ vadduwm $out4,$out2,$two
+ vxor $out2,$out2,$rndkey0
+ vcipherlast $in5,$out5,$in5
+ vadduwm $out5,$out3,$two
+ vxor $out3,$out3,$rndkey0
+ vcipherlast $in6,$out6,$in6
+ vadduwm $out6,$out4,$two
+ vxor $out4,$out4,$rndkey0
+ vcipherlast $in7,$out7,$in7
+ vadduwm $out7,$out5,$two
+ vxor $out5,$out5,$rndkey0
+ le?vperm $in0,$in0,$in0,$inpperm
+ vadduwm $ivec,$out6,$two # next counter value
+ vxor $out6,$out6,$rndkey0
+ le?vperm $in1,$in1,$in1,$inpperm
+ vxor $out7,$out7,$rndkey0
+ mtctr $rounds
+
+ vcipher $out0,$out0,v24
+ stvx_u $in0,$x00,$out
+ le?vperm $in2,$in2,$in2,$inpperm
+ vcipher $out1,$out1,v24
+ stvx_u $in1,$x10,$out
+ le?vperm $in3,$in3,$in3,$inpperm
+ vcipher $out2,$out2,v24
+ stvx_u $in2,$x20,$out
+ le?vperm $in4,$in4,$in4,$inpperm
+ vcipher $out3,$out3,v24
+ stvx_u $in3,$x30,$out
+ le?vperm $in5,$in5,$in5,$inpperm
+ vcipher $out4,$out4,v24
+ stvx_u $in4,$x40,$out
+ le?vperm $in6,$in6,$in6,$inpperm
+ vcipher $out5,$out5,v24
+ stvx_u $in5,$x50,$out
+ le?vperm $in7,$in7,$in7,$inpperm
+ vcipher $out6,$out6,v24
+ stvx_u $in6,$x60,$out
+ vcipher $out7,$out7,v24
+ stvx_u $in7,$x70,$out
+ addi $out,$out,0x80
+
+ b Loop_ctr32_enc8x_middle
+
+.align 5
+Lctr32_enc8x_break:
+ cmpwi $len,-0x60
+ blt Lctr32_enc8x_one
+ nop
+ beq Lctr32_enc8x_two
+ cmpwi $len,-0x40
+ blt Lctr32_enc8x_three
+ nop
+ beq Lctr32_enc8x_four
+ cmpwi $len,-0x20
+ blt Lctr32_enc8x_five
+ nop
+ beq Lctr32_enc8x_six
+ cmpwi $len,0x00
+ blt Lctr32_enc8x_seven
+
+Lctr32_enc8x_eight:
+ vcipherlast $out0,$out0,$in0
+ vcipherlast $out1,$out1,$in1
+ vcipherlast $out2,$out2,$in2
+ vcipherlast $out3,$out3,$in3
+ vcipherlast $out4,$out4,$in4
+ vcipherlast $out5,$out5,$in5
+ vcipherlast $out6,$out6,$in6
+ vcipherlast $out7,$out7,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x50,$out
+ le?vperm $out7,$out7,$out7,$inpperm
+ stvx_u $out6,$x60,$out
+ stvx_u $out7,$x70,$out
+ addi $out,$out,0x80
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_seven:
+ vcipherlast $out0,$out0,$in1
+ vcipherlast $out1,$out1,$in2
+ vcipherlast $out2,$out2,$in3
+ vcipherlast $out3,$out3,$in4
+ vcipherlast $out4,$out4,$in5
+ vcipherlast $out5,$out5,$in6
+ vcipherlast $out6,$out6,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ le?vperm $out6,$out6,$out6,$inpperm
+ stvx_u $out5,$x50,$out
+ stvx_u $out6,$x60,$out
+ addi $out,$out,0x70
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_six:
+ vcipherlast $out0,$out0,$in2
+ vcipherlast $out1,$out1,$in3
+ vcipherlast $out2,$out2,$in4
+ vcipherlast $out3,$out3,$in5
+ vcipherlast $out4,$out4,$in6
+ vcipherlast $out5,$out5,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ le?vperm $out5,$out5,$out5,$inpperm
+ stvx_u $out4,$x40,$out
+ stvx_u $out5,$x50,$out
+ addi $out,$out,0x60
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_five:
+ vcipherlast $out0,$out0,$in3
+ vcipherlast $out1,$out1,$in4
+ vcipherlast $out2,$out2,$in5
+ vcipherlast $out3,$out3,$in6
+ vcipherlast $out4,$out4,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$inpperm
+ stvx_u $out3,$x30,$out
+ stvx_u $out4,$x40,$out
+ addi $out,$out,0x50
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_four:
+ vcipherlast $out0,$out0,$in4
+ vcipherlast $out1,$out1,$in5
+ vcipherlast $out2,$out2,$in6
+ vcipherlast $out3,$out3,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$inpperm
+ stvx_u $out2,$x20,$out
+ stvx_u $out3,$x30,$out
+ addi $out,$out,0x40
+ b Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_three:
+ vcipherlast $out0,$out0,$in5
+ vcipherlast $out1,$out1,$in6
+ vcipherlast $out2,$out2,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ le?vperm $out2,$out2,$out2,$inpperm
+ stvx_u $out1,$x10,$out
+ stvx_u $out2,$x20,$out
+ addi $out,$out,0x30
+ b Lcbc_dec8x_done
+
+.align 5
+Lctr32_enc8x_two:
+ vcipherlast $out0,$out0,$in6
+ vcipherlast $out1,$out1,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ le?vperm $out1,$out1,$out1,$inpperm
+ stvx_u $out0,$x00,$out
+ stvx_u $out1,$x10,$out
+ addi $out,$out,0x20
+ b Lcbc_dec8x_done
+
+.align 5
+Lctr32_enc8x_one:
+ vcipherlast $out0,$out0,$in7
+
+ le?vperm $out0,$out0,$out0,$inpperm
+ stvx_u $out0,0,$out
+ addi $out,$out,0x10
+
+Lctr32_enc8x_done:
+ li r10,`$FRAME+15`
+ li r11,`$FRAME+31`
+ stvx $inpperm,r10,$sp # wipe copies of round keys
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+ stvx $inpperm,r10,$sp
+ addi r10,r10,32
+ stvx $inpperm,r11,$sp
+ addi r11,r11,32
+
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,0x04,0,0x80,6,6,0
+ .long 0
+.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
+___
+}} }}}
+
+#########################################################################
+{{{ # XTS procedures #
+# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len, #
+# const AES_KEY *key1, const AES_KEY *key2, #
+# [const] unsigned char iv[16]); #
+# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which #
+# input tweak value is assumed to be encrypted already, and last tweak #
+# value, one suitable for consecutive call on same chunk of data, is #
+# written back to original buffer. In addition, in "tweak chaining" #
+# mode only complete input blocks are processed. #
+
+my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2));
+my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7));
+my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12));
+my $taillen = $key2;
+
+ ($inp,$idx) = ($idx,$inp); # reassign
+
+$code.=<<___;
+.globl .${prefix}_xts_encrypt
+.align 5
+.${prefix}_xts_encrypt:
+ mr $inp,r3 # reassign
+ li r3,-1
+ ${UCMP}i $len,16
+ bltlr-
+
+ lis r0,0xfff0
+ mfspr r12,256 # save vrsave
+ li r11,0
+ mtspr 256,r0
+
+ vspltisb $seven,0x07 # 0x070707..07
+ le?lvsl $leperm,r11,r11
+ le?vspltisb $tmp,0x0f
+ le?vxor $leperm,$leperm,$seven
+
+ li $idx,15
+ lvx $tweak,0,$ivp # load [unaligned] iv
+ lvsl $inpperm,0,$ivp
+ lvx $inptail,$idx,$ivp
+ le?vxor $inpperm,$inpperm,$tmp
+ vperm $tweak,$tweak,$inptail,$inpperm
+
+ neg r11,$inp
+ lvsr $inpperm,0,r11 # prepare for unaligned load
+ lvx $inout,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ le?vxor $inpperm,$inpperm,$tmp
+
+ ${UCMP}i $key2,0 # key2==NULL?
+ beq Lxts_enc_no_key2
+
+ ?lvsl $keyperm,0,$key2 # prepare for unaligned key
+ lwz $rounds,240($key2)
+ srwi $rounds,$rounds,1
+ subi $rounds,$rounds,1
+ li $idx,16
+
+ lvx $rndkey0,0,$key2
+ lvx $rndkey1,$idx,$key2
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $tweak,$tweak,$rndkey0
+ lvx $rndkey0,$idx,$key2
+ addi $idx,$idx,16
+ mtctr $rounds
+
+Ltweak_xts_enc:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $tweak,$tweak,$rndkey1
+ lvx $rndkey1,$idx,$key2
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $tweak,$tweak,$rndkey0
+ lvx $rndkey0,$idx,$key2
+ addi $idx,$idx,16
+ bdnz Ltweak_xts_enc
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $tweak,$tweak,$rndkey1
+ lvx $rndkey1,$idx,$key2
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipherlast $tweak,$tweak,$rndkey0
+
+ li $ivp,0 # don't chain the tweak
+ b Lxts_enc
+
+Lxts_enc_no_key2:
+ li $idx,-16
+ and $len,$len,$idx # in "tweak chaining"
+ # mode only complete
+ # blocks are processed
+Lxts_enc:
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+
+ ?lvsl $keyperm,0,$key1 # prepare for unaligned key
+ lwz $rounds,240($key1)
+ srwi $rounds,$rounds,1
+ subi $rounds,$rounds,1
+ li $idx,16
+
+ vslb $eighty7,$seven,$seven # 0x808080..80
+ vor $eighty7,$eighty7,$seven # 0x878787..87
+ vspltisb $tmp,1 # 0x010101..01
+ vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
+
+ ${UCMP}i $len,96
+ bge _aesp8_xts_encrypt6x
+
+ andi. $taillen,$len,15
+ subic r0,$len,32
+ subi $taillen,$taillen,16
+ subfe r0,r0,r0
+ and r0,r0,$taillen
+ add $inp,$inp,r0
+
+ lvx $rndkey0,0,$key1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ vperm $inout,$inout,$inptail,$inpperm
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$inout,$tweak
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+ mtctr $rounds
+ b Loop_xts_enc
+
+.align 5
+Loop_xts_enc:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+ bdnz Loop_xts_enc
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $rndkey0,$rndkey0,$tweak
+ vcipherlast $output,$inout,$rndkey0
+
+ le?vperm $tmp,$output,$output,$leperm
+ be?nop
+ le?stvx_u $tmp,0,$out
+ be?stvx_u $output,0,$out
+ addi $out,$out,16
+
+ subic. $len,$len,16
+ beq Lxts_enc_done
+
+ vmr $inout,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ lvx $rndkey0,0,$key1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+
+ subic r0,$len,32
+ subfe r0,r0,r0
+ and r0,r0,$taillen
+ add $inp,$inp,r0
+
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $tweak,$tweak,$tmp
+
+ vperm $inout,$inout,$inptail,$inpperm
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$inout,$tweak
+ vxor $output,$output,$rndkey0 # just in case $len<16
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+
+ mtctr $rounds
+ ${UCMP}i $len,16
+ bge Loop_xts_enc
+
+ vxor $output,$output,$tweak
+ lvsr $inpperm,0,$len # $inpperm is no longer needed
+ vxor $inptail,$inptail,$inptail # $inptail is no longer needed
+ vspltisb $tmp,-1
+ vperm $inptail,$inptail,$tmp,$inpperm
+ vsel $inout,$inout,$output,$inptail
+
+ subi r11,$out,17
+ subi $out,$out,16
+ mtctr $len
+ li $len,16
+Loop_xts_enc_steal:
+ lbzu r0,1(r11)
+ stb r0,16(r11)
+ bdnz Loop_xts_enc_steal
+
+ mtctr $rounds
+ b Loop_xts_enc # one more time...
+
+Lxts_enc_done:
+ ${UCMP}i $ivp,0
+ beq Lxts_enc_ret
+
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $tweak,$tweak,$tmp
+
+ le?vperm $tweak,$tweak,$tweak,$leperm
+ stvx_u $tweak,0,$ivp
+
+Lxts_enc_ret:
+ mtspr 256,r12 # restore vrsave
+ li r3,0
+ blr
+ .long 0
+ .byte 0,12,0x04,0,0x80,6,6,0
+ .long 0
+.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
+
+.globl .${prefix}_xts_decrypt
+.align 5
+.${prefix}_xts_decrypt:
+ mr $inp,r3 # reassign
+ li r3,-1
+ ${UCMP}i $len,16
+ bltlr-
+
+ lis r0,0xfff8
+ mfspr r12,256 # save vrsave
+ li r11,0
+ mtspr 256,r0
+
+ andi. r0,$len,15
+ neg r0,r0
+ andi. r0,r0,16
+ sub $len,$len,r0
+
+ vspltisb $seven,0x07 # 0x070707..07
+ le?lvsl $leperm,r11,r11
+ le?vspltisb $tmp,0x0f
+ le?vxor $leperm,$leperm,$seven
+
+ li $idx,15
+ lvx $tweak,0,$ivp # load [unaligned] iv
+ lvsl $inpperm,0,$ivp
+ lvx $inptail,$idx,$ivp
+ le?vxor $inpperm,$inpperm,$tmp
+ vperm $tweak,$tweak,$inptail,$inpperm
+
+ neg r11,$inp
+ lvsr $inpperm,0,r11 # prepare for unaligned load
+ lvx $inout,0,$inp
+ addi $inp,$inp,15 # 15 is not typo
+ le?vxor $inpperm,$inpperm,$tmp
+
+ ${UCMP}i $key2,0 # key2==NULL?
+ beq Lxts_dec_no_key2
+
+ ?lvsl $keyperm,0,$key2 # prepare for unaligned key
+ lwz $rounds,240($key2)
+ srwi $rounds,$rounds,1
+ subi $rounds,$rounds,1
+ li $idx,16
+
+ lvx $rndkey0,0,$key2
+ lvx $rndkey1,$idx,$key2
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $tweak,$tweak,$rndkey0
+ lvx $rndkey0,$idx,$key2
+ addi $idx,$idx,16
+ mtctr $rounds
+
+Ltweak_xts_dec:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $tweak,$tweak,$rndkey1
+ lvx $rndkey1,$idx,$key2
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipher $tweak,$tweak,$rndkey0
+ lvx $rndkey0,$idx,$key2
+ addi $idx,$idx,16
+ bdnz Ltweak_xts_dec
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vcipher $tweak,$tweak,$rndkey1
+ lvx $rndkey1,$idx,$key2
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vcipherlast $tweak,$tweak,$rndkey0
+
+ li $ivp,0 # don't chain the tweak
+ b Lxts_dec
+
+Lxts_dec_no_key2:
+ neg $idx,$len
+ andi. $idx,$idx,15
+ add $len,$len,$idx # in "tweak chaining"
+ # mode only complete
+ # blocks are processed
+Lxts_dec:
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+
+ ?lvsl $keyperm,0,$key1 # prepare for unaligned key
+ lwz $rounds,240($key1)
+ srwi $rounds,$rounds,1
+ subi $rounds,$rounds,1
+ li $idx,16
+
+ vslb $eighty7,$seven,$seven # 0x808080..80
+ vor $eighty7,$eighty7,$seven # 0x878787..87
+ vspltisb $tmp,1 # 0x010101..01
+ vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
+
+ ${UCMP}i $len,96
+ bge _aesp8_xts_decrypt6x
+
+ lvx $rndkey0,0,$key1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ vperm $inout,$inout,$inptail,$inpperm
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$inout,$tweak
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+ mtctr $rounds
+
+ ${UCMP}i $len,16
+ blt Ltail_xts_dec
+ be?b Loop_xts_dec
+
+.align 5
+Loop_xts_dec:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vncipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+ bdnz Loop_xts_dec
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $rndkey0,$rndkey0,$tweak
+ vncipherlast $output,$inout,$rndkey0
+
+ le?vperm $tmp,$output,$output,$leperm
+ be?nop
+ le?stvx_u $tmp,0,$out
+ be?stvx_u $output,0,$out
+ addi $out,$out,16
+
+ subic. $len,$len,16
+ beq Lxts_dec_done
+
+ vmr $inout,$inptail
+ lvx $inptail,0,$inp
+ addi $inp,$inp,16
+ lvx $rndkey0,0,$key1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $tweak,$tweak,$tmp
+
+ vperm $inout,$inout,$inptail,$inpperm
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $inout,$inout,$tweak
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+
+ mtctr $rounds
+ ${UCMP}i $len,16
+ bge Loop_xts_dec
+
+Ltail_xts_dec:
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak1,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $tweak1,$tweak1,$tmp
+
+ subi $inp,$inp,16
+ add $inp,$inp,$len
+
+ vxor $inout,$inout,$tweak # :-(
+ vxor $inout,$inout,$tweak1 # :-)
+
+Loop_xts_dec_short:
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vncipher $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+ bdnz Loop_xts_dec_short
+
+ ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
+ vncipher $inout,$inout,$rndkey1
+ lvx $rndkey1,$idx,$key1
+ li $idx,16
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+ vxor $rndkey0,$rndkey0,$tweak1
+ vncipherlast $output,$inout,$rndkey0
+
+ le?vperm $tmp,$output,$output,$leperm
+ be?nop
+ le?stvx_u $tmp,0,$out
+ be?stvx_u $output,0,$out
+
+ vmr $inout,$inptail
+ lvx $inptail,0,$inp
+ #addi $inp,$inp,16
+ lvx $rndkey0,0,$key1
+ lvx $rndkey1,$idx,$key1
+ addi $idx,$idx,16
+ vperm $inout,$inout,$inptail,$inpperm
+ ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
+
+ lvsr $inpperm,0,$len # $inpperm is no longer needed
+ vxor $inptail,$inptail,$inptail # $inptail is no longer needed
+ vspltisb $tmp,-1
+ vperm $inptail,$inptail,$tmp,$inpperm
+ vsel $inout,$inout,$output,$inptail
+
+ vxor $rndkey0,$rndkey0,$tweak
+ vxor $inout,$inout,$rndkey0
+ lvx $rndkey0,$idx,$key1
+ addi $idx,$idx,16
+
+ subi r11,$out,1
+ mtctr $len
+ li $len,16
+Loop_xts_dec_steal:
+ lbzu r0,1(r11)
+ stb r0,16(r11)
+ bdnz Loop_xts_dec_steal
+
+ mtctr $rounds
+ b Loop_xts_dec # one more time...
+
+Lxts_dec_done:
+ ${UCMP}i $ivp,0
+ beq Lxts_dec_ret
+
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $tweak,$tweak,$tmp
+
+ le?vperm $tweak,$tweak,$tweak,$leperm
+ stvx_u $tweak,0,$ivp
+
+Lxts_dec_ret:
+ mtspr 256,r12 # restore vrsave
+ li r3,0
+ blr
+ .long 0
+ .byte 0,12,0x04,0,0x80,6,6,0
+ .long 0
+.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
+___
+#########################################################################
+{{ # Optimized XTS procedures #
+my $key_=$key2;
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
+ $x00=0 if ($flavour =~ /osx/);
+my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5));
+my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
+my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
+my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
+ # v26-v31 last 6 round keys
+my ($keyperm)=($out0); # aliases with "caller", redundant assignment
+my $taillen=$x70;
+
+$code.=<<___;
+.align 5
+_aesp8_xts_encrypt6x:
+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ mflr r11
+ li r7,`$FRAME+8*16+15`
+ li r3,`$FRAME+8*16+31`
+ $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+ stvx v20,r7,$sp # ABI says so
+ addi r7,r7,32
+ stvx v21,r3,$sp
+ addi r3,r3,32
+ stvx v22,r7,$sp
+ addi r7,r7,32
+ stvx v23,r3,$sp
+ addi r3,r3,32
+ stvx v24,r7,$sp
+ addi r7,r7,32
+ stvx v25,r3,$sp
+ addi r3,r3,32
+ stvx v26,r7,$sp
+ addi r7,r7,32
+ stvx v27,r3,$sp
+ addi r3,r3,32
+ stvx v28,r7,$sp
+ addi r7,r7,32
+ stvx v29,r3,$sp
+ addi r3,r3,32
+ stvx v30,r7,$sp
+ stvx v31,r3,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li $x10,0x10
+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ li $x20,0x20
+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ li $x30,0x30
+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ li $x40,0x40
+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ li $x50,0x50
+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ li $x60,0x60
+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ li $x70,0x70
+ mtspr 256,r0
+
+ subi $rounds,$rounds,3 # -4 in total
+
+ lvx $rndkey0,$x00,$key1 # load key schedule
+ lvx v30,$x10,$key1
+ addi $key1,$key1,0x20
+ lvx v31,$x00,$key1
+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
+ addi $key_,$sp,$FRAME+15
+ mtctr $rounds
+
+Load_xts_enc_key:
+ ?vperm v24,v30,v31,$keyperm
+ lvx v30,$x10,$key1
+ addi $key1,$key1,0x20
+ stvx v24,$x00,$key_ # off-load round[1]
+ ?vperm v25,v31,v30,$keyperm
+ lvx v31,$x00,$key1
+ stvx v25,$x10,$key_ # off-load round[2]
+ addi $key_,$key_,0x20
+ bdnz Load_xts_enc_key
+
+ lvx v26,$x10,$key1
+ ?vperm v24,v30,v31,$keyperm
+ lvx v27,$x20,$key1
+ stvx v24,$x00,$key_ # off-load round[3]
+ ?vperm v25,v31,v26,$keyperm
+ lvx v28,$x30,$key1
+ stvx v25,$x10,$key_ # off-load round[4]
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ ?vperm v26,v26,v27,$keyperm
+ lvx v29,$x40,$key1
+ ?vperm v27,v27,v28,$keyperm
+ lvx v30,$x50,$key1
+ ?vperm v28,v28,v29,$keyperm
+ lvx v31,$x60,$key1
+ ?vperm v29,v29,v30,$keyperm
+ lvx $twk5,$x70,$key1 # borrow $twk5
+ ?vperm v30,v30,v31,$keyperm
+ lvx v24,$x00,$key_ # pre-load round[1]
+ ?vperm v31,v31,$twk5,$keyperm
+ lvx v25,$x10,$key_ # pre-load round[2]
+
+ vperm $in0,$inout,$inptail,$inpperm
+ subi $inp,$inp,31 # undo "caller"
+ vxor $twk0,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $out0,$in0,$twk0
+ vxor $tweak,$tweak,$tmp
+
+ lvx_u $in1,$x10,$inp
+ vxor $twk1,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ le?vperm $in1,$in1,$in1,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out1,$in1,$twk1
+ vxor $tweak,$tweak,$tmp
+
+ lvx_u $in2,$x20,$inp
+ andi. $taillen,$len,15
+ vxor $twk2,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ le?vperm $in2,$in2,$in2,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out2,$in2,$twk2
+ vxor $tweak,$tweak,$tmp
+
+ lvx_u $in3,$x30,$inp
+ sub $len,$len,$taillen
+ vxor $twk3,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ le?vperm $in3,$in3,$in3,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out3,$in3,$twk3
+ vxor $tweak,$tweak,$tmp
+
+ lvx_u $in4,$x40,$inp
+ subi $len,$len,0x60
+ vxor $twk4,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ le?vperm $in4,$in4,$in4,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out4,$in4,$twk4
+ vxor $tweak,$tweak,$tmp
+
+ lvx_u $in5,$x50,$inp
+ addi $inp,$inp,0x60
+ vxor $twk5,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ le?vperm $in5,$in5,$in5,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out5,$in5,$twk5
+ vxor $tweak,$tweak,$tmp
+
+ vxor v31,v31,$rndkey0
+ mtctr $rounds
+ b Loop_xts_enc6x
+
+.align 5
+Loop_xts_enc6x:
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+ vcipher $out5,$out5,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ vcipher $out5,$out5,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_xts_enc6x
+
+ subic $len,$len,96 # $len-=96
+ vxor $in0,$twk0,v31 # xor with last round key
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk0,$tweak,$rndkey0
+ vaddubm $tweak,$tweak,$tweak
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vsldoi $tmp,$tmp,$tmp,15
+ vcipher $out4,$out4,v24
+ vcipher $out5,$out5,v24
+
+ subfe. r0,r0,r0 # borrow?-1:0
+ vand $tmp,$tmp,$eighty7
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vxor $tweak,$tweak,$tmp
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vxor $in1,$twk1,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk1,$tweak,$rndkey0
+ vcipher $out4,$out4,v25
+ vcipher $out5,$out5,v25
+
+ and r0,r0,$len
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vcipher $out0,$out0,v26
+ vcipher $out1,$out1,v26
+ vand $tmp,$tmp,$eighty7
+ vcipher $out2,$out2,v26
+ vcipher $out3,$out3,v26
+ vxor $tweak,$tweak,$tmp
+ vcipher $out4,$out4,v26
+ vcipher $out5,$out5,v26
+
+ add $inp,$inp,r0 # $inp is adjusted in such
+ # way that at exit from the
+ # loop inX-in5 are loaded
+ # with last "words"
+ vxor $in2,$twk2,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk2,$tweak,$rndkey0
+ vaddubm $tweak,$tweak,$tweak
+ vcipher $out0,$out0,v27
+ vcipher $out1,$out1,v27
+ vsldoi $tmp,$tmp,$tmp,15
+ vcipher $out2,$out2,v27
+ vcipher $out3,$out3,v27
+ vand $tmp,$tmp,$eighty7
+ vcipher $out4,$out4,v27
+ vcipher $out5,$out5,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vxor $tweak,$tweak,$tmp
+ vcipher $out0,$out0,v28
+ vcipher $out1,$out1,v28
+ vxor $in3,$twk3,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk3,$tweak,$rndkey0
+ vcipher $out2,$out2,v28
+ vcipher $out3,$out3,v28
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vcipher $out4,$out4,v28
+ vcipher $out5,$out5,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+ vand $tmp,$tmp,$eighty7
+
+ vcipher $out0,$out0,v29
+ vcipher $out1,$out1,v29
+ vxor $tweak,$tweak,$tmp
+ vcipher $out2,$out2,v29
+ vcipher $out3,$out3,v29
+ vxor $in4,$twk4,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk4,$tweak,$rndkey0
+ vcipher $out4,$out4,v29
+ vcipher $out5,$out5,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+
+ vcipher $out0,$out0,v30
+ vcipher $out1,$out1,v30
+ vand $tmp,$tmp,$eighty7
+ vcipher $out2,$out2,v30
+ vcipher $out3,$out3,v30
+ vxor $tweak,$tweak,$tmp
+ vcipher $out4,$out4,v30
+ vcipher $out5,$out5,v30
+ vxor $in5,$twk5,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk5,$tweak,$rndkey0
+
+ vcipherlast $out0,$out0,$in0
+ lvx_u $in0,$x00,$inp # load next input block
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vcipherlast $out1,$out1,$in1
+ lvx_u $in1,$x10,$inp
+ vcipherlast $out2,$out2,$in2
+ le?vperm $in0,$in0,$in0,$leperm
+ lvx_u $in2,$x20,$inp
+ vand $tmp,$tmp,$eighty7
+ vcipherlast $out3,$out3,$in3
+ le?vperm $in1,$in1,$in1,$leperm
+ lvx_u $in3,$x30,$inp
+ vcipherlast $out4,$out4,$in4
+ le?vperm $in2,$in2,$in2,$leperm
+ lvx_u $in4,$x40,$inp
+ vxor $tweak,$tweak,$tmp
+ vcipherlast $tmp,$out5,$in5 # last block might be needed
+ # in stealing mode
+ le?vperm $in3,$in3,$in3,$leperm
+ lvx_u $in5,$x50,$inp
+ addi $inp,$inp,0x60
+ le?vperm $in4,$in4,$in4,$leperm
+ le?vperm $in5,$in5,$in5,$leperm
+
+ le?vperm $out0,$out0,$out0,$leperm
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk0
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ vxor $out1,$in1,$twk1
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ vxor $out2,$in2,$twk2
+ le?vperm $out4,$out4,$out4,$leperm
+ stvx_u $out3,$x30,$out
+ vxor $out3,$in3,$twk3
+ le?vperm $out5,$tmp,$tmp,$leperm
+ stvx_u $out4,$x40,$out
+ vxor $out4,$in4,$twk4
+ le?stvx_u $out5,$x50,$out
+ be?stvx_u $tmp, $x50,$out
+ vxor $out5,$in5,$twk5
+ addi $out,$out,0x60
+
+ mtctr $rounds
+ beq Loop_xts_enc6x # did $len-=96 borrow?
+
+ addic. $len,$len,0x60
+ beq Lxts_enc6x_zero
+ cmpwi $len,0x20
+ blt Lxts_enc6x_one
+ nop
+ beq Lxts_enc6x_two
+ cmpwi $len,0x40
+ blt Lxts_enc6x_three
+ nop
+ beq Lxts_enc6x_four
+
+Lxts_enc6x_five:
+ vxor $out0,$in1,$twk0
+ vxor $out1,$in2,$twk1
+ vxor $out2,$in3,$twk2
+ vxor $out3,$in4,$twk3
+ vxor $out4,$in5,$twk4
+
+ bl _aesp8_xts_enc5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk5 # unused tweak
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ vxor $tmp,$out4,$twk5 # last block prep for stealing
+ le?vperm $out4,$out4,$out4,$leperm
+ stvx_u $out3,$x30,$out
+ stvx_u $out4,$x40,$out
+ addi $out,$out,0x50
+ bne Lxts_enc6x_steal
+ b Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_four:
+ vxor $out0,$in2,$twk0
+ vxor $out1,$in3,$twk1
+ vxor $out2,$in4,$twk2
+ vxor $out3,$in5,$twk3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_enc5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk4 # unused tweak
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ vxor $tmp,$out3,$twk4 # last block prep for stealing
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ stvx_u $out3,$x30,$out
+ addi $out,$out,0x40
+ bne Lxts_enc6x_steal
+ b Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_three:
+ vxor $out0,$in3,$twk0
+ vxor $out1,$in4,$twk1
+ vxor $out2,$in5,$twk2
+ vxor $out3,$out3,$out3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_enc5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk3 # unused tweak
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $tmp,$out2,$twk3 # last block prep for stealing
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ stvx_u $out2,$x20,$out
+ addi $out,$out,0x30
+ bne Lxts_enc6x_steal
+ b Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_two:
+ vxor $out0,$in4,$twk0
+ vxor $out1,$in5,$twk1
+ vxor $out2,$out2,$out2
+ vxor $out3,$out3,$out3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_enc5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk2 # unused tweak
+ vxor $tmp,$out1,$twk2 # last block prep for stealing
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ stvx_u $out1,$x10,$out
+ addi $out,$out,0x20
+ bne Lxts_enc6x_steal
+ b Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_one:
+ vxor $out0,$in5,$twk0
+ nop
+Loop_xts_enc1x:
+ vcipher $out0,$out0,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vcipher $out0,$out0,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_xts_enc1x
+
+ add $inp,$inp,$taillen
+ cmpwi $taillen,0
+ vcipher $out0,$out0,v24
+
+ subi $inp,$inp,16
+ vcipher $out0,$out0,v25
+
+ lvsr $inpperm,0,$taillen
+ vcipher $out0,$out0,v26
+
+ lvx_u $in0,0,$inp
+ vcipher $out0,$out0,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vcipher $out0,$out0,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ vcipher $out0,$out0,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vxor $twk0,$twk0,v31
+
+ le?vperm $in0,$in0,$in0,$leperm
+ vcipher $out0,$out0,v30
+
+ vperm $in0,$in0,$in0,$inpperm
+ vcipherlast $out0,$out0,$twk0
+
+ vmr $twk0,$twk1 # unused tweak
+ vxor $tmp,$out0,$twk1 # last block prep for stealing
+ le?vperm $out0,$out0,$out0,$leperm
+ stvx_u $out0,$x00,$out # store output
+ addi $out,$out,0x10
+ bne Lxts_enc6x_steal
+ b Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_zero:
+ cmpwi $taillen,0
+ beq Lxts_enc6x_done
+
+ add $inp,$inp,$taillen
+ subi $inp,$inp,16
+ lvx_u $in0,0,$inp
+ lvsr $inpperm,0,$taillen # $in5 is no more
+ le?vperm $in0,$in0,$in0,$leperm
+ vperm $in0,$in0,$in0,$inpperm
+ vxor $tmp,$tmp,$twk0
+Lxts_enc6x_steal:
+ vxor $in0,$in0,$twk0
+ vxor $out0,$out0,$out0
+ vspltisb $out1,-1
+ vperm $out0,$out0,$out1,$inpperm
+ vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember?
+
+ subi r30,$out,17
+ subi $out,$out,16
+ mtctr $taillen
+Loop_xts_enc6x_steal:
+ lbzu r0,1(r30)
+ stb r0,16(r30)
+ bdnz Loop_xts_enc6x_steal
+
+ li $taillen,0
+ mtctr $rounds
+ b Loop_xts_enc1x # one more time...
+
+.align 4
+Lxts_enc6x_done:
+ ${UCMP}i $ivp,0
+ beq Lxts_enc6x_ret
+
+ vxor $tweak,$twk0,$rndkey0
+ le?vperm $tweak,$tweak,$tweak,$leperm
+ stvx_u $tweak,0,$ivp
+
+Lxts_enc6x_ret:
+ mtlr r11
+ li r10,`$FRAME+15`
+ li r11,`$FRAME+31`
+ stvx $seven,r10,$sp # wipe copies of round keys
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,6,6,0
+ .long 0
+
+.align 5
+_aesp8_xts_enc5x:
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz _aesp8_xts_enc5x
+
+ add $inp,$inp,$taillen
+ cmpwi $taillen,0
+ vcipher $out0,$out0,v24
+ vcipher $out1,$out1,v24
+ vcipher $out2,$out2,v24
+ vcipher $out3,$out3,v24
+ vcipher $out4,$out4,v24
+
+ subi $inp,$inp,16
+ vcipher $out0,$out0,v25
+ vcipher $out1,$out1,v25
+ vcipher $out2,$out2,v25
+ vcipher $out3,$out3,v25
+ vcipher $out4,$out4,v25
+ vxor $twk0,$twk0,v31
+
+ vcipher $out0,$out0,v26
+ lvsr $inpperm,0,$taillen # $in5 is no more
+ vcipher $out1,$out1,v26
+ vcipher $out2,$out2,v26
+ vcipher $out3,$out3,v26
+ vcipher $out4,$out4,v26
+ vxor $in1,$twk1,v31
+
+ vcipher $out0,$out0,v27
+ lvx_u $in0,0,$inp
+ vcipher $out1,$out1,v27
+ vcipher $out2,$out2,v27
+ vcipher $out3,$out3,v27
+ vcipher $out4,$out4,v27
+ vxor $in2,$twk2,v31
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vcipher $out0,$out0,v28
+ vcipher $out1,$out1,v28
+ vcipher $out2,$out2,v28
+ vcipher $out3,$out3,v28
+ vcipher $out4,$out4,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+ vxor $in3,$twk3,v31
+
+ vcipher $out0,$out0,v29
+ le?vperm $in0,$in0,$in0,$leperm
+ vcipher $out1,$out1,v29
+ vcipher $out2,$out2,v29
+ vcipher $out3,$out3,v29
+ vcipher $out4,$out4,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vxor $in4,$twk4,v31
+
+ vcipher $out0,$out0,v30
+ vperm $in0,$in0,$in0,$inpperm
+ vcipher $out1,$out1,v30
+ vcipher $out2,$out2,v30
+ vcipher $out3,$out3,v30
+ vcipher $out4,$out4,v30
+
+ vcipherlast $out0,$out0,$twk0
+ vcipherlast $out1,$out1,$in1
+ vcipherlast $out2,$out2,$in2
+ vcipherlast $out3,$out3,$in3
+ vcipherlast $out4,$out4,$in4
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.align 5
+_aesp8_xts_decrypt6x:
+ $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ mflr r11
+ li r7,`$FRAME+8*16+15`
+ li r3,`$FRAME+8*16+31`
+ $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+ stvx v20,r7,$sp # ABI says so
+ addi r7,r7,32
+ stvx v21,r3,$sp
+ addi r3,r3,32
+ stvx v22,r7,$sp
+ addi r7,r7,32
+ stvx v23,r3,$sp
+ addi r3,r3,32
+ stvx v24,r7,$sp
+ addi r7,r7,32
+ stvx v25,r3,$sp
+ addi r3,r3,32
+ stvx v26,r7,$sp
+ addi r7,r7,32
+ stvx v27,r3,$sp
+ addi r3,r3,32
+ stvx v28,r7,$sp
+ addi r7,r7,32
+ stvx v29,r3,$sp
+ addi r3,r3,32
+ stvx v30,r7,$sp
+ stvx v31,r3,$sp
+ li r0,-1
+ stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li $x10,0x10
+ $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ li $x20,0x20
+ $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ li $x30,0x30
+ $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ li $x40,0x40
+ $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ li $x50,0x50
+ $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ li $x60,0x60
+ $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ li $x70,0x70
+ mtspr 256,r0
+
+ subi $rounds,$rounds,3 # -4 in total
+
+ lvx $rndkey0,$x00,$key1 # load key schedule
+ lvx v30,$x10,$key1
+ addi $key1,$key1,0x20
+ lvx v31,$x00,$key1
+ ?vperm $rndkey0,$rndkey0,v30,$keyperm
+ addi $key_,$sp,$FRAME+15
+ mtctr $rounds
+
+Load_xts_dec_key:
+ ?vperm v24,v30,v31,$keyperm
+ lvx v30,$x10,$key1
+ addi $key1,$key1,0x20
+ stvx v24,$x00,$key_ # off-load round[1]
+ ?vperm v25,v31,v30,$keyperm
+ lvx v31,$x00,$key1
+ stvx v25,$x10,$key_ # off-load round[2]
+ addi $key_,$key_,0x20
+ bdnz Load_xts_dec_key
+
+ lvx v26,$x10,$key1
+ ?vperm v24,v30,v31,$keyperm
+ lvx v27,$x20,$key1
+ stvx v24,$x00,$key_ # off-load round[3]
+ ?vperm v25,v31,v26,$keyperm
+ lvx v28,$x30,$key1
+ stvx v25,$x10,$key_ # off-load round[4]
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ ?vperm v26,v26,v27,$keyperm
+ lvx v29,$x40,$key1
+ ?vperm v27,v27,v28,$keyperm
+ lvx v30,$x50,$key1
+ ?vperm v28,v28,v29,$keyperm
+ lvx v31,$x60,$key1
+ ?vperm v29,v29,v30,$keyperm
+ lvx $twk5,$x70,$key1 # borrow $twk5
+ ?vperm v30,v30,v31,$keyperm
+ lvx v24,$x00,$key_ # pre-load round[1]
+ ?vperm v31,v31,$twk5,$keyperm
+ lvx v25,$x10,$key_ # pre-load round[2]
+
+ vperm $in0,$inout,$inptail,$inpperm
+ subi $inp,$inp,31 # undo "caller"
+ vxor $twk0,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vand $tmp,$tmp,$eighty7
+ vxor $out0,$in0,$twk0
+ vxor $tweak,$tweak,$tmp
+
+ lvx_u $in1,$x10,$inp
+ vxor $twk1,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ le?vperm $in1,$in1,$in1,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out1,$in1,$twk1
+ vxor $tweak,$tweak,$tmp
+
+ lvx_u $in2,$x20,$inp
+ andi. $taillen,$len,15
+ vxor $twk2,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ le?vperm $in2,$in2,$in2,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out2,$in2,$twk2
+ vxor $tweak,$tweak,$tmp
+
+ lvx_u $in3,$x30,$inp
+ sub $len,$len,$taillen
+ vxor $twk3,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ le?vperm $in3,$in3,$in3,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out3,$in3,$twk3
+ vxor $tweak,$tweak,$tmp
+
+ lvx_u $in4,$x40,$inp
+ subi $len,$len,0x60
+ vxor $twk4,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ le?vperm $in4,$in4,$in4,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out4,$in4,$twk4
+ vxor $tweak,$tweak,$tmp
+
+ lvx_u $in5,$x50,$inp
+ addi $inp,$inp,0x60
+ vxor $twk5,$tweak,$rndkey0
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ le?vperm $in5,$in5,$in5,$leperm
+ vand $tmp,$tmp,$eighty7
+ vxor $out5,$in5,$twk5
+ vxor $tweak,$tweak,$tmp
+
+ vxor v31,v31,$rndkey0
+ mtctr $rounds
+ b Loop_xts_dec6x
+
+.align 5
+Loop_xts_dec6x:
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_xts_dec6x
+
+ subic $len,$len,96 # $len-=96
+ vxor $in0,$twk0,v31 # xor with last round key
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk0,$tweak,$rndkey0
+ vaddubm $tweak,$tweak,$tweak
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vsldoi $tmp,$tmp,$tmp,15
+ vncipher $out4,$out4,v24
+ vncipher $out5,$out5,v24
+
+ subfe. r0,r0,r0 # borrow?-1:0
+ vand $tmp,$tmp,$eighty7
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vxor $tweak,$tweak,$tmp
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vxor $in1,$twk1,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk1,$tweak,$rndkey0
+ vncipher $out4,$out4,v25
+ vncipher $out5,$out5,v25
+
+ and r0,r0,$len
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vncipher $out0,$out0,v26
+ vncipher $out1,$out1,v26
+ vand $tmp,$tmp,$eighty7
+ vncipher $out2,$out2,v26
+ vncipher $out3,$out3,v26
+ vxor $tweak,$tweak,$tmp
+ vncipher $out4,$out4,v26
+ vncipher $out5,$out5,v26
+
+ add $inp,$inp,r0 # $inp is adjusted in such
+ # way that at exit from the
+ # loop inX-in5 are loaded
+ # with last "words"
+ vxor $in2,$twk2,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk2,$tweak,$rndkey0
+ vaddubm $tweak,$tweak,$tweak
+ vncipher $out0,$out0,v27
+ vncipher $out1,$out1,v27
+ vsldoi $tmp,$tmp,$tmp,15
+ vncipher $out2,$out2,v27
+ vncipher $out3,$out3,v27
+ vand $tmp,$tmp,$eighty7
+ vncipher $out4,$out4,v27
+ vncipher $out5,$out5,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vxor $tweak,$tweak,$tmp
+ vncipher $out0,$out0,v28
+ vncipher $out1,$out1,v28
+ vxor $in3,$twk3,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk3,$tweak,$rndkey0
+ vncipher $out2,$out2,v28
+ vncipher $out3,$out3,v28
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vncipher $out4,$out4,v28
+ vncipher $out5,$out5,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+ vand $tmp,$tmp,$eighty7
+
+ vncipher $out0,$out0,v29
+ vncipher $out1,$out1,v29
+ vxor $tweak,$tweak,$tmp
+ vncipher $out2,$out2,v29
+ vncipher $out3,$out3,v29
+ vxor $in4,$twk4,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk4,$tweak,$rndkey0
+ vncipher $out4,$out4,v29
+ vncipher $out5,$out5,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+
+ vncipher $out0,$out0,v30
+ vncipher $out1,$out1,v30
+ vand $tmp,$tmp,$eighty7
+ vncipher $out2,$out2,v30
+ vncipher $out3,$out3,v30
+ vxor $tweak,$tweak,$tmp
+ vncipher $out4,$out4,v30
+ vncipher $out5,$out5,v30
+ vxor $in5,$twk5,v31
+ vsrab $tmp,$tweak,$seven # next tweak value
+ vxor $twk5,$tweak,$rndkey0
+
+ vncipherlast $out0,$out0,$in0
+ lvx_u $in0,$x00,$inp # load next input block
+ vaddubm $tweak,$tweak,$tweak
+ vsldoi $tmp,$tmp,$tmp,15
+ vncipherlast $out1,$out1,$in1
+ lvx_u $in1,$x10,$inp
+ vncipherlast $out2,$out2,$in2
+ le?vperm $in0,$in0,$in0,$leperm
+ lvx_u $in2,$x20,$inp
+ vand $tmp,$tmp,$eighty7
+ vncipherlast $out3,$out3,$in3
+ le?vperm $in1,$in1,$in1,$leperm
+ lvx_u $in3,$x30,$inp
+ vncipherlast $out4,$out4,$in4
+ le?vperm $in2,$in2,$in2,$leperm
+ lvx_u $in4,$x40,$inp
+ vxor $tweak,$tweak,$tmp
+ vncipherlast $out5,$out5,$in5
+ le?vperm $in3,$in3,$in3,$leperm
+ lvx_u $in5,$x50,$inp
+ addi $inp,$inp,0x60
+ le?vperm $in4,$in4,$in4,$leperm
+ le?vperm $in5,$in5,$in5,$leperm
+
+ le?vperm $out0,$out0,$out0,$leperm
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk0
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ vxor $out1,$in1,$twk1
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ vxor $out2,$in2,$twk2
+ le?vperm $out4,$out4,$out4,$leperm
+ stvx_u $out3,$x30,$out
+ vxor $out3,$in3,$twk3
+ le?vperm $out5,$out5,$out5,$leperm
+ stvx_u $out4,$x40,$out
+ vxor $out4,$in4,$twk4
+ stvx_u $out5,$x50,$out
+ vxor $out5,$in5,$twk5
+ addi $out,$out,0x60
+
+ mtctr $rounds
+ beq Loop_xts_dec6x # did $len-=96 borrow?
+
+ addic. $len,$len,0x60
+ beq Lxts_dec6x_zero
+ cmpwi $len,0x20
+ blt Lxts_dec6x_one
+ nop
+ beq Lxts_dec6x_two
+ cmpwi $len,0x40
+ blt Lxts_dec6x_three
+ nop
+ beq Lxts_dec6x_four
+
+Lxts_dec6x_five:
+ vxor $out0,$in1,$twk0
+ vxor $out1,$in2,$twk1
+ vxor $out2,$in3,$twk2
+ vxor $out3,$in4,$twk3
+ vxor $out4,$in5,$twk4
+
+ bl _aesp8_xts_dec5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk5 # unused tweak
+ vxor $twk1,$tweak,$rndkey0
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk1
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ le?vperm $out4,$out4,$out4,$leperm
+ stvx_u $out3,$x30,$out
+ stvx_u $out4,$x40,$out
+ addi $out,$out,0x50
+ bne Lxts_dec6x_steal
+ b Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_four:
+ vxor $out0,$in2,$twk0
+ vxor $out1,$in3,$twk1
+ vxor $out2,$in4,$twk2
+ vxor $out3,$in5,$twk3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_dec5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk4 # unused tweak
+ vmr $twk1,$twk5
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk5
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ le?vperm $out3,$out3,$out3,$leperm
+ stvx_u $out2,$x20,$out
+ stvx_u $out3,$x30,$out
+ addi $out,$out,0x40
+ bne Lxts_dec6x_steal
+ b Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_three:
+ vxor $out0,$in3,$twk0
+ vxor $out1,$in4,$twk1
+ vxor $out2,$in5,$twk2
+ vxor $out3,$out3,$out3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_dec5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk3 # unused tweak
+ vmr $twk1,$twk4
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk4
+ le?vperm $out2,$out2,$out2,$leperm
+ stvx_u $out1,$x10,$out
+ stvx_u $out2,$x20,$out
+ addi $out,$out,0x30
+ bne Lxts_dec6x_steal
+ b Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_two:
+ vxor $out0,$in4,$twk0
+ vxor $out1,$in5,$twk1
+ vxor $out2,$out2,$out2
+ vxor $out3,$out3,$out3
+ vxor $out4,$out4,$out4
+
+ bl _aesp8_xts_dec5x
+
+ le?vperm $out0,$out0,$out0,$leperm
+ vmr $twk0,$twk2 # unused tweak
+ vmr $twk1,$twk3
+ le?vperm $out1,$out1,$out1,$leperm
+ stvx_u $out0,$x00,$out # store output
+ vxor $out0,$in0,$twk3
+ stvx_u $out1,$x10,$out
+ addi $out,$out,0x20
+ bne Lxts_dec6x_steal
+ b Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_one:
+ vxor $out0,$in5,$twk0
+ nop
+Loop_xts_dec1x:
+ vncipher $out0,$out0,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out0,$out0,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Loop_xts_dec1x
+
+ subi r0,$taillen,1
+ vncipher $out0,$out0,v24
+
+ andi. r0,r0,16
+ cmpwi $taillen,0
+ vncipher $out0,$out0,v25
+
+ sub $inp,$inp,r0
+ vncipher $out0,$out0,v26
+
+ lvx_u $in0,0,$inp
+ vncipher $out0,$out0,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vncipher $out0,$out0,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ vncipher $out0,$out0,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vxor $twk0,$twk0,v31
+
+ le?vperm $in0,$in0,$in0,$leperm
+ vncipher $out0,$out0,v30
+
+ mtctr $rounds
+ vncipherlast $out0,$out0,$twk0
+
+ vmr $twk0,$twk1 # unused tweak
+ vmr $twk1,$twk2
+ le?vperm $out0,$out0,$out0,$leperm
+ stvx_u $out0,$x00,$out # store output
+ addi $out,$out,0x10
+ vxor $out0,$in0,$twk2
+ bne Lxts_dec6x_steal
+ b Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_zero:
+ cmpwi $taillen,0
+ beq Lxts_dec6x_done
+
+ lvx_u $in0,0,$inp
+ le?vperm $in0,$in0,$in0,$leperm
+ vxor $out0,$in0,$twk1
+Lxts_dec6x_steal:
+ vncipher $out0,$out0,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out0,$out0,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz Lxts_dec6x_steal
+
+ add $inp,$inp,$taillen
+ vncipher $out0,$out0,v24
+
+ cmpwi $taillen,0
+ vncipher $out0,$out0,v25
+
+ lvx_u $in0,0,$inp
+ vncipher $out0,$out0,v26
+
+ lvsr $inpperm,0,$taillen # $in5 is no more
+ vncipher $out0,$out0,v27
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vncipher $out0,$out0,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+
+ vncipher $out0,$out0,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vxor $twk1,$twk1,v31
+
+ le?vperm $in0,$in0,$in0,$leperm
+ vncipher $out0,$out0,v30
+
+ vperm $in0,$in0,$in0,$inpperm
+ vncipherlast $tmp,$out0,$twk1
+
+ le?vperm $out0,$tmp,$tmp,$leperm
+ le?stvx_u $out0,0,$out
+ be?stvx_u $tmp,0,$out
+
+ vxor $out0,$out0,$out0
+ vspltisb $out1,-1
+ vperm $out0,$out0,$out1,$inpperm
+ vsel $out0,$in0,$tmp,$out0
+ vxor $out0,$out0,$twk0
+
+ subi r30,$out,1
+ mtctr $taillen
+Loop_xts_dec6x_steal:
+ lbzu r0,1(r30)
+ stb r0,16(r30)
+ bdnz Loop_xts_dec6x_steal
+
+ li $taillen,0
+ mtctr $rounds
+ b Loop_xts_dec1x # one more time...
+
+.align 4
+Lxts_dec6x_done:
+ ${UCMP}i $ivp,0
+ beq Lxts_dec6x_ret
+
+ vxor $tweak,$twk0,$rndkey0
+ le?vperm $tweak,$tweak,$tweak,$leperm
+ stvx_u $tweak,0,$ivp
+
+Lxts_dec6x_ret:
+ mtlr r11
+ li r10,`$FRAME+15`
+ li r11,`$FRAME+31`
+ stvx $seven,r10,$sp # wipe copies of round keys
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+ stvx $seven,r10,$sp
+ addi r10,r10,32
+ stvx $seven,r11,$sp
+ addi r11,r11,32
+
+ mtspr 256,$vrsave
+ lvx v20,r10,$sp # ABI says so
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,6,6,0
+ .long 0
+
+.align 5
+_aesp8_xts_dec5x:
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+ lvx v24,$x20,$key_ # round[3]
+ addi $key_,$key_,0x20
+
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ lvx v25,$x10,$key_ # round[4]
+ bdnz _aesp8_xts_dec5x
+
+ subi r0,$taillen,1
+ vncipher $out0,$out0,v24
+ vncipher $out1,$out1,v24
+ vncipher $out2,$out2,v24
+ vncipher $out3,$out3,v24
+ vncipher $out4,$out4,v24
+
+ andi. r0,r0,16
+ cmpwi $taillen,0
+ vncipher $out0,$out0,v25
+ vncipher $out1,$out1,v25
+ vncipher $out2,$out2,v25
+ vncipher $out3,$out3,v25
+ vncipher $out4,$out4,v25
+ vxor $twk0,$twk0,v31
+
+ sub $inp,$inp,r0
+ vncipher $out0,$out0,v26
+ vncipher $out1,$out1,v26
+ vncipher $out2,$out2,v26
+ vncipher $out3,$out3,v26
+ vncipher $out4,$out4,v26
+ vxor $in1,$twk1,v31
+
+ vncipher $out0,$out0,v27
+ lvx_u $in0,0,$inp
+ vncipher $out1,$out1,v27
+ vncipher $out2,$out2,v27
+ vncipher $out3,$out3,v27
+ vncipher $out4,$out4,v27
+ vxor $in2,$twk2,v31
+
+ addi $key_,$sp,$FRAME+15 # rewind $key_
+ vncipher $out0,$out0,v28
+ vncipher $out1,$out1,v28
+ vncipher $out2,$out2,v28
+ vncipher $out3,$out3,v28
+ vncipher $out4,$out4,v28
+ lvx v24,$x00,$key_ # re-pre-load round[1]
+ vxor $in3,$twk3,v31
+
+ vncipher $out0,$out0,v29
+ le?vperm $in0,$in0,$in0,$leperm
+ vncipher $out1,$out1,v29
+ vncipher $out2,$out2,v29
+ vncipher $out3,$out3,v29
+ vncipher $out4,$out4,v29
+ lvx v25,$x10,$key_ # re-pre-load round[2]
+ vxor $in4,$twk4,v31
+
+ vncipher $out0,$out0,v30
+ vncipher $out1,$out1,v30
+ vncipher $out2,$out2,v30
+ vncipher $out3,$out3,v30
+ vncipher $out4,$out4,v30
+
+ vncipherlast $out0,$out0,$twk0
+ vncipherlast $out1,$out1,$in1
+ vncipherlast $out2,$out2,$in2
+ vncipherlast $out3,$out3,$in3
+ vncipherlast $out4,$out4,$in4
+ mtctr $rounds
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+___
+}} }}}
+
+my $consts=1;
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ # constants table endian-specific conversion
+ if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
+ my $conv=$3;
+ my @bytes=();
+
+ # convert to endian-agnostic format
+ if ($1 eq "long") {
+ foreach (split(/,\s*/,$2)) {
+ my $l = /^0/?oct:int;
+ push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+ }
+ } else {
+ @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
+ }
+
+ # little-endian conversion
+ if ($flavour =~ /le$/o) {
+ SWITCH: for($conv) {
+ /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
+ /\?rev/ && do { @bytes=reverse(@bytes); last; };
+ }
+ }
+
+ #emit
+ print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+ next;
+ }
+ $consts=0 if (m/Lconsts:/o); # end of table
+
+ # instructions prefixed with '?' are endian-specific and need
+ # to be adjusted accordingly...
+ if ($flavour =~ /le$/o) { # little-endian
+ s/le\?//o or
+ s/be\?/#be#/o or
+ s/\?lvsr/lvsl/o or
+ s/\?lvsl/lvsr/o or
+ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+ s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+ s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+ } else { # big-endian
+ s/le\?/#le#/o or
+ s/be\?//o or
+ s/\?([a-z]+)/$1/o;
+ }
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aest4-sparcv9.pl b/openssl-1.1.0h/crypto/aes/asm/aest4-sparcv9.pl
new file mode 100644
index 0000000..bf479c6
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aest4-sparcv9.pl
@@ -0,0 +1,929 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov
+# <appro@openssl.org>. The module is licensed under 2-clause BSD
+# license. October 2012. All rights reserved.
+# ====================================================================
+
+######################################################################
+# AES for SPARC T4.
+#
+# AES round instructions complete in 3 cycles and can be issued every
+# cycle. It means that round calculations should take 4*rounds cycles,
+# because any given round instruction depends on result of *both*
+# previous instructions:
+#
+# |0 |1 |2 |3 |4
+# |01|01|01|
+# |23|23|23|
+# |01|01|...
+# |23|...
+#
+# Provided that fxor [with IV] takes 3 cycles to complete, critical
+# path length for CBC encrypt would be 3+4*rounds, or in other words
+# it should process one byte in at least (3+4*rounds)/16 cycles. This
+# estimate doesn't account for "collateral" instructions, such as
+# fetching input from memory, xor-ing it with zero-round key and
+# storing the result. Yet, *measured* performance [for data aligned
+# at 64-bit boundary!] deviates from this equation by less than 0.5%:
+#
+# 128-bit key 192- 256-
+# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90
+# (*) numbers after slash are for
+# misaligned data.
+#
+# Out-of-order execution logic managed to fully overlap "collateral"
+# instructions with those on critical path. Amazing!
+#
+# As with Intel AES-NI, question is if it's possible to improve
+# performance of parallelizeable modes by interleaving round
+# instructions. Provided round instruction latency and throughput
+# optimal interleave factor is 2. But can we expect 2x performance
+# improvement? Well, as round instructions can be issued one per
+# cycle, they don't saturate the 2-way issue pipeline and therefore
+# there is room for "collateral" calculations... Yet, 2x speed-up
+# over CBC encrypt remains unattaintable:
+#
+# 128-bit key 192- 256-
+# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61
+# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61
+# (*) numbers after slash are for
+# misaligned data.
+#
+# Estimates based on amount of instructions under assumption that
+# round instructions are not pairable with any other instruction
+# suggest that latter is the actual case and pipeline runs
+# underutilized. It should be noted that T4 out-of-order execution
+# logic is so capable that performance gain from 2x interleave is
+# not even impressive, ~7-13% over non-interleaved code, largest
+# for 256-bit keys.
+
+# To anchor to something else, software implementation processes
+# one byte in 29 cycles with 128-bit key on same processor. Intel
+# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
+# in 0.93, naturally with AES-NI.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "sparcv9_modes.pl";
+
+$output = pop;
+open STDOUT,">$output";
+
+$::evp=1; # if $evp is set to 0, script generates module with
+# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
+# points. These however are not fully compatible with openssl/aes.h,
+# because they expect AES_KEY to be aligned at 64-bit boundary. When
+# used through EVP, alignment is arranged at EVP layer. Second thing
+# that is arranged by EVP is at least 32-bit alignment of IV.
+
+######################################################################
+# single-round subroutines
+#
+{
+my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
+
+$code.=<<___;
+#include "sparc_arch.h"
+
+#ifdef __arch64__
+.register %g2,#scratch
+.register %g3,#scratch
+#endif
+
+.text
+
+.globl aes_t4_encrypt
+.align 32
+aes_t4_encrypt:
+ andcc $inp, 7, %g1 ! is input aligned?
+ andn $inp, 7, $inp
+
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+
+ ldx [$inp + 0], %o4
+ bz,pt %icc, 1f
+ ldx [$inp + 8], %o5
+ ldx [$inp + 16], $inp
+ sll %g1, 3, %g1
+ sub %g0, %g1, %o3
+ sllx %o4, %g1, %o4
+ sllx %o5, %g1, %g1
+ srlx %o5, %o3, %o5
+ srlx $inp, %o3, %o3
+ or %o5, %o4, %o4
+ or %o3, %g1, %o5
+1:
+ ld [$key + 240], $rounds
+ ldd [$key + 16], %f12
+ ldd [$key + 24], %f14
+ xor %g4, %o4, %o4
+ xor %g5, %o5, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ srl $rounds, 1, $rounds
+ ldd [$key + 32], %f16
+ sub $rounds, 1, $rounds
+ ldd [$key + 40], %f18
+ add $key, 48, $key
+
+.Lenc:
+ aes_eround01 %f12, %f0, %f2, %f4
+ aes_eround23 %f14, %f0, %f2, %f2
+ ldd [$key + 0], %f12
+ ldd [$key + 8], %f14
+ sub $rounds,1,$rounds
+ aes_eround01 %f16, %f4, %f2, %f0
+ aes_eround23 %f18, %f4, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ brnz,pt $rounds, .Lenc
+ add $key, 32, $key
+
+ andcc $out, 7, $tmp ! is output aligned?
+ aes_eround01 %f12, %f0, %f2, %f4
+ aes_eround23 %f14, %f0, %f2, %f2
+ aes_eround01_l %f16, %f4, %f2, %f0
+ aes_eround23_l %f18, %f4, %f2, %f2
+
+ bnz,pn %icc, 2f
+ nop
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+2: alignaddrl $out, %g0, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $mask, $mask
+ retl
+ stda %f8, [$out + $mask]0xc0 ! partial store
+.type aes_t4_encrypt,#function
+.size aes_t4_encrypt,.-aes_t4_encrypt
+
+.globl aes_t4_decrypt
+.align 32
+aes_t4_decrypt:
+ andcc $inp, 7, %g1 ! is input aligned?
+ andn $inp, 7, $inp
+
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+
+ ldx [$inp + 0], %o4
+ bz,pt %icc, 1f
+ ldx [$inp + 8], %o5
+ ldx [$inp + 16], $inp
+ sll %g1, 3, %g1
+ sub %g0, %g1, %o3
+ sllx %o4, %g1, %o4
+ sllx %o5, %g1, %g1
+ srlx %o5, %o3, %o5
+ srlx $inp, %o3, %o3
+ or %o5, %o4, %o4
+ or %o3, %g1, %o5
+1:
+ ld [$key + 240], $rounds
+ ldd [$key + 16], %f12
+ ldd [$key + 24], %f14
+ xor %g4, %o4, %o4
+ xor %g5, %o5, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ srl $rounds, 1, $rounds
+ ldd [$key + 32], %f16
+ sub $rounds, 1, $rounds
+ ldd [$key + 40], %f18
+ add $key, 48, $key
+
+.Ldec:
+ aes_dround01 %f12, %f0, %f2, %f4
+ aes_dround23 %f14, %f0, %f2, %f2
+ ldd [$key + 0], %f12
+ ldd [$key + 8], %f14
+ sub $rounds,1,$rounds
+ aes_dround01 %f16, %f4, %f2, %f0
+ aes_dround23 %f18, %f4, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ brnz,pt $rounds, .Ldec
+ add $key, 32, $key
+
+ andcc $out, 7, $tmp ! is output aligned?
+ aes_dround01 %f12, %f0, %f2, %f4
+ aes_dround23 %f14, %f0, %f2, %f2
+ aes_dround01_l %f16, %f4, %f2, %f0
+ aes_dround23_l %f18, %f4, %f2, %f2
+
+ bnz,pn %icc, 2f
+ nop
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+2: alignaddrl $out, %g0, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $mask, $mask
+ retl
+ stda %f8, [$out + $mask]0xc0 ! partial store
+.type aes_t4_decrypt,#function
+.size aes_t4_decrypt,.-aes_t4_decrypt
+___
+}
+
+######################################################################
+# key setup subroutines
+#
+{
+my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
+$code.=<<___;
+.globl aes_t4_set_encrypt_key
+.align 32
+aes_t4_set_encrypt_key:
+.Lset_encrypt_key:
+ and $inp, 7, $tmp
+ alignaddr $inp, %g0, $inp
+ cmp $bits, 192
+ ldd [$inp + 0], %f0
+ bl,pt %icc,.L128
+ ldd [$inp + 8], %f2
+
+ be,pt %icc,.L192
+ ldd [$inp + 16], %f4
+ brz,pt $tmp, .L256aligned
+ ldd [$inp + 24], %f6
+
+ ldd [$inp + 32], %f8
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ faligndata %f6, %f8, %f6
+.L256aligned:
+___
+for ($i=0; $i<6; $i++) {
+ $code.=<<___;
+ std %f0, [$out + `32*$i+0`]
+ aes_kexpand1 %f0, %f6, $i, %f0
+ std %f2, [$out + `32*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `32*$i+16`]
+ aes_kexpand0 %f4, %f2, %f4
+ std %f6, [$out + `32*$i+24`]
+ aes_kexpand2 %f6, %f4, %f6
+___
+}
+$code.=<<___;
+ std %f0, [$out + `32*$i+0`]
+ aes_kexpand1 %f0, %f6, $i, %f0
+ std %f2, [$out + `32*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `32*$i+16`]
+ std %f6, [$out + `32*$i+24`]
+ std %f0, [$out + `32*$i+32`]
+ std %f2, [$out + `32*$i+40`]
+
+ mov 14, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+
+.align 16
+.L192:
+ brz,pt $tmp, .L192aligned
+ nop
+
+ ldd [$inp + 24], %f6
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+.L192aligned:
+___
+for ($i=0; $i<7; $i++) {
+ $code.=<<___;
+ std %f0, [$out + `24*$i+0`]
+ aes_kexpand1 %f0, %f4, $i, %f0
+ std %f2, [$out + `24*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `24*$i+16`]
+ aes_kexpand2 %f4, %f2, %f4
+___
+}
+$code.=<<___;
+ std %f0, [$out + `24*$i+0`]
+ aes_kexpand1 %f0, %f4, $i, %f0
+ std %f2, [$out + `24*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `24*$i+16`]
+ std %f0, [$out + `24*$i+24`]
+ std %f2, [$out + `24*$i+32`]
+
+ mov 12, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+
+.align 16
+.L128:
+ brz,pt $tmp, .L128aligned
+ nop
+
+ ldd [$inp + 16], %f4
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+.L128aligned:
+___
+for ($i=0; $i<10; $i++) {
+ $code.=<<___;
+ std %f0, [$out + `16*$i+0`]
+ aes_kexpand1 %f0, %f2, $i, %f0
+ std %f2, [$out + `16*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+___
+}
+$code.=<<___;
+ std %f0, [$out + `16*$i+0`]
+ std %f2, [$out + `16*$i+8`]
+
+ mov 10, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+.type aes_t4_set_encrypt_key,#function
+.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
+
+.globl aes_t4_set_decrypt_key
+.align 32
+aes_t4_set_decrypt_key:
+ mov %o7, %o5
+ call .Lset_encrypt_key
+ nop
+
+ mov %o5, %o7
+ sll $tmp, 4, $inp ! $tmp is number of rounds
+ add $tmp, 2, $tmp
+ add $out, $inp, $inp ! $inp=$out+16*rounds
+ srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
+
+.Lkey_flip:
+ ldd [$out + 0], %f0
+ ldd [$out + 8], %f2
+ ldd [$out + 16], %f4
+ ldd [$out + 24], %f6
+ ldd [$inp + 0], %f8
+ ldd [$inp + 8], %f10
+ ldd [$inp - 16], %f12
+ ldd [$inp - 8], %f14
+ sub $tmp, 1, $tmp
+ std %f0, [$inp + 0]
+ std %f2, [$inp + 8]
+ std %f4, [$inp - 16]
+ std %f6, [$inp - 8]
+ std %f8, [$out + 0]
+ std %f10, [$out + 8]
+ std %f12, [$out + 16]
+ std %f14, [$out + 24]
+ add $out, 32, $out
+ brnz $tmp, .Lkey_flip
+ sub $inp, 32, $inp
+
+ retl
+ xor %o0, %o0, %o0
+.type aes_t4_set_decrypt_key,#function
+.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
+___
+}
+
+{{{
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
+my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
+
+$code.=<<___;
+.align 32
+_aes128_encrypt_1x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_eround01 %f48, %f0, %f2, %f4
+ aes_eround23 %f50, %f0, %f2, %f2
+ aes_eround01_l %f52, %f4, %f2, %f0
+ retl
+ aes_eround23_l %f54, %f4, %f2, %f2
+.type _aes128_encrypt_1x,#function
+.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
+
+.align 32
+_aes128_encrypt_2x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_eround01 %f48, %f0, %f2, %f8
+ aes_eround23 %f50, %f0, %f2, %f2
+ aes_eround01 %f48, %f4, %f6, %f10
+ aes_eround23 %f50, %f4, %f6, %f6
+ aes_eround01_l %f52, %f8, %f2, %f0
+ aes_eround23_l %f54, %f8, %f2, %f2
+ aes_eround01_l %f52, %f10, %f6, %f4
+ retl
+ aes_eround23_l %f54, %f10, %f6, %f6
+.type _aes128_encrypt_2x,#function
+.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
+
+.align 32
+_aes128_loadkey:
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+___
+for ($i=2; $i<22;$i++) { # load key schedule
+ $code.=<<___;
+ ldd [$key + `8*$i`], %f`12+2*$i`
+___
+}
+$code.=<<___;
+ retl
+ nop
+.type _aes128_loadkey,#function
+.size _aes128_loadkey,.-_aes128_loadkey
+_aes128_load_enckey=_aes128_loadkey
+_aes128_load_deckey=_aes128_loadkey
+
+___
+
+&alg_cbc_encrypt_implement("aes",128);
+if ($::evp) {
+ &alg_ctr32_implement("aes",128);
+ &alg_xts_implement("aes",128,"en");
+ &alg_xts_implement("aes",128,"de");
+}
+&alg_cbc_decrypt_implement("aes",128);
+
+$code.=<<___;
+.align 32
+_aes128_decrypt_1x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f48, %f0, %f2, %f4
+ aes_dround23 %f50, %f0, %f2, %f2
+ aes_dround01_l %f52, %f4, %f2, %f0
+ retl
+ aes_dround23_l %f54, %f4, %f2, %f2
+.type _aes128_decrypt_1x,#function
+.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
+
+.align 32
+_aes128_decrypt_2x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f48, %f0, %f2, %f8
+ aes_dround23 %f50, %f0, %f2, %f2
+ aes_dround01 %f48, %f4, %f6, %f10
+ aes_dround23 %f50, %f4, %f6, %f6
+ aes_dround01_l %f52, %f8, %f2, %f0
+ aes_dround23_l %f54, %f8, %f2, %f2
+ aes_dround01_l %f52, %f10, %f6, %f4
+ retl
+ aes_dround23_l %f54, %f10, %f6, %f6
+.type _aes128_decrypt_2x,#function
+.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
+___
+
+$code.=<<___;
+.align 32
+_aes192_encrypt_1x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_eround01 %f56, %f0, %f2, %f4
+ aes_eround23 %f58, %f0, %f2, %f2
+ aes_eround01_l %f60, %f4, %f2, %f0
+ retl
+ aes_eround23_l %f62, %f4, %f2, %f2
+.type _aes192_encrypt_1x,#function
+.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
+
+.align 32
+_aes192_encrypt_2x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_eround01 %f56, %f0, %f2, %f8
+ aes_eround23 %f58, %f0, %f2, %f2
+ aes_eround01 %f56, %f4, %f6, %f10
+ aes_eround23 %f58, %f4, %f6, %f6
+ aes_eround01_l %f60, %f8, %f2, %f0
+ aes_eround23_l %f62, %f8, %f2, %f2
+ aes_eround01_l %f60, %f10, %f6, %f4
+ retl
+ aes_eround23_l %f62, %f10, %f6, %f6
+.type _aes192_encrypt_2x,#function
+.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
+
+.align 32
+_aes256_encrypt_1x:
+ aes_eround01 %f16, %f0, %f2, %f4
+ aes_eround23 %f18, %f0, %f2, %f2
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_eround01 %f20, %f4, %f2, %f0
+ aes_eround23 %f22, %f4, %f2, %f2
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_eround01 %f16, %f0, %f2, %f4
+ aes_eround23 %f18, %f0, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_eround01_l %f20, %f4, %f2, %f0
+ aes_eround23_l %f22, %f4, %f2, %f2
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_encrypt_1x,#function
+.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
+
+.align 32
+_aes256_encrypt_2x:
+ aes_eround01 %f16, %f0, %f2, %f8
+ aes_eround23 %f18, %f0, %f2, %f2
+ aes_eround01 %f16, %f4, %f6, %f10
+ aes_eround23 %f18, %f4, %f6, %f6
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_eround01 %f20, %f8, %f2, %f0
+ aes_eround23 %f22, %f8, %f2, %f2
+ aes_eround01 %f20, %f10, %f6, %f4
+ aes_eround23 %f22, %f10, %f6, %f6
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_eround01 %f16, %f0, %f2, %f8
+ aes_eround23 %f18, %f0, %f2, %f2
+ aes_eround01 %f16, %f4, %f6, %f10
+ aes_eround23 %f18, %f4, %f6, %f6
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_eround01_l %f20, %f8, %f2, %f0
+ aes_eround23_l %f22, %f8, %f2, %f2
+ aes_eround01_l %f20, %f10, %f6, %f4
+ aes_eround23_l %f22, %f10, %f6, %f6
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_encrypt_2x,#function
+.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
+
+.align 32
+_aes192_loadkey:
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+___
+for ($i=2; $i<26;$i++) { # load key schedule
+ $code.=<<___;
+ ldd [$key + `8*$i`], %f`12+2*$i`
+___
+}
+$code.=<<___;
+ retl
+ nop
+.type _aes192_loadkey,#function
+.size _aes192_loadkey,.-_aes192_loadkey
+_aes256_loadkey=_aes192_loadkey
+_aes192_load_enckey=_aes192_loadkey
+_aes192_load_deckey=_aes192_loadkey
+_aes256_load_enckey=_aes192_loadkey
+_aes256_load_deckey=_aes192_loadkey
+___
+
+&alg_cbc_encrypt_implement("aes",256);
+&alg_cbc_encrypt_implement("aes",192);
+if ($::evp) {
+ &alg_ctr32_implement("aes",256);
+ &alg_xts_implement("aes",256,"en");
+ &alg_xts_implement("aes",256,"de");
+ &alg_ctr32_implement("aes",192);
+}
+&alg_cbc_decrypt_implement("aes",192);
+&alg_cbc_decrypt_implement("aes",256);
+
+$code.=<<___;
+.align 32
+_aes256_decrypt_1x:
+ aes_dround01 %f16, %f0, %f2, %f4
+ aes_dround23 %f18, %f0, %f2, %f2
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_dround01 %f20, %f4, %f2, %f0
+ aes_dround23 %f22, %f4, %f2, %f2
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f16, %f0, %f2, %f4
+ aes_dround23 %f18, %f0, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_dround01_l %f20, %f4, %f2, %f0
+ aes_dround23_l %f22, %f4, %f2, %f2
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_decrypt_1x,#function
+.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
+
+.align 32
+_aes256_decrypt_2x:
+ aes_dround01 %f16, %f0, %f2, %f8
+ aes_dround23 %f18, %f0, %f2, %f2
+ aes_dround01 %f16, %f4, %f6, %f10
+ aes_dround23 %f18, %f4, %f6, %f6
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_dround01 %f20, %f8, %f2, %f0
+ aes_dround23 %f22, %f8, %f2, %f2
+ aes_dround01 %f20, %f10, %f6, %f4
+ aes_dround23 %f22, %f10, %f6, %f6
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f16, %f0, %f2, %f8
+ aes_dround23 %f18, %f0, %f2, %f2
+ aes_dround01 %f16, %f4, %f6, %f10
+ aes_dround23 %f18, %f4, %f6, %f6
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_dround01_l %f20, %f8, %f2, %f0
+ aes_dround23_l %f22, %f8, %f2, %f2
+ aes_dround01_l %f20, %f10, %f6, %f4
+ aes_dround23_l %f22, %f10, %f6, %f6
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_decrypt_2x,#function
+.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
+
+.align 32
+_aes192_decrypt_1x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f56, %f0, %f2, %f4
+ aes_dround23 %f58, %f0, %f2, %f2
+ aes_dround01_l %f60, %f4, %f2, %f0
+ retl
+ aes_dround23_l %f62, %f4, %f2, %f2
+.type _aes192_decrypt_1x,#function
+.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
+
+.align 32
+_aes192_decrypt_2x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f56, %f0, %f2, %f8
+ aes_dround23 %f58, %f0, %f2, %f2
+ aes_dround01 %f56, %f4, %f6, %f10
+ aes_dround23 %f58, %f4, %f6, %f6
+ aes_dround01_l %f60, %f8, %f2, %f0
+ aes_dround23_l %f62, %f8, %f2, %f2
+ aes_dround01_l %f60, %f10, %f6, %f4
+ retl
+ aes_dround23_l %f62, %f10, %f6, %f6
+.type _aes192_decrypt_2x,#function
+.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
+___
+}}}
+
+if (!$::evp) {
+$code.=<<___;
+.global AES_encrypt
+AES_encrypt=aes_t4_encrypt
+.global AES_decrypt
+AES_decrypt=aes_t4_decrypt
+.global AES_set_encrypt_key
+.align 32
+AES_set_encrypt_key:
+ andcc %o2, 7, %g0 ! check alignment
+ bnz,a,pn %icc, 1f
+ mov -1, %o0
+ brz,a,pn %o0, 1f
+ mov -1, %o0
+ brz,a,pn %o2, 1f
+ mov -1, %o0
+ andncc %o1, 0x1c0, %g0
+ bnz,a,pn %icc, 1f
+ mov -2, %o0
+ cmp %o1, 128
+ bl,a,pn %icc, 1f
+ mov -2, %o0
+ b aes_t4_set_encrypt_key
+ nop
+1: retl
+ nop
+.type AES_set_encrypt_key,#function
+.size AES_set_encrypt_key,.-AES_set_encrypt_key
+
+.global AES_set_decrypt_key
+.align 32
+AES_set_decrypt_key:
+ andcc %o2, 7, %g0 ! check alignment
+ bnz,a,pn %icc, 1f
+ mov -1, %o0
+ brz,a,pn %o0, 1f
+ mov -1, %o0
+ brz,a,pn %o2, 1f
+ mov -1, %o0
+ andncc %o1, 0x1c0, %g0
+ bnz,a,pn %icc, 1f
+ mov -2, %o0
+ cmp %o1, 128
+ bl,a,pn %icc, 1f
+ mov -2, %o0
+ b aes_t4_set_decrypt_key
+ nop
+1: retl
+ nop
+.type AES_set_decrypt_key,#function
+.size AES_set_decrypt_key,.-AES_set_decrypt_key
+___
+
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
+
+$code.=<<___;
+.globl AES_cbc_encrypt
+.align 32
+AES_cbc_encrypt:
+ ld [$key + 240], %g1
+ nop
+ brz $enc, .Lcbc_decrypt
+ cmp %g1, 12
+
+ bl,pt %icc, aes128_t4_cbc_encrypt
+ nop
+ be,pn %icc, aes192_t4_cbc_encrypt
+ nop
+ ba aes256_t4_cbc_encrypt
+ nop
+
+.Lcbc_decrypt:
+ bl,pt %icc, aes128_t4_cbc_decrypt
+ nop
+ be,pn %icc, aes192_t4_cbc_decrypt
+ nop
+ ba aes256_t4_cbc_decrypt
+ nop
+.type AES_cbc_encrypt,#function
+.size AES_cbc_encrypt,.-AES_cbc_encrypt
+___
+}
+$code.=<<___;
+.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
+.align 4
+___
+
+&emit_assembler();
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/aesv8-armx.pl b/openssl-1.1.0h/crypto/aes/asm/aesv8-armx.pl
new file mode 100755
index 0000000..1782d5b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/aesv8-armx.pl
@@ -0,0 +1,1008 @@
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for ARMv8 AES instructions. The
+# module is endian-agnostic in sense that it supports both big- and
+# little-endian cases. As does it support both 32- and 64-bit modes
+# of operation. Latter is achieved by limiting amount of utilized
+# registers to 16, which implies additional NEON load and integer
+# instructions. This has no effect on mighty Apple A7, where results
+# are literally equal to the theoretical estimates based on AES
+# instruction latencies and issue rates. On Cortex-A53, an in-order
+# execution core, this costs up to 10-15%, which is partially
+# compensated by implementing dedicated code path for 128-bit
+# CBC encrypt case. On Cortex-A57 parallelizable mode performance
+# seems to be limited by sheer amount of NEON instructions...
+#
+# Performance in cycles per byte processed with 128-bit key:
+#
+# CBC enc CBC dec CTR
+# Apple A7 2.39 1.20 1.20
+# Cortex-A53 1.32 1.29 1.46
+# Cortex-A57(*) 1.95 0.85 0.93
+# Denver 1.96 0.86 0.80
+# Mongoose 1.33 1.20 1.20
+#
+# (*) original 3.64/1.34/1.32 results were for r0p0 revision
+# and are still same even for updated module;
+
+$flavour = shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$prefix="aes_v8";
+
+$code=<<___;
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=7
+.text
+___
+$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=<<___ if ($flavour !~ /64/);
+.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
+.fpu neon
+.code 32
+#undef __thumb2__
+___
+
+# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
+# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
+# maintain both 32- and 64-bit codes within single module and
+# transliterate common code to either flavour with regex vodoo.
+#
+{{{
+my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
+my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
+ $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
+
+
+$code.=<<___;
+.align 5
+.Lrcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.globl ${prefix}_set_encrypt_key
+.type ${prefix}_set_encrypt_key,%function
+.align 5
+${prefix}_set_encrypt_key:
+.Lenc_key:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___;
+ mov $ptr,#-1
+ cmp $inp,#0
+ b.eq .Lenc_key_abort
+ cmp $out,#0
+ b.eq .Lenc_key_abort
+ mov $ptr,#-2
+ cmp $bits,#128
+ b.lt .Lenc_key_abort
+ cmp $bits,#256
+ b.gt .Lenc_key_abort
+ tst $bits,#0x3f
+ b.ne .Lenc_key_abort
+
+ adr $ptr,.Lrcon
+ cmp $bits,#192
+
+ veor $zero,$zero,$zero
+ vld1.8 {$in0},[$inp],#16
+ mov $bits,#8 // reuse $bits
+ vld1.32 {$rcon,$mask},[$ptr],#32
+
+ b.lt .Loop128
+ b.eq .L192
+ b .L256
+
+.align 4
+.Loop128:
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+ b.ne .Loop128
+
+ vld1.32 {$rcon},[$ptr]
+
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+
+ vtbl.8 $key,{$in0},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in0},[$out],#16
+ aese $key,$zero
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ veor $in0,$in0,$key
+ vst1.32 {$in0},[$out]
+ add $out,$out,#0x50
+
+ mov $rounds,#10
+ b .Ldone
+
+.align 4
+.L192:
+ vld1.8 {$in1},[$inp],#8
+ vmov.i8 $key,#8 // borrow $key
+ vst1.32 {$in0},[$out],#16
+ vsub.i8 $mask,$mask,$key // adjust the mask
+
+.Loop192:
+ vtbl.8 $key,{$in1},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in1},[$out],#8
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+
+ vdup.32 $tmp,${in0}[3]
+ veor $tmp,$tmp,$in1
+ veor $key,$key,$rcon
+ vext.8 $in1,$zero,$in1,#12
+ vshl.u8 $rcon,$rcon,#1
+ veor $in1,$in1,$tmp
+ veor $in0,$in0,$key
+ veor $in1,$in1,$key
+ vst1.32 {$in0},[$out],#16
+ b.ne .Loop192
+
+ mov $rounds,#12
+ add $out,$out,#0x20
+ b .Ldone
+
+.align 4
+.L256:
+ vld1.8 {$in1},[$inp]
+ mov $bits,#7
+ mov $rounds,#14
+ vst1.32 {$in0},[$out],#16
+
+.Loop256:
+ vtbl.8 $key,{$in1},$mask
+ vext.8 $tmp,$zero,$in0,#12
+ vst1.32 {$in1},[$out],#16
+ aese $key,$zero
+ subs $bits,$bits,#1
+
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in0,$in0,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $key,$key,$rcon
+ veor $in0,$in0,$tmp
+ vshl.u8 $rcon,$rcon,#1
+ veor $in0,$in0,$key
+ vst1.32 {$in0},[$out],#16
+ b.eq .Ldone
+
+ vdup.32 $key,${in0}[3] // just splat
+ vext.8 $tmp,$zero,$in1,#12
+ aese $key,$zero
+
+ veor $in1,$in1,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in1,$in1,$tmp
+ vext.8 $tmp,$zero,$tmp,#12
+ veor $in1,$in1,$tmp
+
+ veor $in1,$in1,$key
+ b .Loop256
+
+.Ldone:
+ str $rounds,[$out]
+ mov $ptr,#0
+
+.Lenc_key_abort:
+ mov x0,$ptr // return value
+ `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
+ ret
+.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+
+.globl ${prefix}_set_decrypt_key
+.type ${prefix}_set_decrypt_key,%function
+.align 5
+${prefix}_set_decrypt_key:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ stmdb sp!,{r4,lr}
+___
+$code.=<<___;
+ bl .Lenc_key
+
+ cmp x0,#0
+ b.ne .Ldec_key_abort
+
+ sub $out,$out,#240 // restore original $out
+ mov x4,#-16
+ add $inp,$out,x12,lsl#4 // end of key schedule
+
+ vld1.32 {v0.16b},[$out]
+ vld1.32 {v1.16b},[$inp]
+ vst1.32 {v0.16b},[$inp],x4
+ vst1.32 {v1.16b},[$out],#16
+
+.Loop_imc:
+ vld1.32 {v0.16b},[$out]
+ vld1.32 {v1.16b},[$inp]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ vst1.32 {v0.16b},[$inp],x4
+ vst1.32 {v1.16b},[$out],#16
+ cmp $inp,$out
+ b.hi .Loop_imc
+
+ vld1.32 {v0.16b},[$out]
+ aesimc v0.16b,v0.16b
+ vst1.32 {v0.16b},[$inp]
+
+ eor x0,x0,x0 // return value
+.Ldec_key_abort:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ ldmia sp!,{r4,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldp x29,x30,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+{{{
+sub gen_block () {
+my $dir = shift;
+my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
+my ($inp,$out,$key)=map("x$_",(0..2));
+my $rounds="w3";
+my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
+
+$code.=<<___;
+.globl ${prefix}_${dir}crypt
+.type ${prefix}_${dir}crypt,%function
+.align 5
+${prefix}_${dir}crypt:
+ ldr $rounds,[$key,#240]
+ vld1.32 {$rndkey0},[$key],#16
+ vld1.8 {$inout},[$inp]
+ sub $rounds,$rounds,#2
+ vld1.32 {$rndkey1},[$key],#16
+
+.Loop_${dir}c:
+ aes$e $inout,$rndkey0
+ aes$mc $inout,$inout
+ vld1.32 {$rndkey0},[$key],#16
+ subs $rounds,$rounds,#2
+ aes$e $inout,$rndkey1
+ aes$mc $inout,$inout
+ vld1.32 {$rndkey1},[$key],#16
+ b.gt .Loop_${dir}c
+
+ aes$e $inout,$rndkey0
+ aes$mc $inout,$inout
+ vld1.32 {$rndkey0},[$key]
+ aes$e $inout,$rndkey1
+ veor $inout,$inout,$rndkey0
+
+ vst1.8 {$inout},[$out]
+ ret
+.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
+my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
+
+### q8-q15 preloaded key schedule
+
+$code.=<<___;
+.globl ${prefix}_cbc_encrypt
+.type ${prefix}_cbc_encrypt,%function
+.align 5
+${prefix}_cbc_encrypt:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r8,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldmia ip,{r4-r5} @ load remaining args
+___
+$code.=<<___;
+ subs $len,$len,#16
+ mov $step,#16
+ b.lo .Lcbc_abort
+ cclr $step,eq
+
+ cmp $enc,#0 // en- or decrypting?
+ ldr $rounds,[$key,#240]
+ and $len,$len,#-16
+ vld1.8 {$ivec},[$ivp]
+ vld1.8 {$dat},[$inp],$step
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#6
+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q10-q11},[$key_],#32
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+
+ add $key_,$key,#32
+ mov $cnt,$rounds
+ b.eq .Lcbc_dec
+
+ cmp $rounds,#2
+ veor $dat,$dat,$ivec
+ veor $rndzero_n_last,q8,$rndlast
+ b.eq .Lcbc_enc128
+
+ vld1.32 {$in0-$in1},[$key_]
+ add $key_,$key,#16
+ add $key4,$key,#16*4
+ add $key5,$key,#16*5
+ aese $dat,q8
+ aesmc $dat,$dat
+ add $key6,$key,#16*6
+ add $key7,$key,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
+.Loop_cbc_enc:
+ aese $dat,q8
+ aesmc $dat,$dat
+ vst1.8 {$ivec},[$out],#16
+.Lenter_cbc_enc:
+ aese $dat,q9
+ aesmc $dat,$dat
+ aese $dat,$in0
+ aesmc $dat,$dat
+ vld1.32 {q8},[$key4]
+ cmp $rounds,#4
+ aese $dat,$in1
+ aesmc $dat,$dat
+ vld1.32 {q9},[$key5]
+ b.eq .Lcbc_enc192
+
+ aese $dat,q8
+ aesmc $dat,$dat
+ vld1.32 {q8},[$key6]
+ aese $dat,q9
+ aesmc $dat,$dat
+ vld1.32 {q9},[$key7]
+ nop
+
+.Lcbc_enc192:
+ aese $dat,q8
+ aesmc $dat,$dat
+ subs $len,$len,#16
+ aese $dat,q9
+ aesmc $dat,$dat
+ cclr $step,eq
+ aese $dat,q10
+ aesmc $dat,$dat
+ aese $dat,q11
+ aesmc $dat,$dat
+ vld1.8 {q8},[$inp],$step
+ aese $dat,q12
+ aesmc $dat,$dat
+ veor q8,q8,$rndzero_n_last
+ aese $dat,q13
+ aesmc $dat,$dat
+ vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
+ aese $dat,q14
+ aesmc $dat,$dat
+ aese $dat,q15
+ veor $ivec,$dat,$rndlast
+ b.hs .Loop_cbc_enc
+
+ vst1.8 {$ivec},[$out],#16
+ b .Lcbc_done
+
+.align 5
+.Lcbc_enc128:
+ vld1.32 {$in0-$in1},[$key_]
+ aese $dat,q8
+ aesmc $dat,$dat
+ b .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+ aese $dat,q8
+ aesmc $dat,$dat
+ vst1.8 {$ivec},[$out],#16
+.Lenter_cbc_enc128:
+ aese $dat,q9
+ aesmc $dat,$dat
+ subs $len,$len,#16
+ aese $dat,$in0
+ aesmc $dat,$dat
+ cclr $step,eq
+ aese $dat,$in1
+ aesmc $dat,$dat
+ aese $dat,q10
+ aesmc $dat,$dat
+ aese $dat,q11
+ aesmc $dat,$dat
+ vld1.8 {q8},[$inp],$step
+ aese $dat,q12
+ aesmc $dat,$dat
+ aese $dat,q13
+ aesmc $dat,$dat
+ aese $dat,q14
+ aesmc $dat,$dat
+ veor q8,q8,$rndzero_n_last
+ aese $dat,q15
+ veor $ivec,$dat,$rndlast
+ b.hs .Loop_cbc_enc128
+
+ vst1.8 {$ivec},[$out],#16
+ b .Lcbc_done
+___
+{
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+$code.=<<___;
+.align 5
+.Lcbc_dec:
+ vld1.8 {$dat2},[$inp],#16
+ subs $len,$len,#32 // bias
+ add $cnt,$rounds,#2
+ vorr $in1,$dat,$dat
+ vorr $dat1,$dat,$dat
+ vorr $in2,$dat2,$dat2
+ b.lo .Lcbc_dec_tail
+
+ vorr $dat1,$dat2,$dat2
+ vld1.8 {$dat2},[$inp],#16
+ vorr $in0,$dat,$dat
+ vorr $in1,$dat1,$dat1
+ vorr $in2,$dat2,$dat2
+
+.Loop3x_cbc_dec:
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop3x_cbc_dec
+
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ veor $tmp0,$ivec,$rndlast
+ subs $len,$len,#0x30
+ veor $tmp1,$in0,$rndlast
+ mov.lo x6,$len // x6, $cnt, is zero at this point
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ veor $tmp2,$in1,$rndlast
+ add $inp,$inp,x6 // $inp is adjusted in such way that
+ // at exit from the loop $dat1-$dat2
+ // are loaded with last "words"
+ vorr $ivec,$in2,$in2
+ mov $key_,$key
+ aesd $dat0,q12
+ aesimc $dat0,$dat0
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ vld1.8 {$in0},[$inp],#16
+ aesd $dat0,q13
+ aesimc $dat0,$dat0
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ vld1.8 {$in1},[$inp],#16
+ aesd $dat0,q14
+ aesimc $dat0,$dat0
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ vld1.8 {$in2},[$inp],#16
+ aesd $dat0,q15
+ aesd $dat1,q15
+ aesd $dat2,q15
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ add $cnt,$rounds,#2
+ veor $tmp0,$tmp0,$dat0
+ veor $tmp1,$tmp1,$dat1
+ veor $dat2,$dat2,$tmp2
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp0},[$out],#16
+ vorr $dat0,$in0,$in0
+ vst1.8 {$tmp1},[$out],#16
+ vorr $dat1,$in1,$in1
+ vst1.8 {$dat2},[$out],#16
+ vorr $dat2,$in2,$in2
+ b.hs .Loop3x_cbc_dec
+
+ cmn $len,#0x30
+ b.eq .Lcbc_done
+ nop
+
+.Lcbc_dec_tail:
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Lcbc_dec_tail
+
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ cmn $len,#0x20
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ veor $tmp1,$ivec,$rndlast
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ veor $tmp2,$in1,$rndlast
+ aesd $dat1,q15
+ aesd $dat2,q15
+ b.eq .Lcbc_dec_one
+ veor $tmp1,$tmp1,$dat1
+ veor $tmp2,$tmp2,$dat2
+ vorr $ivec,$in2,$in2
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$tmp2},[$out],#16
+ b .Lcbc_done
+
+.Lcbc_dec_one:
+ veor $tmp1,$tmp1,$dat2
+ vorr $ivec,$in2,$in2
+ vst1.8 {$tmp1},[$out],#16
+
+.Lcbc_done:
+ vst1.8 {$ivec},[$ivp]
+.Lcbc_abort:
+___
+}
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r8,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
+my ($rounds,$cnt,$key_)=("w5","w6","x7");
+my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
+my $step="x12"; # aliases with $tctr2
+
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat,$tmp)=($dat0,$tmp0);
+
+### q8-q15 preloaded key schedule
+
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r10,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+___
+$code.=<<___;
+ ldr $rounds,[$key,#240]
+
+ ldr $ctr, [$ivp, #12]
+ vld1.32 {$dat0},[$ivp]
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#4
+ mov $step,#16
+ cmp $len,#2
+ add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+ add $key_,$key,#32
+ mov $cnt,$rounds
+ cclr $step,lo
+#ifndef __ARMEB__
+ rev $ctr, $ctr
+#endif
+ vorr $dat1,$dat0,$dat0
+ add $tctr1, $ctr, #1
+ vorr $dat2,$dat0,$dat0
+ add $ctr, $ctr, #2
+ vorr $ivec,$dat0,$dat0
+ rev $tctr1, $tctr1
+ vmov.32 ${dat1}[3],$tctr1
+ b.ls .Lctr32_tail
+ rev $tctr2, $ctr
+ sub $len,$len,#3 // bias
+ vmov.32 ${dat2}[3],$tctr2
+ b .Loop3x_ctr32
+
+.align 4
+.Loop3x_ctr32:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop3x_ctr32
+
+ aese $dat0,q8
+ aesmc $tmp0,$dat0
+ aese $dat1,q8
+ aesmc $tmp1,$dat1
+ vld1.8 {$in0},[$inp],#16
+ vorr $dat0,$ivec,$ivec
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ vld1.8 {$in1},[$inp],#16
+ vorr $dat1,$ivec,$ivec
+ aese $tmp0,q9
+ aesmc $tmp0,$tmp0
+ aese $tmp1,q9
+ aesmc $tmp1,$tmp1
+ vld1.8 {$in2},[$inp],#16
+ mov $key_,$key
+ aese $dat2,q9
+ aesmc $tmp2,$dat2
+ vorr $dat2,$ivec,$ivec
+ add $tctr0,$ctr,#1
+ aese $tmp0,q12
+ aesmc $tmp0,$tmp0
+ aese $tmp1,q12
+ aesmc $tmp1,$tmp1
+ veor $in0,$in0,$rndlast
+ add $tctr1,$ctr,#2
+ aese $tmp2,q12
+ aesmc $tmp2,$tmp2
+ veor $in1,$in1,$rndlast
+ add $ctr,$ctr,#3
+ aese $tmp0,q13
+ aesmc $tmp0,$tmp0
+ aese $tmp1,q13
+ aesmc $tmp1,$tmp1
+ veor $in2,$in2,$rndlast
+ rev $tctr0,$tctr0
+ aese $tmp2,q13
+ aesmc $tmp2,$tmp2
+ vmov.32 ${dat0}[3], $tctr0
+ rev $tctr1,$tctr1
+ aese $tmp0,q14
+ aesmc $tmp0,$tmp0
+ aese $tmp1,q14
+ aesmc $tmp1,$tmp1
+ vmov.32 ${dat1}[3], $tctr1
+ rev $tctr2,$ctr
+ aese $tmp2,q14
+ aesmc $tmp2,$tmp2
+ vmov.32 ${dat2}[3], $tctr2
+ subs $len,$len,#3
+ aese $tmp0,q15
+ aese $tmp1,q15
+ aese $tmp2,q15
+
+ veor $in0,$in0,$tmp0
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ vst1.8 {$in0},[$out],#16
+ veor $in1,$in1,$tmp1
+ mov $cnt,$rounds
+ vst1.8 {$in1},[$out],#16
+ veor $in2,$in2,$tmp2
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$in2},[$out],#16
+ b.hs .Loop3x_ctr32
+
+ adds $len,$len,#3
+ b.eq .Lctr32_done
+ cmp $len,#1
+ mov $step,#16
+ cclr $step,eq
+
+.Lctr32_tail:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ vld1.32 {q9},[$key_],#16
+ b.gt .Lctr32_tail
+
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ vld1.8 {$in0},[$inp],$step
+ aese $dat0,q12
+ aesmc $dat0,$dat0
+ aese $dat1,q12
+ aesmc $dat1,$dat1
+ vld1.8 {$in1},[$inp]
+ aese $dat0,q13
+ aesmc $dat0,$dat0
+ aese $dat1,q13
+ aesmc $dat1,$dat1
+ veor $in0,$in0,$rndlast
+ aese $dat0,q14
+ aesmc $dat0,$dat0
+ aese $dat1,q14
+ aesmc $dat1,$dat1
+ veor $in1,$in1,$rndlast
+ aese $dat0,q15
+ aese $dat1,q15
+
+ cmp $len,#1
+ veor $in0,$in0,$dat0
+ veor $in1,$in1,$dat1
+ vst1.8 {$in0},[$out],#16
+ b.eq .Lctr32_done
+ vst1.8 {$in1},[$out]
+
+.Lctr32_done:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r10,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+$code.=<<___;
+#endif
+___
+########################################
+if ($flavour =~ /64/) { ######## 64-bit code
+ my %opcode = (
+ "aesd" => 0x4e285800, "aese" => 0x4e284800,
+ "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
+
+ local *unaes = sub {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5),
+ $mnemonic,$arg;
+ };
+
+ foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
+ s/@\s/\/\//o; # old->new style commentary
+
+ #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
+ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
+ s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
+ s/vext\.8/ext/o or
+ s/vrev32\.8/rev32/o or
+ s/vtst\.8/cmtst/o or
+ s/vshr/ushr/o or
+ s/^(\s+)v/$1/o or # strip off v prefix
+ s/\bbx\s+lr\b/ret/o;
+
+ # fix up remainig legacy suffixes
+ s/\.[ui]?8//o;
+ m/\],#8/o and s/\.16b/\.8b/go;
+ s/\.[ui]?32//o and s/\.16b/\.4s/go;
+ s/\.[ui]?64//o and s/\.16b/\.2d/go;
+ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+ print $_,"\n";
+ }
+} else { ######## 32-bit code
+ my %opcode = (
+ "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
+ "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
+
+ local *unaes = sub {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<1) |(($2&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ };
+
+ sub unvtbl {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
+ sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
+ "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
+ }
+
+ sub unvdup32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
+
+ sub unvmov32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
+ sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
+ }
+
+ foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/geo;
+
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+ s/\/\/\s?/@ /o; # new->old style commentary
+
+ # fix up remainig new-style suffixes
+ s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
+ s/\],#[0-9]+/]!/o;
+
+ s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/vmov\.32\s+(.*)/unvmov32($1)/geo or
+ s/^(\s+)b\./$1b/o or
+ s/^(\s+)mov\./$1mov/o or
+ s/^(\s+)ret/$1bx\tlr/o;
+
+ print $_,"\n";
+ }
+}
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/bsaes-armv7.pl b/openssl-1.1.0h/crypto/aes/asm/bsaes-armv7.pl
new file mode 100644
index 0000000..7af38af
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/bsaes-armv7.pl
@@ -0,0 +1,2495 @@
+#! /usr/bin/env perl
+# Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+# granted.
+# ====================================================================
+
+# Bit-sliced AES for ARM NEON
+#
+# February 2012.
+#
+# This implementation is direct adaptation of bsaes-x86_64 module for
+# ARM NEON. Except that this module is endian-neutral [in sense that
+# it can be compiled for either endianness] by courtesy of vld1.8's
+# neutrality. Initial version doesn't implement interface to OpenSSL,
+# only low-level primitives and unsupported entry points, just enough
+# to collect performance results, which for Cortex-A8 core are:
+#
+# encrypt 19.5 cycles per byte processed with 128-bit key
+# decrypt 22.1 cycles per byte processed with 128-bit key
+# key conv. 440 cycles per 128-bit key/0.18 of 8x block
+#
+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+# which is [much] worse than anticipated (for further details see
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
+#
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+# manages in 20.0 cycles].
+#
+# When comparing to x86_64 results keep in mind that NEON unit is
+# [mostly] single-issue and thus can't [fully] benefit from
+# instruction-level parallelism. And when comparing to aes-armv4
+# results keep in mind key schedule conversion overhead (see
+# bsaes-x86_64.pl for further details)...
+#
+# <appro@openssl.org>
+
+# April-August 2013
+#
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
+#
+# <ard.biesheuvel@linaro.org>
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
+my @XMM=map("q$_",(0..15));
+
+{
+my ($key,$rounds,$const)=("r4","r5","r6");
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub Sbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InBasisChange (@b);
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[2], @b[2], @b[1]
+ veor @b[5], @b[5], @b[6]
+ veor @b[3], @b[3], @b[0]
+ veor @b[6], @b[6], @b[2]
+ veor @b[5], @b[5], @b[0]
+
+ veor @b[6], @b[6], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[4], @b[4], @b[5]
+
+ veor @b[2], @b[2], @b[7]
+ veor @b[3], @b[3], @b[1]
+ veor @b[1], @b[1], @b[5]
+___
+}
+
+sub OutBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+ veor @b[4], @b[4], @b[6]
+ veor @b[2], @b[2], @b[0]
+ veor @b[6], @b[6], @b[1]
+
+ veor @b[1], @b[1], @b[5]
+ veor @b[5], @b[5], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[2], @b[2], @b[5]
+
+ veor @b[4], @b[4], @b[7]
+___
+}
+
+sub InvSbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InvInBasisChange (@b);
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # OutBasisChange in reverse (with twist)
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+ veor @b[1], @b[1], @b[7]
+ veor @b[4], @b[4], @b[7]
+
+ veor @b[7], @b[7], @b[5]
+ veor @b[1], @b[1], @b[3]
+ veor @b[2], @b[2], @b[5]
+ veor @b[3], @b[3], @b[7]
+
+ veor @b[6], @b[6], @b[1]
+ veor @b[2], @b[2], @b[0]
+ veor @b[5], @b[5], @b[3]
+ veor @b[4], @b[4], @b[6]
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+___
+}
+
+sub InvOutBasisChange { # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+ veor @b[1], @b[1], @b[5]
+ veor @b[2], @b[2], @b[7]
+
+ veor @b[3], @b[3], @b[1]
+ veor @b[4], @b[4], @b[5]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[5], @b[5], @b[0]
+ veor @b[3], @b[3], @b[7]
+ veor @b[6], @b[6], @b[2]
+ veor @b[2], @b[2], @b[1]
+ veor @b[6], @b[6], @b[3]
+
+ veor @b[3], @b[3], @b[0]
+ veor @b[5], @b[5], @b[6]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $t1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $t1, $t0
+ veor $x0, $x0, $t1
+___
+}
+
+sub Mul_GF4_N { # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $x1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $x1, $x0
+ veor $x0, $x0, $t0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+ $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ veor $t1, $y2, $y3
+ vand $t0, $t0, $x0
+ vand $t1, $t1, $x2
+ veor $x0, $x0, $x1
+ veor $x2, $x2, $x3
+ vand $x1, $x1, $y0
+ vand $x3, $x3, $y2
+ vand $x0, $x0, $y1
+ vand $x2, $x2, $y3
+ veor $x1, $x1, $x0
+ veor $x2, $x2, $x3
+ veor $x0, $x0, $t0
+ veor $x3, $x3, $t1
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+ veor @t[0], @x[0], @x[2]
+ veor @t[1], @x[1], @x[3]
+___
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @x[0], @x[0], @t[0]
+ veor @x[2], @x[2], @t[0]
+ veor @x[1], @x[1], @t[1]
+ veor @x[3], @x[3], @t[1]
+
+ veor @t[0], @x[4], @x[6]
+ veor @t[1], @x[5], @x[7]
+___
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @x[4], @x[4], @t[0]
+ veor @x[6], @x[6], @t[0]
+ veor @x[5], @x[5], @t[1]
+ veor @x[7], @x[7], @t[1]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+ veor @t[3], @x[4], @x[6]
+ veor @t[2], @x[5], @x[7]
+ veor @t[1], @x[1], @x[3]
+ veor @s[1], @x[7], @x[6]
+ vmov @t[0], @t[2]
+ veor @s[0], @x[0], @x[2]
+
+ vorr @t[2], @t[2], @t[1]
+ veor @s[3], @t[3], @t[0]
+ vand @s[2], @t[3], @s[0]
+ vorr @t[3], @t[3], @s[0]
+ veor @s[0], @s[0], @t[1]
+ vand @t[0], @t[0], @t[1]
+ veor @t[1], @x[3], @x[2]
+ vand @s[3], @s[3], @s[0]
+ vand @s[1], @s[1], @t[1]
+ veor @t[1], @x[4], @x[5]
+ veor @s[0], @x[1], @x[0]
+ veor @t[3], @t[3], @s[1]
+ veor @t[2], @t[2], @s[1]
+ vand @s[1], @t[1], @s[0]
+ vorr @t[1], @t[1], @s[0]
+ veor @t[3], @t[3], @s[3]
+ veor @t[0], @t[0], @s[1]
+ veor @t[2], @t[2], @s[2]
+ veor @t[1], @t[1], @s[3]
+ veor @t[0], @t[0], @s[2]
+ vand @s[0], @x[7], @x[3]
+ veor @t[1], @t[1], @s[2]
+ vand @s[1], @x[6], @x[2]
+ vand @s[2], @x[5], @x[1]
+ vorr @s[3], @x[4], @x[0]
+ veor @t[3], @t[3], @s[0]
+ veor @t[1], @t[1], @s[2]
+ veor @t[0], @t[0], @s[3]
+ veor @t[2], @t[2], @s[1]
+
+ @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ @ new smaller inversion
+
+ vand @s[2], @t[3], @t[1]
+ vmov @s[0], @t[0]
+
+ veor @s[1], @t[2], @s[2]
+ veor @s[3], @t[0], @s[2]
+ veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
+
+ vbsl @s[1], @t[1], @t[0]
+ vbsl @s[3], @t[3], @t[2]
+ veor @t[3], @t[3], @t[2]
+
+ vbsl @s[0], @s[1], @s[2]
+ vbsl @t[0], @s[2], @s[1]
+
+ vand @s[2], @s[0], @s[3]
+ veor @t[1], @t[1], @t[0]
+
+ veor @s[2], @s[2], @t[3]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my @t=@_[8..11];
+my $mask=pop;
+$code.=<<___;
+ vldmia $key!, {@t[0]-@t[3]}
+ veor @t[0], @t[0], @x[0]
+ veor @t[1], @t[1], @x[1]
+ vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
+ vldmia $key!, {@t[0]}
+ veor @t[2], @t[2], @x[2]
+ vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
+ vldmia $key!, {@t[1]}
+ veor @t[3], @t[3], @x[3]
+ vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
+ vldmia $key!, {@t[2]}
+ vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
+ vldmia $key!, {@t[3]}
+ veor @t[0], @t[0], @x[4]
+ veor @t[1], @t[1], @x[5]
+ vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
+ veor @t[2], @t[2], @x[6]
+ vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
+ veor @t[3], @t[3], @x[7]
+ vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
+ vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16]; # optional
+$code.=<<___;
+ vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
+ vext.8 @t[2], @x[2], @x[2], #12
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[2], @x[2], @t[2]
+ vext.8 @t[4], @x[4], @x[4], #12
+ veor @x[3], @x[3], @t[3]
+ vext.8 @t[5], @x[5], @x[5], #12
+ veor @x[4], @x[4], @t[4]
+ vext.8 @t[6], @x[6], @x[6], #12
+ veor @x[5], @x[5], @t[5]
+ vext.8 @t[7], @x[7], @x[7], #12
+ veor @x[6], @x[6], @t[6]
+
+ veor @t[1], @t[1], @x[0]
+ veor @x[7], @x[7], @t[7]
+ vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor @t[2], @t[2], @x[1]
+ veor @t[0], @t[0], @x[7]
+ veor @t[1], @t[1], @x[7]
+ vext.8 @x[1], @x[1], @x[1], #8
+ veor @t[5], @t[5], @x[4]
+ veor @x[0], @x[0], @t[0]
+ veor @t[6], @t[6], @x[5]
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[0], @x[4], @x[4], #8
+ veor @t[4], @t[4], @x[3]
+ vext.8 @t[1], @x[5], @x[5], #8
+ veor @t[7], @t[7], @x[6]
+ vext.8 @x[4], @x[3], @x[3], #8
+ veor @t[3], @t[3], @x[2]
+ vext.8 @x[5], @x[7], @x[7], #8
+ veor @t[4], @t[4], @x[7]
+ vext.8 @x[3], @x[6], @x[6], #8
+ veor @t[3], @t[3], @x[7]
+ vext.8 @x[6], @x[2], @x[2], #8
+ veor @x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
+ veor @x[2], @t[0], @t[4]
+ veor @x[4], @x[4], @t[3]
+ veor @x[5], @x[5], @t[7]
+ veor @x[3], @x[3], @t[6]
+ @ vmov @x[2], @t[0]
+ veor @x[6], @x[6], @t[2]
+ @ vmov @x[7], @t[1]
+___
+$code.=<<___ if ($inv);
+ veor @t[3], @t[3], @x[4]
+ veor @x[5], @x[5], @t[7]
+ veor @x[2], @x[3], @t[6]
+ veor @x[3], @t[0], @t[4]
+ veor @x[4], @x[6], @t[2]
+ vmov @x[6], @t[3]
+ @ vmov @x[7], @t[1]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+ @ multiplication by 0x0e
+ vext.8 @t[7], @x[7], @x[7], #12
+ vmov @t[2], @x[2]
+ veor @x[2], @x[2], @x[5] @ 2 5
+ veor @x[7], @x[7], @x[5] @ 7 5
+ vext.8 @t[0], @x[0], @x[0], #12
+ vmov @t[5], @x[5]
+ veor @x[5], @x[5], @x[0] @ 5 0 [1]
+ veor @x[0], @x[0], @x[1] @ 0 1
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[1], @x[1], @x[2] @ 1 25
+ veor @x[0], @x[0], @x[6] @ 01 6 [2]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[1], @x[1], @x[3] @ 125 3 [4]
+ veor @x[2], @x[2], @x[0] @ 25 016 [3]
+ veor @x[3], @x[3], @x[7] @ 3 75
+ veor @x[7], @x[7], @x[6] @ 75 6 [0]
+ vext.8 @t[6], @x[6], @x[6], #12
+ vmov @t[4], @x[4]
+ veor @x[6], @x[6], @x[4] @ 6 4
+ veor @x[4], @x[4], @x[3] @ 4 375 [6]
+ veor @x[3], @x[3], @x[7] @ 375 756=36
+ veor @x[6], @x[6], @t[5] @ 64 5 [7]
+ veor @x[3], @x[3], @t[2] @ 36 2
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @x[3], @x[3], @t[4] @ 362 4 [5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+ @ multiplication by 0x0b
+ veor @y[1], @y[1], @y[0]
+ veor @y[0], @y[0], @t[0]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[1], @y[1], @t[6]
+ veor @y[0], @y[0], @t[7]
+ veor @t[7], @t[7], @t[6] @ clobber t[7]
+
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @y[0]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[2], @y[2], @t[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+ veor @y[2], @y[2], @t[2]
+ veor @y[3], @y[3], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[7]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[3], @y[3], @t[3]
+ veor @y[6], @y[6], @t[3]
+ veor @y[4], @y[4], @t[3]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[3], @t[3], @t[3], #12
+ veor @y[5], @y[5], @t[4]
+ veor @y[7], @y[7], @t[7]
+ veor @t[7], @t[7], @t[5] @ clobber t[7] even more
+ veor @y[3], @y[3], @t[5]
+ veor @y[4], @y[4], @t[4]
+
+ veor @y[5], @y[5], @t[7]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[6], @y[6], @t[7]
+ veor @y[4], @y[4], @t[7]
+
+ veor @t[7], @t[7], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+
+ @ multiplication by 0x0d
+ veor @y[4], @y[4], @y[7]
+ veor @t[7], @t[7], @t[6] @ restore t[7]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @y[2], @y[2], @t[0]
+ veor @y[7], @y[7], @t[5]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @y[2], @y[2], @t[2]
+
+ veor @y[3], @y[3], @y[1]
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[0]
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @t[5]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[1], @y[1], @t[7]
+ veor @y[0], @y[0], @t[6]
+ veor @y[3], @y[3], @y[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+
+ veor @y[7], @y[7], @t[7]
+ veor @y[4], @y[4], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[6]
+ veor @t[6], @t[6], @t[3] @ clobber t[6]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[4], @y[4], @y[7]
+ veor @y[3], @y[3], @t[6]
+
+ veor @y[6], @y[6], @t[6]
+ veor @y[5], @y[5], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @y[6], @y[6], @t[4]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[5], @y[5], @t[6]
+ veor @y[6], @y[6], @t[7]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @t[6], @t[6], @t[3] @ restore t[6]
+ vext.8 @t[3], @t[3], @t[3], #12
+
+ @ multiplication by 0x09
+ veor @y[4], @y[4], @y[1]
+ veor @t[1], @t[1], @y[1] @ t[1]=y[1]
+ veor @t[0], @t[0], @t[5] @ clobber t[0]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @t[1], @t[1], @t[5]
+ veor @y[3], @y[3], @t[0]
+ veor @t[0], @t[0], @y[0] @ t[0]=y[0]
+ veor @t[1], @t[1], @t[6]
+ veor @t[6], @t[6], @t[7] @ clobber t[6]
+ veor @y[4], @y[4], @t[1]
+ veor @y[7], @y[7], @t[4]
+ veor @y[6], @y[6], @t[3]
+ veor @y[5], @y[5], @t[2]
+ veor @t[4], @t[4], @y[4] @ t[4]=y[4]
+ veor @t[3], @t[3], @y[3] @ t[3]=y[3]
+ veor @t[5], @t[5], @y[5] @ t[5]=y[5]
+ veor @t[2], @t[2], @y[2] @ t[2]=y[2]
+ veor @t[3], @t[3], @t[7]
+ veor @XMM[5], @t[5], @t[6]
+ veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
+ veor @XMM[2], @t[2], @t[6]
+ veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
+
+ vmov @XMM[0], @t[0]
+ vmov @XMM[1], @t[1]
+ @ vmov @XMM[2], @t[2]
+ vmov @XMM[3], @t[3]
+ vmov @XMM[4], @t[4]
+ @ vmov @XMM[5], @t[5]
+ @ vmov @XMM[6], @t[6]
+ @ vmov @XMM[7], @t[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
+$code.=<<___;
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 @t[0], @x[0], @x[0], #8
+ vext.8 @t[6], @x[6], @x[6], #8
+ vext.8 @t[7], @x[7], @x[7], #8
+ veor @t[0], @t[0], @x[0]
+ vext.8 @t[1], @x[1], @x[1], #8
+ veor @t[6], @t[6], @x[6]
+ vext.8 @t[2], @x[2], @x[2], #8
+ veor @t[7], @t[7], @x[7]
+ vext.8 @t[3], @x[3], @x[3], #8
+ veor @t[1], @t[1], @x[1]
+ vext.8 @t[4], @x[4], @x[4], #8
+ veor @t[2], @t[2], @x[2]
+ vext.8 @t[5], @x[5], @x[5], #8
+ veor @t[3], @t[3], @x[3]
+ veor @t[4], @t[4], @x[4]
+ veor @t[5], @t[5], @x[5]
+
+ veor @x[0], @x[0], @t[6]
+ veor @x[1], @x[1], @t[6]
+ veor @x[2], @x[2], @t[0]
+ veor @x[4], @x[4], @t[2]
+ veor @x[3], @x[3], @t[1]
+ veor @x[1], @x[1], @t[7]
+ veor @x[2], @x[2], @t[7]
+ veor @x[4], @x[4], @t[6]
+ veor @x[5], @x[5], @t[3]
+ veor @x[3], @x[3], @t[6]
+ veor @x[6], @x[6], @t[4]
+ veor @x[4], @x[4], @t[7]
+ veor @x[5], @x[5], @t[7]
+ veor @x[7], @x[7], @t[5]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+ vshr.u64 $t, $b, #$n
+ veor $t, $t, $a
+ vand $t, $t, $mask
+ veor $a, $a, $t
+ vshl.u64 $t, $t, #$n
+ veor $b, $b, $t
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+ vshr.u64 $t0, $b0, #$n
+ vshr.u64 $t1, $b1, #$n
+ veor $t0, $t0, $a0
+ veor $t1, $t1, $a1
+ vand $t0, $t0, $mask
+ vand $t1, $t1, $mask
+ veor $a0, $a0, $t0
+ vshl.u64 $t0, $t0, #$n
+ veor $a1, $a1, $t1
+ vshl.u64 $t1, $t1, #$n
+ veor $b0, $b0, $t0
+ veor $b1, $b1, $t1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+ vmov.i8 $t0,#0x55 @ compose .LBS0
+ vmov.i8 $t1,#0x33 @ compose .LBS1
+___
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+ vmov.i8 $t0,#0x0f @ compose .LBS2
+___
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#if defined(__thumb2__) && !defined(__APPLE__)
+.thumb
+#else
+.code 32
+# undef __thumb2__
+#endif
+
+.type _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+ adr $const,.
+ vldmia $key!, {@XMM[9]} @ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr $const,.LM0ISR
+#else
+ add $const,$const,#.LM0ISR-_bsaes_decrypt8
+#endif
+
+ vldmia $const!, {@XMM[8]} @ .LM0ISR
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Ldec_sbox:\n";
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq $const,$const,#0x10
+ bne .Ldec_loop
+ vldmia $const, {@XMM[12]} @ .LISRM0
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR: @ InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR: @ ShiftRows constants
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+ adr $const,.
+ vldmia $key!, {@XMM[9]} @ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr $const,.LM0SR
+#else
+ sub $const,$const,#_bsaes_encrypt8-.LM0SR
+#endif
+
+ vldmia $const!, {@XMM[8]} @ .LM0SR
+_bsaes_encrypt8_alt:
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Lenc_sbox:\n";
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LSR
+ ite eq @ Thumb2 thing, samity check in ARM
+ addeq $const,$const,#0x10
+ bne .Lenc_loop
+ vldmia $const, {@XMM[12]} @ .LSRM0
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+ @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
+ vmov @x[2], @x[0]
+ vmov @x[3], @x[1]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+ @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ vmov @x[4], @x[0]
+ vmov @x[6], @x[2]
+ vmov @x[5], @x[1]
+ vmov @x[7], @x[3]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+ adr $const,.
+ vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr $const,.LM0
+#else
+ sub $const,$const,#_bsaes_key_convert-.LM0
+#endif
+ vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
+
+ vmov.i8 @XMM[8], #0x01 @ bit masks
+ vmov.i8 @XMM[9], #0x02
+ vmov.i8 @XMM[10], #0x04
+ vmov.i8 @XMM[11], #0x08
+ vmov.i8 @XMM[12], #0x10
+ vmov.i8 @XMM[13], #0x20
+ vldmia $const, {@XMM[14]} @ .LM0
+
+#ifdef __ARMEL__
+ vrev32.8 @XMM[7], @XMM[7]
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ sub $rounds,$rounds,#1
+ vstmia $out!, {@XMM[7]} @ save round 0 key
+ b .Lkey_loop
+
+.align 4
+.Lkey_loop:
+ vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
+ vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
+ vmov.i8 @XMM[6], #0x40
+ vmov.i8 @XMM[15], #0x80
+
+ vtst.8 @XMM[0], @XMM[7], @XMM[8]
+ vtst.8 @XMM[1], @XMM[7], @XMM[9]
+ vtst.8 @XMM[2], @XMM[7], @XMM[10]
+ vtst.8 @XMM[3], @XMM[7], @XMM[11]
+ vtst.8 @XMM[4], @XMM[7], @XMM[12]
+ vtst.8 @XMM[5], @XMM[7], @XMM[13]
+ vtst.8 @XMM[6], @XMM[7], @XMM[6]
+ vtst.8 @XMM[7], @XMM[7], @XMM[15]
+ vld1.8 {@XMM[15]}, [$inp]! @ load next round key
+ vmvn @XMM[0], @XMM[0] @ "pnot"
+ vmvn @XMM[1], @XMM[1]
+ vmvn @XMM[5], @XMM[5]
+ vmvn @XMM[6], @XMM[6]
+#ifdef __ARMEL__
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ subs $rounds,$rounds,#1
+ vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
+ bne .Lkey_loop
+
+ vmov.i8 @XMM[7],#0x63 @ compose .L63
+ @ don't save last round key
+ bx lr
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0) { # following four functions are unsupported interface
+ # used for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,%function
+.align 4
+bsaes_enc_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,%function
+.align 4
+bsaes_encrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Lenc128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_encrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Lenc128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,%function
+.align 4
+bsaes_dec_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ vldmia $out, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $out, {@XMM[7]}
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,%function
+.align 4
+bsaes_decrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Ldec128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_decrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Ldec128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
+my ($keysched)=("sp");
+
+$code.=<<___;
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef __KERNEL__
+ cmp $len, #128
+#ifndef __thumb__
+ blo AES_cbc_encrypt
+#else
+ bhs 1f
+ b AES_cbc_encrypt
+1:
+#endif
+#endif
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ivp, [ip] @ IV is 1st arg on the stack
+ mov $len, $len, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ sifze of bit-slices key schedule
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ vldmia $keysched, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $keysched, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0:
+#endif
+
+ vld1.8 {@XMM[15]}, [$ivp] @ load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs $len, $len, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5, $rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
+ sub $inp, $inp, #0x60
+ vstmia $fp, {@XMM[15]} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ veor @XMM[5], @XMM[5], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[5]}, [$out]!
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds $len, $len, #8
+ beq .Lcbc_dec_done
+
+ vld1.8 {@XMM[0]}, [$inp]! @ load input
+ cmp $len, #2
+ blo .Lcbc_dec_one
+ vld1.8 {@XMM[1]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ mov r5, $rounds
+ vstmia $fp, {@XMM[15]} @ put aside IV
+ beq .Lcbc_dec_two
+ vld1.8 {@XMM[2]}, [$inp]!
+ cmp $len, #4
+ blo .Lcbc_dec_three
+ vld1.8 {@XMM[3]}, [$inp]!
+ beq .Lcbc_dec_four
+ vld1.8 {@XMM[4]}, [$inp]!
+ cmp $len, #6
+ blo .Lcbc_dec_five
+ vld1.8 {@XMM[5]}, [$inp]!
+ beq .Lcbc_dec_six
+ vld1.8 {@XMM[6]}, [$inp]!
+ sub $inp, $inp, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub $inp, $inp, #0x60
+ bl _bsaes_decrypt8
+ vldmia $fp,{@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub $inp, $inp, #0x50
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub $inp, $inp, #0x40
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub $inp, $inp, #0x30
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub $inp, $inp, #0x20
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]! @ reload input
+ veor @XMM[1], @XMM[1], @XMM[8]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub $inp, $inp, #0x10
+ mov $rounds, $out @ save original out pointer
+ mov $out, $fp @ use the iv scratch space as out buffer
+ mov r2, $key
+ vmov @XMM[4],@XMM[15] @ just in case ensure that IV
+ vmov @XMM[5],@XMM[0] @ and input are preserved
+ bl AES_decrypt
+ vld1.8 {@XMM[0]}, [$fp] @ load result
+ veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
+ vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
+ vst1.8 {@XMM[0]}, [$rounds] @ write output
+
+.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+.Lcbc_dec_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lcbc_dec_bzero
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ vst1.8 {@XMM[15]}, [$ivp] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc}
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+___
+}
+{
+my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
+my $const = "r6"; # shared with _bsaes_encrypt8_alt
+my $keysched = "sp";
+
+$code.=<<___;
+.extern AES_encrypt
+.global bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+ cmp $len, #8 @ use plain AES for
+ blo .Lctr_enc_short @ small sizes
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ctr, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+#ifdef __APPLE__
+ mov $ctr, #:lower16:(.LREVM0SR-.LM0)
+ add $ctr, $const, $ctr
+#else
+ add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
+#endif
+ vldmia $keysched, {@XMM[4]} @ load round0 key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+.align 2
+0: add r12, $key, #248
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ adrl $ctr, .LREVM0SR @ borrow $ctr
+ vldmia r12, {@XMM[4]} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 @XMM[8],#1 @ compose 1<<96
+ veor @XMM[9],@XMM[9],@XMM[9]
+ vrev32.8 @XMM[0],@XMM[0]
+ vext.8 @XMM[8],@XMM[9],@XMM[8],#4
+ vrev32.8 @XMM[4],@XMM[4]
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
+ vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
+ vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
+ vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
+ vadd.u32 @XMM[4], @XMM[1], @XMM[10]
+ vadd.u32 @XMM[5], @XMM[2], @XMM[10]
+ vadd.u32 @XMM[6], @XMM[3], @XMM[10]
+ vadd.u32 @XMM[7], @XMM[4], @XMM[10]
+ vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia $keysched, {@XMM[9]} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, $keysched, #0x10 @ pass next round key
+#else
+ add r4, $key, #`248+16`
+#endif
+ vldmia $ctr, {@XMM[8]} @ .LREVM0SR
+ mov r5, $rounds @ pass rounds
+ vstmia $fp, {@XMM[10]} @ save next counter
+#ifdef __APPLE__
+ mov $const, #:lower16:(.LREVM0SR-.LSR)
+ sub $const, $ctr, $const
+#else
+ sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
+#endif
+
+ bl _bsaes_encrypt8_alt
+
+ subs $len, $len, #8
+ blo .Lctr_enc_loop_done
+
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ veor @XMM[6], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[7], @XMM[13]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ veor @XMM[5], @XMM[15]
+ vst1.8 {@XMM[6]}, [$out]!
+ vmov.i32 @XMM[8], #1 @ compose 1<<96
+ vst1.8 {@XMM[3]}, [$out]!
+ veor @XMM[9], @XMM[9], @XMM[9]
+ vst1.8 {@XMM[7]}, [$out]!
+ vext.8 @XMM[8], @XMM[9], @XMM[8], #4
+ vst1.8 {@XMM[2]}, [$out]!
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vst1.8 {@XMM[5]}, [$out]!
+ vldmia $fp, {@XMM[0]} @ load counter
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add $len, $len, #8
+ vld1.8 {@XMM[8]}, [$inp]! @ load input
+ veor @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]! @ write output
+ cmp $len, #2
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[9]}, [$inp]!
+ veor @XMM[1], @XMM[9]
+ vst1.8 {@XMM[1]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ vst1.8 {@XMM[4]}, [$out]!
+ cmp $len, #4
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[11]}, [$inp]!
+ veor @XMM[6], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[3]}, [$out]!
+ cmp $len, #6
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[13]}, [$inp]!
+ veor @XMM[7], @XMM[13]
+ vst1.8 {@XMM[7]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[14]}, [$inp]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[2]}, [$out]!
+
+.Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lctr_enc_bzero
+#else
+ vstmia $keysched, {q0-q1}
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.align 4
+.Lctr_enc_short:
+ ldr ip, [sp] @ ctr pointer is passed on stack
+ stmdb sp!, {r4-r8, lr}
+
+ mov r4, $inp @ copy arguments
+ mov r5, $out
+ mov r6, $len
+ mov r7, $key
+ ldr r8, [ip, #12] @ load counter LSW
+ vld1.8 {@XMM[1]}, [ip] @ load whole counter value
+#ifdef __ARMEL__
+ rev r8, r8
+#endif
+ sub sp, sp, #0x10
+ vst1.8 {@XMM[1]}, [sp] @ copy counter value
+ sub sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+ add r0, sp, #0x10 @ input counter value
+ mov r1, sp @ output on the stack
+ mov r2, r7 @ key
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [r4]! @ load input
+ vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
+ add r8, r8, #1
+#ifdef __ARMEL__
+ rev r0, r8
+ str r0, [sp, #0x1c] @ next counter value
+#else
+ str r8, [sp, #0x1c] @ next counter value
+#endif
+ veor @XMM[0],@XMM[0],@XMM[1]
+ vst1.8 {@XMM[0]}, [r5]! @ store output
+ subs r6, r6, #1
+ bne .Lctr_enc_short_loop
+
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vstmia sp!, {q0-q1}
+
+ ldmia sp!, {r4-r8, pc}
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+}
+{
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
+my $const="r6"; # returned by _bsaes_key_convert
+my $twmask=@XMM[5];
+my @T=@XMM[6..7];
+
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0,sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+ subs $len, #0x80
+ blo .Lxts_enc_short
+ b .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_enc_loop
+
+.Lxts_enc_short:
+ adds $len, #0x70
+ bmi .Lxts_enc_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_enc_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+ .quad 1, 0x87
+
+.align 5
+.Lxts_enc_5:
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_enc_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_enc_ret
+ sub r6, $out, #0x10
+
+.Lxts_enc_steal:
+ ldrb r0, [$inp], #1
+ ldrb r1, [$out, #-0x10]
+ strb r0, [$out, #-0x10]
+ strb r1, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_enc_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_enc_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_enc_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0, sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, sp, #0x90
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+#ifndef XTS_CHAIN_TWEAK
+ tst $len, #0xf @ if not multiple of 16
+ it ne @ Thumb2 thing, sanity check in ARM
+ subne $len, #0x10 @ subtract another 16 bytes
+#endif
+ subs $len, #0x80
+
+ blo .Lxts_dec_short
+ b .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_dec_loop
+
+.Lxts_dec_short:
+ adds $len, #0x70
+ bmi .Lxts_dec_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_dec_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
+
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r5, $magic @ preserve magic
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+ mov $magic, r5
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_dec_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_dec_ret
+
+ @ calculate one round of extra tweak for the stolen ciphertext
+ vldmia $magic, {$twmask}
+ vshr.s64 @XMM[6], @XMM[8], #63
+ vand @XMM[6], @XMM[6], $twmask
+ vadd.u64 @XMM[9], @XMM[8], @XMM[8]
+ vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
+ veor @XMM[9], @XMM[9], @XMM[6]
+
+ @ perform the final decryption with the last tweak value
+ vld1.8 {@XMM[0]}, [$inp]!
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[9]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[9]
+ vst1.8 {@XMM[0]}, [$out]
+
+ mov r6, $out
+.Lxts_dec_steal:
+ ldrb r1, [$out]
+ ldrb r0, [$inp], #1
+ strb r1, [$out, #0x10]
+ strb r0, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_dec_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_dec_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_dec_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+print $code;
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/bsaes-x86_64.pl b/openssl-1.1.0h/crypto/aes/asm/bsaes-x86_64.pl
new file mode 100644
index 0000000..921d870
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/bsaes-x86_64.pl
@@ -0,0 +1,3111 @@
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+###################################################################
+### AES-128 [originally in CTR mode] ###
+### bitsliced implementation for Intel Core 2 processors ###
+### requires support of SSE extensions up to SSSE3 ###
+### Author: Emilia Käsper and Peter Schwabe ###
+### Date: 2009-03-19 ###
+### Public domain ###
+### ###
+### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
+### further information. ###
+###################################################################
+#
+# September 2011.
+#
+# Started as transliteration to "perlasm" the original code has
+# undergone following changes:
+#
+# - code was made position-independent;
+# - rounds were folded into a loop resulting in >5x size reduction
+# from 12.5KB to 2.2KB;
+# - above was possibile thanks to mixcolumns() modification that
+# allowed to feed its output back to aesenc[last], this was
+# achieved at cost of two additional inter-registers moves;
+# - some instruction reordering and interleaving;
+# - this module doesn't implement key setup subroutine, instead it
+# relies on conversion of "conventional" key schedule as returned
+# by AES_set_encrypt_key (see discussion below);
+# - first and last round keys are treated differently, which allowed
+# to skip one shiftrows(), reduce bit-sliced key schedule and
+# speed-up conversion by 22%;
+# - support for 192- and 256-bit keys was added;
+#
+# Resulting performance in CPU cycles spent to encrypt one byte out
+# of 4096-byte buffer with 128-bit key is:
+#
+# Emilia's this(*) difference
+#
+# Core 2 9.30 8.69 +7%
+# Nehalem(**) 7.63 6.88 +11%
+# Atom 17.1 16.4 +4%
+# Silvermont - 12.9
+# Goldmont - 8.85
+#
+# (*) Comparison is not completely fair, because "this" is ECB,
+# i.e. no extra processing such as counter values calculation
+# and xor-ing input as in Emilia's CTR implementation is
+# performed. However, the CTR calculations stand for not more
+# than 1% of total time, so comparison is *rather* fair.
+#
+# (**) Results were collected on Westmere, which is considered to
+# be equivalent to Nehalem for this code.
+#
+# As for key schedule conversion subroutine. Interface to OpenSSL
+# relies on per-invocation on-the-fly conversion. This naturally
+# has impact on performance, especially for short inputs. Conversion
+# time in CPU cycles and its ratio to CPU cycles spent in 8x block
+# function is:
+#
+# conversion conversion/8x block
+# Core 2 240 0.22
+# Nehalem 180 0.20
+# Atom 430 0.20
+#
+# The ratio values mean that 128-byte blocks will be processed
+# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
+# etc. Then keep in mind that input sizes not divisible by 128 are
+# *effectively* slower, especially shortest ones, e.g. consecutive
+# 144-byte blocks are processed 44% slower than one would expect,
+# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
+# it's still faster than ["hyper-threading-safe" code path in]
+# aes-x86_64.pl on all lengths above 64 bytes...
+#
+# October 2011.
+#
+# Add decryption procedure. Performance in CPU cycles spent to decrypt
+# one byte out of 4096-byte buffer with 128-bit key is:
+#
+# Core 2 9.98
+# Nehalem 7.80
+# Atom 17.9
+# Silvermont 14.0
+# Goldmont 10.2
+#
+# November 2011.
+#
+# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
+# suboptimal, but XTS is meant to be used with larger blocks...
+#
+# <appro@openssl.org>
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
+my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
+my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
+
+{
+my ($key,$rounds,$const)=("%rax","%r10d","%r11");
+
+sub Sbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InBasisChange (@b);
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ pxor @b[6], @b[5]
+ pxor @b[1], @b[2]
+ pxor @b[0], @b[3]
+ pxor @b[2], @b[6]
+ pxor @b[0], @b[5]
+
+ pxor @b[3], @b[6]
+ pxor @b[7], @b[3]
+ pxor @b[5], @b[7]
+ pxor @b[4], @b[3]
+ pxor @b[5], @b[4]
+ pxor @b[1], @b[3]
+
+ pxor @b[7], @b[2]
+ pxor @b[5], @b[1]
+___
+}
+
+sub OutBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ pxor @b[6], @b[0]
+ pxor @b[4], @b[1]
+ pxor @b[0], @b[2]
+ pxor @b[6], @b[4]
+ pxor @b[1], @b[6]
+
+ pxor @b[5], @b[1]
+ pxor @b[3], @b[5]
+ pxor @b[7], @b[3]
+ pxor @b[5], @b[7]
+ pxor @b[5], @b[2]
+
+ pxor @b[7], @b[4]
+___
+}
+
+sub InvSbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InvInBasisChange (@b);
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # OutBasisChange in reverse
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+ pxor @b[7], @b[4]
+
+ pxor @b[5], @b[7]
+ pxor @b[5], @b[2]
+ pxor @b[7], @b[3]
+ pxor @b[3], @b[5]
+ pxor @b[5], @b[1]
+
+ pxor @b[1], @b[6]
+ pxor @b[0], @b[2]
+ pxor @b[6], @b[4]
+ pxor @b[6], @b[0]
+ pxor @b[4], @b[1]
+___
+}
+
+sub InvOutBasisChange { # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+ pxor @b[5], @b[1]
+ pxor @b[7], @b[2]
+
+ pxor @b[1], @b[3]
+ pxor @b[5], @b[4]
+ pxor @b[5], @b[7]
+ pxor @b[4], @b[3]
+ pxor @b[0], @b[5]
+ pxor @b[7], @b[3]
+ pxor @b[2], @b[6]
+ pxor @b[1], @b[2]
+ pxor @b[3], @b[6]
+
+ pxor @b[0], @b[3]
+ pxor @b[6], @b[5]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ pxor $y1, $t0
+ pand $x0, $t0
+ pxor $x1, $x0
+ pand $y0, $x1
+ pand $y1, $x0
+ pxor $x1, $x0
+ pxor $t0, $x1
+___
+}
+
+sub Mul_GF4_N { # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ pxor $y1, $t0
+ pand $x0, $t0
+ pxor $x1, $x0
+ pand $y0, $x1
+ pand $y1, $x0
+ pxor $x0, $x1
+ pxor $t0, $x0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+ $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ movdqa $y2, $t1
+ pxor $y1, $t0
+ pxor $y3, $t1
+ pand $x0, $t0
+ pand $x2, $t1
+ pxor $x1, $x0
+ pxor $x3, $x2
+ pand $y0, $x1
+ pand $y2, $x3
+ pand $y1, $x0
+ pand $y3, $x2
+ pxor $x0, $x1
+ pxor $x3, $x2
+ pxor $t0, $x0
+ pxor $t1, $x3
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+ movdqa @x[0], @t[0]
+ movdqa @x[1], @t[1]
+___
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
+$code.=<<___;
+ pxor @x[2], @t[0]
+ pxor @x[3], @t[1]
+ pxor @y[2], @y[0]
+ pxor @y[3], @y[1]
+___
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ pxor @t[0], @x[0]
+ pxor @t[0], @x[2]
+ pxor @t[1], @x[1]
+ pxor @t[1], @x[3]
+
+ movdqa @x[4], @t[0]
+ movdqa @x[5], @t[1]
+ pxor @x[6], @t[0]
+ pxor @x[7], @t[1]
+___
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ pxor @y[2], @y[0]
+ pxor @y[3], @y[1]
+___
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
+$code.=<<___;
+ pxor @t[0], @x[4]
+ pxor @t[0], @x[6]
+ pxor @t[1], @x[5]
+ pxor @t[1], @x[7]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+ movdqa @x[4], @t[3]
+ movdqa @x[5], @t[2]
+ movdqa @x[1], @t[1]
+ movdqa @x[7], @s[1]
+ movdqa @x[0], @s[0]
+
+ pxor @x[6], @t[3]
+ pxor @x[7], @t[2]
+ pxor @x[3], @t[1]
+ movdqa @t[3], @s[2]
+ pxor @x[6], @s[1]
+ movdqa @t[2], @t[0]
+ pxor @x[2], @s[0]
+ movdqa @t[3], @s[3]
+
+ por @t[1], @t[2]
+ por @s[0], @t[3]
+ pxor @t[0], @s[3]
+ pand @s[0], @s[2]
+ pxor @t[1], @s[0]
+ pand @t[1], @t[0]
+ pand @s[0], @s[3]
+ movdqa @x[3], @s[0]
+ pxor @x[2], @s[0]
+ pand @s[0], @s[1]
+ pxor @s[1], @t[3]
+ pxor @s[1], @t[2]
+ movdqa @x[4], @s[1]
+ movdqa @x[1], @s[0]
+ pxor @x[5], @s[1]
+ pxor @x[0], @s[0]
+ movdqa @s[1], @t[1]
+ pand @s[0], @s[1]
+ por @s[0], @t[1]
+ pxor @s[1], @t[0]
+ pxor @s[3], @t[3]
+ pxor @s[2], @t[2]
+ pxor @s[3], @t[1]
+ movdqa @x[7], @s[0]
+ pxor @s[2], @t[0]
+ movdqa @x[6], @s[1]
+ pxor @s[2], @t[1]
+ movdqa @x[5], @s[2]
+ pand @x[3], @s[0]
+ movdqa @x[4], @s[3]
+ pand @x[2], @s[1]
+ pand @x[1], @s[2]
+ por @x[0], @s[3]
+ pxor @s[0], @t[3]
+ pxor @s[1], @t[2]
+ pxor @s[2], @t[1]
+ pxor @s[3], @t[0]
+
+ #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ # new smaller inversion
+
+ movdqa @t[3], @s[0]
+ pand @t[1], @t[3]
+ pxor @t[2], @s[0]
+
+ movdqa @t[0], @s[2]
+ movdqa @s[0], @s[3]
+ pxor @t[3], @s[2]
+ pand @s[2], @s[3]
+
+ movdqa @t[1], @s[1]
+ pxor @t[2], @s[3]
+ pxor @t[0], @s[1]
+
+ pxor @t[2], @t[3]
+
+ pand @t[3], @s[1]
+
+ movdqa @s[2], @t[2]
+ pxor @t[0], @s[1]
+
+ pxor @s[1], @t[2]
+ pxor @s[1], @t[1]
+
+ pand @t[0], @t[2]
+
+ pxor @t[2], @s[2]
+ pxor @t[2], @t[1]
+
+ pand @s[3], @s[2]
+
+ pxor @s[0], @s[2]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my $mask=pop;
+$code.=<<___;
+ pxor 0x00($key),@x[0]
+ pxor 0x10($key),@x[1]
+ pxor 0x20($key),@x[2]
+ pxor 0x30($key),@x[3]
+ pshufb $mask,@x[0]
+ pshufb $mask,@x[1]
+ pxor 0x40($key),@x[4]
+ pxor 0x50($key),@x[5]
+ pshufb $mask,@x[2]
+ pshufb $mask,@x[3]
+ pxor 0x60($key),@x[6]
+ pxor 0x70($key),@x[7]
+ pshufb $mask,@x[4]
+ pshufb $mask,@x[5]
+ pshufb $mask,@x[6]
+ pshufb $mask,@x[7]
+ lea 0x80($key),$key
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16]; # optional
+$code.=<<___;
+ pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
+ pshufd \$0x93, @x[2], @t[2]
+ pxor @t[1], @x[1]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @t[2], @x[2]
+ pshufd \$0x93, @x[4], @t[4]
+ pxor @t[3], @x[3]
+ pshufd \$0x93, @x[5], @t[5]
+ pxor @t[4], @x[4]
+ pshufd \$0x93, @x[6], @t[6]
+ pxor @t[5], @x[5]
+ pshufd \$0x93, @x[7], @t[7]
+ pxor @t[6], @x[6]
+ pxor @t[7], @x[7]
+
+ pxor @x[0], @t[1]
+ pxor @x[7], @t[0]
+ pxor @x[7], @t[1]
+ pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
+ pxor @x[1], @t[2]
+ pshufd \$0x4E, @x[1], @x[1]
+ pxor @x[4], @t[5]
+ pxor @t[0], @x[0]
+ pxor @x[5], @t[6]
+ pxor @t[1], @x[1]
+ pxor @x[3], @t[4]
+ pshufd \$0x4E, @x[4], @t[0]
+ pxor @x[6], @t[7]
+ pshufd \$0x4E, @x[5], @t[1]
+ pxor @x[2], @t[3]
+ pshufd \$0x4E, @x[3], @x[4]
+ pxor @x[7], @t[3]
+ pshufd \$0x4E, @x[7], @x[5]
+ pxor @x[7], @t[4]
+ pshufd \$0x4E, @x[6], @x[3]
+ pxor @t[4], @t[0]
+ pshufd \$0x4E, @x[2], @x[6]
+ pxor @t[5], @t[1]
+___
+$code.=<<___ if (!$inv);
+ pxor @t[3], @x[4]
+ pxor @t[7], @x[5]
+ pxor @t[6], @x[3]
+ movdqa @t[0], @x[2]
+ pxor @t[2], @x[6]
+ movdqa @t[1], @x[7]
+___
+$code.=<<___ if ($inv);
+ pxor @x[4], @t[3]
+ pxor @t[7], @x[5]
+ pxor @x[3], @t[6]
+ movdqa @t[0], @x[3]
+ pxor @t[2], @x[6]
+ movdqa @t[6], @x[2]
+ movdqa @t[1], @x[7]
+ movdqa @x[6], @x[4]
+ movdqa @t[3], @x[6]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+ # multiplication by 0x0e
+ pshufd \$0x93, @x[7], @t[7]
+ movdqa @x[2], @t[2]
+ pxor @x[5], @x[7] # 7 5
+ pxor @x[5], @x[2] # 2 5
+ pshufd \$0x93, @x[0], @t[0]
+ movdqa @x[5], @t[5]
+ pxor @x[0], @x[5] # 5 0 [1]
+ pxor @x[1], @x[0] # 0 1
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @x[2], @x[1] # 1 25
+ pxor @x[6], @x[0] # 01 6 [2]
+ pxor @x[3], @x[1] # 125 3 [4]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @x[0], @x[2] # 25 016 [3]
+ pxor @x[7], @x[3] # 3 75
+ pxor @x[6], @x[7] # 75 6 [0]
+ pshufd \$0x93, @x[6], @t[6]
+ movdqa @x[4], @t[4]
+ pxor @x[4], @x[6] # 6 4
+ pxor @x[3], @x[4] # 4 375 [6]
+ pxor @x[7], @x[3] # 375 756=36
+ pxor @t[5], @x[6] # 64 5 [7]
+ pxor @t[2], @x[3] # 36 2
+ pxor @t[4], @x[3] # 362 4 [5]
+ pshufd \$0x93, @t[5], @t[5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+ # multiplication by 0x0b
+ pxor @y[0], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[1], @y[1]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[5], @y[0]
+ pxor @t[6], @y[1]
+ pxor @t[7], @y[0]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @t[7] # clobber t[7]
+ pxor @y[0], @y[1]
+
+ pxor @t[0], @y[3]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[1], @y[2]
+ pxor @t[1], @y[4]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[1], @t[1]
+ pxor @t[2], @y[3]
+ pxor @t[2], @y[5]
+ pxor @t[7], @y[2]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[3], @y[3]
+ pxor @t[3], @y[6]
+ pxor @t[3], @y[4]
+ pshufd \$0x93, @t[3], @t[3]
+ pxor @t[4], @y[7]
+ pxor @t[4], @y[5]
+ pxor @t[7], @y[7]
+ pxor @t[5], @y[3]
+ pxor @t[4], @y[4]
+ pxor @t[5], @t[7] # clobber t[7] even more
+
+ pxor @t[7], @y[5]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[7], @y[6]
+ pxor @t[7], @y[4]
+
+ pxor @t[5], @t[7]
+ pshufd \$0x93, @t[5], @t[5]
+ pxor @t[6], @t[7] # restore t[7]
+
+ # multiplication by 0x0d
+ pxor @y[7], @y[4]
+ pxor @t[4], @y[7]
+ pshufd \$0x93, @t[6], @t[6]
+ pxor @t[0], @y[2]
+ pxor @t[5], @y[7]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[7], @t[7]
+
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[0], @y[3]
+ pxor @t[5], @y[1]
+ pxor @t[5], @y[0]
+ pxor @t[7], @y[1]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[6], @y[0]
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[4]
+ pshufd \$0x93, @t[1], @t[1]
+
+ pxor @t[7], @y[7]
+ pxor @t[2], @y[4]
+ pxor @t[2], @y[5]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[6], @y[2]
+ pxor @t[3], @t[6] # clobber t[6]
+ pxor @y[7], @y[4]
+ pxor @t[6], @y[3]
+
+ pxor @t[6], @y[6]
+ pxor @t[5], @y[5]
+ pxor @t[4], @y[6]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @y[5]
+ pxor @t[7], @y[6]
+ pxor @t[3], @t[6] # restore t[6]
+
+ pshufd \$0x93, @t[5], @t[5]
+ pshufd \$0x93, @t[6], @t[6]
+ pshufd \$0x93, @t[7], @t[7]
+ pshufd \$0x93, @t[3], @t[3]
+
+ # multiplication by 0x09
+ pxor @y[1], @y[4]
+ pxor @y[1], @t[1] # t[1]=y[1]
+ pxor @t[5], @t[0] # clobber t[0]
+ pxor @t[5], @t[1]
+ pxor @t[0], @y[3]
+ pxor @y[0], @t[0] # t[0]=y[0]
+ pxor @t[6], @t[1]
+ pxor @t[7], @t[6] # clobber t[6]
+ pxor @t[1], @y[4]
+ pxor @t[4], @y[7]
+ pxor @y[4], @t[4] # t[4]=y[4]
+ pxor @t[3], @y[6]
+ pxor @y[3], @t[3] # t[3]=y[3]
+ pxor @t[2], @y[5]
+ pxor @y[2], @t[2] # t[2]=y[2]
+ pxor @t[7], @t[3]
+ pxor @y[5], @t[5] # t[5]=y[5]
+ pxor @t[6], @t[2]
+ pxor @t[6], @t[5]
+ pxor @y[6], @t[6] # t[6]=y[6]
+ pxor @y[7], @t[7] # t[7]=y[7]
+
+ movdqa @t[0],@XMM[0]
+ movdqa @t[1],@XMM[1]
+ movdqa @t[2],@XMM[2]
+ movdqa @t[3],@XMM[3]
+ movdqa @t[4],@XMM[4]
+ movdqa @t[5],@XMM[5]
+ movdqa @t[6],@XMM[6]
+ movdqa @t[7],@XMM[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
+$code.=<<___;
+ # multiplication by 0x05-0x00-0x04-0x00
+ pshufd \$0x4E, @x[0], @t[0]
+ pshufd \$0x4E, @x[6], @t[6]
+ pxor @x[0], @t[0]
+ pshufd \$0x4E, @x[7], @t[7]
+ pxor @x[6], @t[6]
+ pshufd \$0x4E, @x[1], @t[1]
+ pxor @x[7], @t[7]
+ pshufd \$0x4E, @x[2], @t[2]
+ pxor @x[1], @t[1]
+ pshufd \$0x4E, @x[3], @t[3]
+ pxor @x[2], @t[2]
+ pxor @t[6], @x[0]
+ pxor @t[6], @x[1]
+ pshufd \$0x4E, @x[4], @t[4]
+ pxor @x[3], @t[3]
+ pxor @t[0], @x[2]
+ pxor @t[1], @x[3]
+ pshufd \$0x4E, @x[5], @t[5]
+ pxor @x[4], @t[4]
+ pxor @t[7], @x[1]
+ pxor @t[2], @x[4]
+ pxor @x[5], @t[5]
+
+ pxor @t[7], @x[2]
+ pxor @t[6], @x[3]
+ pxor @t[6], @x[4]
+ pxor @t[3], @x[5]
+ pxor @t[4], @x[6]
+ pxor @t[7], @x[4]
+ pxor @t[7], @x[5]
+ pxor @t[5], @x[7]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
+sub aesenc { # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ movdqa 0x30($const),@t[0] # .LSR
+___
+ &ShiftRows (@b,@t[0]);
+ &Sbox (@b,@t);
+ &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
+}
+
+sub aesenclast { # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ movdqa 0x40($const),@t[0] # .LSRM0
+___
+ &ShiftRows (@b,@t[0]);
+ &Sbox (@b,@t);
+$code.=<<___
+ pxor 0x00($key),@b[0]
+ pxor 0x10($key),@b[1]
+ pxor 0x20($key),@b[4]
+ pxor 0x30($key),@b[6]
+ pxor 0x40($key),@b[3]
+ pxor 0x50($key),@b[7]
+ pxor 0x60($key),@b[2]
+ pxor 0x70($key),@b[5]
+___
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+ movdqa $b,$t
+ psrlq \$$n,$b
+ pxor $a,$b
+ pand $mask,$b
+ pxor $b,$a
+ psllq \$$n,$b
+ pxor $t,$b
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+ movdqa $b0,$t0
+ psrlq \$$n,$b0
+ movdqa $b1,$t1
+ psrlq \$$n,$b1
+ pxor $a0,$b0
+ pxor $a1,$b1
+ pand $mask,$b0
+ pand $mask,$b1
+ pxor $b0,$a0
+ psllq \$$n,$b0
+ pxor $b1,$a1
+ psllq \$$n,$b1
+ pxor $t0,$b0
+ pxor $t1,$b1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+ movdqa 0x00($const),$t0 # .LBS0
+ movdqa 0x10($const),$t1 # .LBS1
+___
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+ movdqa 0x20($const),$t0 # .LBS2
+___
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+.text
+
+.extern asm_AES_encrypt
+.extern asm_AES_decrypt
+
+.type _bsaes_encrypt8,\@abi-omnipotent
+.align 64
+_bsaes_encrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa 0x50($const), @XMM[8] # .LM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pxor @XMM[9], @XMM[2]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[4]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[6]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ dec $rounds
+ jmp .Lenc_sbox
+.align 16
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]);
+$code.=".Lenc_sbox:\n";
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ dec $rounds
+ jl .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+ movdqa 0x30($const), @XMM[8] # .LSR
+ jnz .Lenc_loop
+ movdqa 0x40($const), @XMM[8] # .LSRM0
+ jmp .Lenc_loop
+.align 16
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_decrypt8,\@abi-omnipotent
+.align 64
+_bsaes_decrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa -0x30($const), @XMM[8] # .LM0ISR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pxor @XMM[9], @XMM[2]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[4]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[6]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ dec $rounds
+ jmp .Ldec_sbox
+.align 16
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]);
+$code.=".Ldec_sbox:\n";
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ dec $rounds
+ jl .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+ movdqa -0x10($const), @XMM[8] # .LISR
+ jnz .Ldec_loop
+ movdqa -0x20($const), @XMM[8] # .LISRM0
+ jmp .Ldec_loop
+.align 16
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+ #&swapmove(@x[2,3],1,$t0,$t2,$t3);
+ movdqa @x[0], @x[2]
+ movdqa @x[1], @x[3]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+ #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ movdqa @x[0], @x[4]
+ movdqa @x[2], @x[6]
+ movdqa @x[1], @x[5]
+ movdqa @x[3], @x[7]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type _bsaes_key_convert,\@abi-omnipotent
+.align 16
+_bsaes_key_convert:
+ lea .Lmasks(%rip), $const
+ movdqu ($inp), %xmm7 # load round 0 key
+ lea 0x10($inp), $inp
+ movdqa 0x00($const), %xmm0 # 0x01...
+ movdqa 0x10($const), %xmm1 # 0x02...
+ movdqa 0x20($const), %xmm2 # 0x04...
+ movdqa 0x30($const), %xmm3 # 0x08...
+ movdqa 0x40($const), %xmm4 # .LM0
+ pcmpeqd %xmm5, %xmm5 # .LNOT
+
+ movdqu ($inp), %xmm6 # load round 1 key
+ movdqa %xmm7, ($out) # save round 0 key
+ lea 0x10($out), $out
+ dec $rounds
+ jmp .Lkey_loop
+.align 16
+.Lkey_loop:
+ pshufb %xmm4, %xmm6 # .LM0
+
+ movdqa %xmm0, %xmm8
+ movdqa %xmm1, %xmm9
+
+ pand %xmm6, %xmm8
+ pand %xmm6, %xmm9
+ movdqa %xmm2, %xmm10
+ pcmpeqb %xmm0, %xmm8
+ psllq \$4, %xmm0 # 0x10...
+ movdqa %xmm3, %xmm11
+ pcmpeqb %xmm1, %xmm9
+ psllq \$4, %xmm1 # 0x20...
+
+ pand %xmm6, %xmm10
+ pand %xmm6, %xmm11
+ movdqa %xmm0, %xmm12
+ pcmpeqb %xmm2, %xmm10
+ psllq \$4, %xmm2 # 0x40...
+ movdqa %xmm1, %xmm13
+ pcmpeqb %xmm3, %xmm11
+ psllq \$4, %xmm3 # 0x80...
+
+ movdqa %xmm2, %xmm14
+ movdqa %xmm3, %xmm15
+ pxor %xmm5, %xmm8 # "pnot"
+ pxor %xmm5, %xmm9
+
+ pand %xmm6, %xmm12
+ pand %xmm6, %xmm13
+ movdqa %xmm8, 0x00($out) # write bit-sliced round key
+ pcmpeqb %xmm0, %xmm12
+ psrlq \$4, %xmm0 # 0x01...
+ movdqa %xmm9, 0x10($out)
+ pcmpeqb %xmm1, %xmm13
+ psrlq \$4, %xmm1 # 0x02...
+ lea 0x10($inp), $inp
+
+ pand %xmm6, %xmm14
+ pand %xmm6, %xmm15
+ movdqa %xmm10, 0x20($out)
+ pcmpeqb %xmm2, %xmm14
+ psrlq \$4, %xmm2 # 0x04...
+ movdqa %xmm11, 0x30($out)
+ pcmpeqb %xmm3, %xmm15
+ psrlq \$4, %xmm3 # 0x08...
+ movdqu ($inp), %xmm6 # load next round key
+
+ pxor %xmm5, %xmm13 # "pnot"
+ pxor %xmm5, %xmm14
+ movdqa %xmm12, 0x40($out)
+ movdqa %xmm13, 0x50($out)
+ movdqa %xmm14, 0x60($out)
+ movdqa %xmm15, 0x70($out)
+ lea 0x80($out),$out
+ dec $rounds
+ jnz .Lkey_loop
+
+ movdqa 0x50($const), %xmm7 # .L63
+ #movdqa %xmm6, ($out) # don't save last round key
+ ret
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0 && !$win64) { # following four functions are unsupported interface
+ # used for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,\@function,2
+.align 16
+bsaes_enc_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+ ret
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,\@function,4
+.align 16
+bsaes_encrypt_128:
+.Lenc128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Lenc128_loop
+ ret
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,\@function,2
+.align 16
+bsaes_dec_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor ($out),%xmm7 # fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,($out)
+ ret
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,\@function,4
+.align 16
+bsaes_decrypt_128:
+.Ldec128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Ldec128_loop
+ ret
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+######################################################################
+#
+# OpenSSL interface
+#
+my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
+ : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
+
+if ($ecb) {
+$code.=<<___;
+.globl bsaes_ecb_encrypt_blocks
+.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_encrypt_blocks:
+ mov %rsp, %rax
+.Lecb_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_enc_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
+ sub \$8,$len
+.Lecb_enc_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_enc_loop
+
+ add \$8,$len
+ jz .Lecb_enc_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_enc_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_enc_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_enc_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_enc_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_enc_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_enc_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_six:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_five:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_four:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_three:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_two:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_one:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_enc_short
+
+.Lecb_enc_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ jb .Lecb_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_enc_epilogue:
+ ret
+.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
+
+.globl bsaes_ecb_decrypt_blocks
+.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_decrypt_blocks:
+ mov %rsp, %rax
+.Lecb_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_dec_body:
+___
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_dec_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7 # fix up 0 round key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ sub \$8,$len
+.Lecb_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_dec_loop
+
+ add \$8,$len
+ jz .Lecb_dec_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_six:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_five:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_four:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_three:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_two:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_one:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_dec_short
+
+.Lecb_dec_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ jb .Lecb_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_dec_epilogue:
+ ret
+.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
+___
+}
+$code.=<<___;
+.extern asm_AES_cbc_encrypt
+.globl bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,\@abi-omnipotent
+.align 16
+bsaes_cbc_encrypt:
+___
+$code.=<<___ if ($win64);
+ mov 48(%rsp),$arg6 # pull direction flag
+___
+$code.=<<___;
+ cmp \$0,$arg6
+ jne asm_AES_cbc_encrypt
+ cmp \$128,$arg3
+ jb asm_AES_cbc_encrypt
+
+ mov %rsp, %rax
+.Lcbc_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lcbc_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ mov $arg5, %rbx
+ shr \$4, $len # bytes to blocks
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7 # fix up 0 round key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ movdqu (%rbx), @XMM[15] # load IV
+ sub \$8,$len
+.Lcbc_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %edx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+
+ call _bsaes_decrypt8
+
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[13], @XMM[3]
+ movdqu 0x70($inp), @XMM[15] # IV
+ pxor @XMM[14], @XMM[5]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x80($inp), $inp
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lcbc_dec_loop
+
+ add \$8,$len
+ jz .Lcbc_dec_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+ cmp \$2,$len
+ jb .Lcbc_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lcbc_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lcbc_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lcbc_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lcbc_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lcbc_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[15] # IV
+ pxor @XMM[13], @XMM[3]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[15] # IV
+ pxor @XMM[12], @XMM[7]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[15] # IV
+ pxor @XMM[11], @XMM[2]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[15] # IV
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[15] # IV
+ pxor @XMM[9], @XMM[6]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[15] # IV
+ pxor @XMM[8], @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+ lea ($inp), $arg1
+ lea 0x20(%rbp), $arg2 # buffer output
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[15] # ^= IV
+ movdqu @XMM[15], ($out) # write output
+ movdqa @XMM[0], @XMM[15] # IV
+
+.Lcbc_dec_done:
+ movdqu @XMM[15], (%rbx) # return IV
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lcbc_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lcbc_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lcbc_dec_epilogue:
+ ret
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+.globl bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ctr32_encrypt_blocks:
+ mov %rsp, %rax
+.Lctr_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lctr_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ movdqu ($arg5), %xmm0 # load counter
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ movdqa %xmm0, 0x20(%rbp) # copy counter
+ cmp \$8, $arg3
+ jb .Lctr_enc_short
+
+ mov %eax, %ebx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %ebx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
+ movdqa (%rsp), @XMM[9] # load round0 key
+ lea .LADD1(%rip), %r11
+ movdqa 0x20(%rbp), @XMM[0] # counter copy
+ movdqa -0x20(%r11), @XMM[8] # .LSWPUP
+ pshufb @XMM[8], @XMM[9] # byte swap upper part
+ pshufb @XMM[8], @XMM[0]
+ movdqa @XMM[9], (%rsp) # save adjusted round0 key
+ jmp .Lctr_enc_loop
+.align 16
+.Lctr_enc_loop:
+ movdqa @XMM[0], 0x20(%rbp) # save counter
+ movdqa @XMM[0], @XMM[1] # prepare 8 counter values
+ movdqa @XMM[0], @XMM[2]
+ paddd 0x00(%r11), @XMM[1] # .LADD1
+ movdqa @XMM[0], @XMM[3]
+ paddd 0x10(%r11), @XMM[2] # .LADD2
+ movdqa @XMM[0], @XMM[4]
+ paddd 0x20(%r11), @XMM[3] # .LADD3
+ movdqa @XMM[0], @XMM[5]
+ paddd 0x30(%r11), @XMM[4] # .LADD4
+ movdqa @XMM[0], @XMM[6]
+ paddd 0x40(%r11), @XMM[5] # .LADD5
+ movdqa @XMM[0], @XMM[7]
+ paddd 0x50(%r11), @XMM[6] # .LADD6
+ paddd 0x60(%r11), @XMM[7] # .LADD7
+
+ # Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ # to flip byte order in 32-bit counter
+ movdqa (%rsp), @XMM[9] # round 0 key
+ lea 0x10(%rsp), %rax # pass key schedule
+ movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pxor @XMM[9], @XMM[2]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[4]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[6]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+ lea .LBS0(%rip), %r11 # constants table
+ mov %ebx,%r10d # pass rounds
+
+ call _bsaes_encrypt8_bitslice
+
+ sub \$8,$len
+ jc .Lctr_enc_loop_done
+
+ movdqu 0x00($inp), @XMM[8] # load input
+ movdqu 0x10($inp), @XMM[9]
+ movdqu 0x20($inp), @XMM[10]
+ movdqu 0x30($inp), @XMM[11]
+ movdqu 0x40($inp), @XMM[12]
+ movdqu 0x50($inp), @XMM[13]
+ movdqu 0x60($inp), @XMM[14]
+ movdqu 0x70($inp), @XMM[15]
+ lea 0x80($inp),$inp
+ pxor @XMM[0], @XMM[8]
+ movdqa 0x20(%rbp), @XMM[0] # load counter
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[8], 0x00($out) # write output
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor @XMM[15], @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ lea .LADD1(%rip), %r11
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ paddd 0x70(%r11), @XMM[0] # .LADD8
+ jnz .Lctr_enc_loop
+
+ jmp .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+ add \$8, $len
+ movdqu 0x00($inp), @XMM[8] # load input
+ pxor @XMM[8], @XMM[0]
+ movdqu @XMM[0], 0x00($out) # write output
+ cmp \$2,$len
+ jb .Lctr_enc_done
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[1], 0x10($out)
+ je .Lctr_enc_done
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[4], 0x20($out)
+ cmp \$4,$len
+ jb .Lctr_enc_done
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[6], 0x30($out)
+ je .Lctr_enc_done
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[3], 0x40($out)
+ cmp \$6,$len
+ jb .Lctr_enc_done
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[7], 0x50($out)
+ je .Lctr_enc_done
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
+ lea 0x20(%rbp), $arg1
+ lea 0x30(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ movdqu ($inp), @XMM[1]
+ lea 16($inp), $inp
+ mov 0x2c(%rbp), %eax # load 32-bit counter
+ bswap %eax
+ pxor 0x30(%rbp), @XMM[1]
+ inc %eax # increment
+ movdqu @XMM[1], ($out)
+ bswap %eax
+ lea 16($out), $out
+ mov %eax, 0x2c(%rsp) # save 32-bit counter
+ dec $len
+ jnz .Lctr_enc_short
+
+.Lctr_enc_done:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lctr_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lctr_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lctr_enc_epilogue:
+ ret
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+my ($twmask,$twres,$twtmp)=@XMM[13..15];
+$arg6=~s/d$//;
+
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_encrypt:
+ mov %rsp, %rax
+.Lxts_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6, %xmm7 # fix up last round key
+ movdqa %xmm7, (%rax) # save last round key
+
+ and \$-16, $len
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_enc_short
+ jmp .Lxts_enc_loop
+
+.align 16
+.Lxts_enc_loop:
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_enc_loop
+
+.Lxts_enc_short:
+ add \$0x80, $len
+ jz .Lxts_enc_done
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_enc_$i
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_encrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_enc_done:
+ and \$15, %ebx
+ jz .Lxts_enc_ret
+ mov $out, %rdx
+
+.Lxts_enc_steal:
+ movzb ($inp), %eax
+ movzb -16(%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, -16(%rdx)
+ mov %cl, 0(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_enc_steal
+
+ movdqu -16($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ movdqu @XMM[7], -16($out)
+
+.Lxts_enc_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_enc_epilogue:
+ ret
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_decrypt:
+ mov %rsp, %rax
+.Lxts_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp), %xmm7 # fix up round 0 key
+ movdqa %xmm6, (%rax) # save last round key
+ movdqa %xmm7, (%rsp)
+
+ xor %eax, %eax # if ($len%16) len-=16;
+ and \$-16, $len
+ test \$15, %ebx
+ setnz %al
+ shl \$4, %rax
+ sub %rax, $len
+
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_dec_short
+ jmp .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_dec_loop
+
+.Lxts_dec_short:
+ add \$0x80, $len
+ jz .Lxts_dec_done
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_dec_$i
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_decrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_dec_done:
+ and \$15, %ebx
+ jz .Lxts_dec_ret
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ movdqa @XMM[7], @XMM[6]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ movdqu ($inp), @XMM[0]
+ pxor $twres, @XMM[7]
+
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ mov $out, %rdx
+ movdqu @XMM[7], ($out)
+
+.Lxts_dec_steal:
+ movzb 16($inp), %eax
+ movzb (%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, (%rdx)
+ mov %cl, 16(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_dec_steal
+
+ movdqu ($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[6], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[6]
+ movdqu @XMM[6], ($out)
+
+.Lxts_dec_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_dec_epilogue:
+ ret
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+.type _bsaes_const,\@object
+.align 64
+_bsaes_const:
+.LM0ISR: # InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0: # bit-slice constants
+ .quad 0x5555555555555555, 0x5555555555555555
+.LBS1:
+ .quad 0x3333333333333333, 0x3333333333333333
+.LBS2:
+ .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR: # shiftrows constants
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP: # byte-swap upper dword
+ .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+ .quad 0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1: # counter increment constants
+ .quad 0x0000000000000000, 0x0000000100000000
+.LADD2:
+ .quad 0x0000000000000000, 0x0000000200000000
+.LADD3:
+ .quad 0x0000000000000000, 0x0000000300000000
+.LADD4:
+ .quad 0x0000000000000000, 0x0000000400000000
+.LADD5:
+ .quad 0x0000000000000000, 0x0000000500000000
+.LADD6:
+ .quad 0x0000000000000000, 0x0000000600000000
+.LADD7:
+ .quad 0x0000000000000000, 0x0000000700000000
+.LADD8:
+ .quad 0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+ .long 0x87,0,1,0
+.Lmasks:
+ .quad 0x0101010101010101, 0x0101010101010101
+ .quad 0x0202020202020202, 0x0202020202020202
+ .quad 0x0404040404040404, 0x0404040404040404
+ .quad 0x0808080808080808, 0x0808080808080808
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+ .quad 0x6363636363636363, 0x6363636363636363
+.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
+.align 64
+.size _bsaes_const,.-_bsaes_const
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ mov 160($context),%rax # pull context->Rbp
+
+ lea 0x40(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xa0(%rax),%rax # adjust stack pointer
+
+ mov 0x70(%rax),%rbp
+ mov 0x68(%rax),%rbx
+ mov 0x60(%rax),%r12
+ mov 0x58(%rax),%r13
+ mov 0x50(%rax),%r14
+ mov 0x48(%rax),%r15
+ lea 0x78(%rax),%rax # adjust stack pointer
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov %rax,152($context) # restore context->Rsp
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+___
+$code.=<<___ if ($ecb);
+ .rva .Lecb_enc_prologue
+ .rva .Lecb_enc_epilogue
+ .rva .Lecb_enc_info
+
+ .rva .Lecb_dec_prologue
+ .rva .Lecb_dec_epilogue
+ .rva .Lecb_dec_info
+___
+$code.=<<___;
+ .rva .Lcbc_dec_prologue
+ .rva .Lcbc_dec_epilogue
+ .rva .Lcbc_dec_info
+
+ .rva .Lctr_enc_prologue
+ .rva .Lctr_enc_epilogue
+ .rva .Lctr_enc_info
+
+ .rva .Lxts_enc_prologue
+ .rva .Lxts_enc_epilogue
+ .rva .Lxts_enc_info
+
+ .rva .Lxts_dec_prologue
+ .rva .Lxts_dec_epilogue
+ .rva .Lxts_dec_info
+
+.section .xdata
+.align 8
+___
+$code.=<<___ if ($ecb);
+.Lecb_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
+.Lecb_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
+___
+$code.=<<___;
+.Lcbc_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
+.Lctr_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
+.Lxts_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.Lxts_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/vpaes-armv8.pl b/openssl-1.1.0h/crypto/aes/asm/vpaes-armv8.pl
new file mode 100755
index 0000000..d6b5f56
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/vpaes-armv8.pl
@@ -0,0 +1,1259 @@
+#! /usr/bin/env perl
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+##
+######################################################################
+# ARMv8 NEON adaptation by <appro@openssl.org>
+#
+# Reason for undertaken effort is that there is at least one popular
+# SoC based on Cortex-A53 that doesn't have crypto extensions.
+#
+# CBC enc ECB enc/dec(*) [bit-sliced enc/dec]
+# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ]
+# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ]
+# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ]
+# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ]
+# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ]
+# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ]
+#
+# (*) ECB denotes approximate result for parallelizeable modes
+# such as CBC decrypt, CTR, etc.;
+# (**) these results are worse than scalar compiler-generated
+# code, but it's constant-time and therefore preferred;
+# (***) presented for reference/comparison purposes;
+
+$flavour = shift;
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$code.=<<___;
+.text
+
+.type _vpaes_consts,%object
+.align 7 // totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward: // mc_forward
+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D
+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward:// mc_backward
+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407
+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr: // sr
+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08
+ .quad 0x0F060D040B020900, 0x070E050C030A0108
+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+.Lk_inv: // inv, inva
+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02
+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt: // input transform (lo, hi)
+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo: // sbou, sbot
+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1: // sb1u, sb1t
+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2: // sb2u, sb2t
+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+// Decryption stuff
+//
+.Lk_dipt: // decryption input transform
+ .quad 0x0F505B040B545F00, 0x154A411E114E451A
+ .quad 0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo: // decryption sbox final output
+ .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9: // decryption sbox output *9*u, *9*t
+ .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd: // decryption sbox output *D*u, *D*t
+ .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb: // decryption sbox output *B*u, *B*t
+ .quad 0xD022649296B44200, 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe: // decryption sbox output *E*u, *E*t
+ .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+// Key schedule constants
+//
+.Lk_dksd: // decryption key schedule: invskew x*D
+ .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+ .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb: // decryption key schedule: invskew x*B
+ .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+ .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
+ .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+ .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9: // decryption key schedule: invskew x*9
+ .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+ .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon: // rcon
+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt: // output transform
+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew: // deskew tables: inverts the sbox's "skew"
+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.asciz "Vector Permutaion AES for ARMv8, Mike Hamburg (Stanford University)"
+.size _vpaes_consts,.-_vpaes_consts
+.align 6
+___
+
+{
+my ($inp,$out,$key) = map("x$_",(0..2));
+
+my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
+my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
+
+$code.=<<___;
+##
+## _aes_preheat
+##
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below.
+##
+.type _vpaes_encrypt_preheat,%function
+.align 4
+_vpaes_encrypt_preheat:
+ adr x10, .Lk_inv
+ movi v17.16b, #0x0f
+ ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
+ ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
+ ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
+ ret
+.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type _vpaes_encrypt_core,%function
+.align 4
+_vpaes_encrypt_core:
+ mov x9, $key
+ ldr w8, [$key,#240] // pull rounds
+ adr x11, .Lk_mc_forward+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
+ tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b .Lenc_entry
+
+.align 4
+.Lenc_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ sub w8, w8, #1 // nr--
+
+.Lenc_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
+ tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, .Lenc_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl vpaes_encrypt
+.type vpaes_encrypt,%function
+.align 4
+vpaes_encrypt:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [$inp]
+ bl _vpaes_encrypt_preheat
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [$out]
+
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_encrypt,.-vpaes_encrypt
+
+.type _vpaes_encrypt_2x,%function
+.align 4
+_vpaes_encrypt_2x:
+ mov x9, $key
+ ldr w8, [$key,#240] // pull rounds
+ adr x11, .Lk_mc_forward+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ tbl v9.16b, {$iptlo}, v9.16b
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ tbl v10.16b, {$ipthi}, v8.16b
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v8.16b, v9.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b .Lenc_2x_entry
+
+.align 4
+.Lenc_2x_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ tbl v12.16b, {$sb1t}, v10.16b
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ tbl v8.16b, {$sb1u}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ tbl v13.16b, {$sb2t}, v10.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ tbl v10.16b, {$sb2u}, v11.16b
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ tbl v11.16b, {v8.16b}, v1.16b
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ eor v10.16b, v10.16b, v13.16b
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ tbl v8.16b, {v8.16b}, v4.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ eor v11.16b, v11.16b, v10.16b
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ tbl v12.16b, {v11.16b},v1.16b
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ eor v8.16b, v8.16b, v11.16b
+ and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ eor v8.16b, v8.16b, v12.16b
+ sub w8, w8, #1 // nr--
+
+.Lenc_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ tbl v13.16b, {$invhi},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {$invlo},v8.16b
+ tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {$invlo},v9.16b
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v13.16b
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v13.16b
+ tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {$invlo},v11.16b
+ tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {$invlo},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, .Lenc_2x_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {$sbou}, v10.16b
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ tbl v8.16b, {$sbot}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v1.16b
+ ret
+.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
+
+.type _vpaes_decrypt_preheat,%function
+.align 4
+_vpaes_decrypt_preheat:
+ adr x10, .Lk_inv
+ movi v17.16b, #0x0f
+ adr x11, .Lk_dipt
+ ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
+ ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
+ ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
+ ld1 {v28.2d-v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe
+ ret
+.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.type _vpaes_decrypt_core,%function
+.align 4
+_vpaes_decrypt_core:
+ mov x9, $key
+ ldr w8, [$key,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11
+ eor x11, x11, #0x30 // xor \$0x30, %r11
+ adr x10, .Lk_sr
+ and x11, x11, #0x30 // and \$0x30, %r11
+ add x11, x11, x10
+ adr x10, .Lk_mc_forward+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
+ tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b .Ldec_entry
+
+.align 4
+.Ldec_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ sub w8, w8, #1 // sub \$1,%rax # nr--
+
+.Ldec_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
+ tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, .Ldec_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+.globl vpaes_decrypt
+.type vpaes_decrypt,%function
+.align 4
+vpaes_decrypt:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [$inp]
+ bl _vpaes_decrypt_preheat
+ bl _vpaes_decrypt_core
+ st1 {v0.16b}, [$out]
+
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_decrypt,.-vpaes_decrypt
+
+// v14-v15 input, v0-v1 output
+.type _vpaes_decrypt_2x,%function
+.align 4
+_vpaes_decrypt_2x:
+ mov x9, $key
+ ldr w8, [$key,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11
+ eor x11, x11, #0x30 // xor \$0x30, %r11
+ adr x10, .Lk_sr
+ and x11, x11, #0x30 // and \$0x30, %r11
+ add x11, x11, x10
+ adr x10, .Lk_mc_forward+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v2.16b, {$iptlo},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ tbl v10.16b, {$iptlo},v9.16b
+ ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {$ipthi},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ tbl v8.16b, {$ipthi},v8.16b
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v10.16b, v10.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b .Ldec_2x_entry
+
+.align 4
+.Ldec_2x_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v12.16b, {$sb9u}, v10.16b
+ tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ tbl v9.16b, {$sb9t}, v11.16b
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ eor v8.16b, v12.16b, v16.16b
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v12.16b, {$sbdu}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ tbl v9.16b, {$sbdt}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v12.16b, {$sbbu}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ tbl v9.16b, {$sbbt}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v12.16b, {$sbeu}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ tbl v9.16b, {$sbet}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ sub w8, w8, #1 // sub \$1,%rax # nr--
+
+.Ldec_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v2.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ tbl v10.16b, {$invhi},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {$invlo},v8.16b
+ tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {$invlo},v9.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v10.16b
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v10.16b
+ tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {$invlo},v11.16b
+ tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {$invlo},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, .Ldec_2x_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {$sbou}, v10.16b
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ tbl v9.16b, {$sbot}, v11.16b
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ eor v8.16b, v9.16b, v12.16b
+ tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v2.16b
+ ret
+.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
+___
+}
+{
+my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
+my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
+
+$code.=<<___;
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.type _vpaes_key_preheat,%function
+.align 4
+_vpaes_key_preheat:
+ adr x10, .Lk_inv
+ movi v16.16b, #0x5b // .Lk_s63
+ adr x11, .Lk_sb1
+ movi v17.16b, #0x0f // .Lk_s0F
+ ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt
+ adr x10, .Lk_dksd
+ ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1
+ adr x11, .Lk_mc_forward
+ ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
+ ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
+ ld1 {v8.2d}, [x10] // .Lk_rcon
+ ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
+ ret
+.size _vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type _vpaes_schedule_core,%function
+.align 4
+_vpaes_schedule_core:
+ stp x29, x30, [sp,#-16]!
+ add x29,sp,#0
+
+ bl _vpaes_key_preheat // load the tables
+
+ ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
+
+ // input transform
+ mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
+
+ adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10
+ add x8, x8, x10
+ cbnz $dir, .Lschedule_am_decrypting
+
+ // encrypting, output zeroth round key after transform
+ st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx)
+ b .Lschedule_go
+
+.Lschedule_am_decrypting:
+ // decrypting, output zeroth round key after shiftrows
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx)
+ eor x8, x8, #0x30 // xor \$0x30, %r8
+
+.Lschedule_go:
+ cmp $bits, #192 // cmp \$192, %esi
+ b.hi .Lschedule_256
+ b.eq .Lschedule_192
+ // 128: fall though
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+ mov $inp, #10 // mov \$10, %esi
+
+.Loop_schedule_128:
+ sub $inp, $inp, #1 // dec %esi
+ bl _vpaes_schedule_round
+ cbz $inp, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // write output
+ b .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 4
+.Lschedule_192:
+ sub $inp, $inp, #8
+ ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
+ ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
+ mov $inp, #4 // mov \$4, %esi
+
+.Loop_schedule_192:
+ sub $inp, $inp, #1 // dec %esi
+ bl _vpaes_schedule_round
+ ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle // save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle // save key n+1
+ bl _vpaes_schedule_round
+ cbz $inp, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // save key n+2
+ bl _vpaes_schedule_192_smear
+ b .Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 4
+.Lschedule_256:
+ ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov $inp, #7 // mov \$7, %esi
+
+.Loop_schedule_256:
+ sub $inp, $inp, #1 // dec %esi
+ bl _vpaes_schedule_mangle // output low result
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ // high round
+ bl _vpaes_schedule_round
+ cbz $inp, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle
+
+ // low round. swap xmm7 and xmm6
+ dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
+ movi v4.16b, #0
+ mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
+ mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
+ bl _vpaes_schedule_low_round
+ mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
+
+ b .Loop_schedule_256
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 4
+.Lschedule_mangle_last:
+ // schedule last round key from xmm0
+ adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ cbnz $dir, .Lschedule_mangle_last_dec
+
+ // encrypting
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
+ adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
+ add $out, $out, #32 // add \$32, %rdx
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
+
+.Lschedule_mangle_last_dec:
+ ld1 {v20.2d-v21.2d}, [x11] // reload constants
+ sub $out, $out, #16 // add \$-16, %rdx
+ eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform // output transform
+ st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key
+
+ // cleanup
+ eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
+ eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
+ eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
+ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
+ eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
+ ldp x29, x30, [sp],#16
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.type _vpaes_schedule_192_smear,%function
+.align 4
+_vpaes_schedule_192_smear:
+ movi v1.16b, #0
+ dup v0.4s, v7.s[3]
+ ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
+ mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
+ ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.type _vpaes_schedule_round,%function
+.align 4
+_vpaes_schedule_round:
+ // extract rcon from xmm8
+ movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
+ ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1
+ ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+
+ // rotate
+ dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
+ ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0
+
+ // fall through...
+
+ // low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ // smear xmm7
+ ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+ ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4
+
+ // subbytes
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
+ eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
+ tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
+ tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
+ eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
+ eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
+ tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
+ tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
+ eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
+
+ // add in smeared stuff
+ eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
+ eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.type _vpaes_schedule_transform,%function
+.align 4
+_vpaes_schedule_transform:
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
+ // vmovdqa (%r11), %xmm2 # lo
+ tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ // vmovdqa 16(%r11), %xmm1 # hi
+ tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.type _vpaes_schedule_mangle,%function
+.align 4
+_vpaes_schedule_mangle:
+ mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ // vmovdqa .Lk_mc_forward(%rip),%xmm5
+ cbnz $dir, .Lschedule_mangle_dec
+
+ // encrypting
+ eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
+ add $out, $out, #16 // add \$16, %rdx
+ tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
+ tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
+ tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
+ eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
+
+ b .Lschedule_mangle_both
+.align 4
+.Lschedule_mangle_dec:
+ // inverse mix columns
+ // lea .Lk_dksd(%rip),%r11
+ ushr v1.16b, v4.16b, #4 // vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
+ and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
+
+ // vmovdqa 0x00(%r11), %xmm2
+ tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ // vmovdqa 0x10(%r11), %xmm3
+ tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x20(%r11), %xmm2
+ tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x30(%r11), %xmm3
+ tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x40(%r11), %xmm2
+ tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x50(%r11), %xmm3
+ tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+
+ // vmovdqa 0x60(%r11), %xmm2
+ tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+ // vmovdqa 0x70(%r11), %xmm4
+ tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
+
+ sub $out, $out, #16 // add \$-16, %rdx
+
+.Lschedule_mangle_both:
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ add x8, x8, #64-16 // add \$-16, %r8
+ and x8, x8, #~(1<<6) // and \$0x30, %r8
+ st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,%function
+.align 4
+vpaes_set_encrypt_key:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, $bits, #5 // shr \$5,%eax
+ add w9, w9, #5 // \$5,%eax
+ str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov $dir, #0 // mov \$0,%ecx
+ mov x8, #0x30 // mov \$0x30,%r8d
+ bl _vpaes_schedule_core
+ eor x0, x0, x0
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,%function
+.align 4
+vpaes_set_decrypt_key:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, $bits, #5 // shr \$5,%eax
+ add w9, w9, #5 // \$5,%eax
+ str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ lsl w9, w9, #4 // shl \$4,%eax
+ add $out, $out, #16 // lea 16(%rdx,%rax),%rdx
+ add $out, $out, x9
+
+ mov $dir, #1 // mov \$1,%ecx
+ lsr w8, $bits, #1 // shr \$1,%r8d
+ and x8, x8, #32 // and \$32,%r8d
+ eor x8, x8, #32 // xor \$32,%r8d # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+___
+}
+{
+my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));
+
+$code.=<<___;
+.globl vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,%function
+.align 4
+vpaes_cbc_encrypt:
+ cbz $len, .Lcbc_abort
+ cmp w5, #0 // check direction
+ b.eq vpaes_cbc_decrypt
+
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ mov x17, $len // reassign
+ mov x2, $key // reassign
+
+ ld1 {v0.16b}, [$ivec] // load ivec
+ bl _vpaes_encrypt_preheat
+ b .Lcbc_enc_loop
+
+.align 4
+.Lcbc_enc_loop:
+ ld1 {v7.16b}, [$inp],#16 // load input
+ eor v7.16b, v7.16b, v0.16b // xor with ivec
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [$out],#16 // save output
+ subs x17, x17, #16
+ b.hi .Lcbc_enc_loop
+
+ st1 {v0.16b}, [$ivec] // write ivec
+
+ ldp x29,x30,[sp],#16
+.Lcbc_abort:
+ ret
+.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+.type vpaes_cbc_decrypt,%function
+.align 4
+vpaes_cbc_decrypt:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ mov x17, $len // reassign
+ mov x2, $key // reassign
+ ld1 {v6.16b}, [$ivec] // load ivec
+ bl _vpaes_decrypt_preheat
+ tst x17, #16
+ b.eq .Lcbc_dec_loop2x
+
+ ld1 {v7.16b}, [$inp], #16 // load input
+ bl _vpaes_decrypt_core
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ orr v6.16b, v7.16b, v7.16b // next ivec value
+ st1 {v0.16b}, [$out], #16
+ subs x17, x17, #16
+ b.ls .Lcbc_dec_done
+
+.align 4
+.Lcbc_dec_loop2x:
+ ld1 {v14.16b,v15.16b}, [$inp], #32
+ bl _vpaes_decrypt_2x
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ eor v1.16b, v1.16b, v14.16b
+ orr v6.16b, v15.16b, v15.16b
+ st1 {v0.16b,v1.16b}, [$out], #32
+ subs x17, x17, #32
+ b.hi .Lcbc_dec_loop2x
+
+.Lcbc_dec_done:
+ st1 {v6.16b}, [$ivec]
+
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
+___
+if (1) {
+$code.=<<___;
+.globl vpaes_ecb_encrypt
+.type vpaes_ecb_encrypt,%function
+.align 4
+vpaes_ecb_encrypt:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ mov x17, $len
+ mov x2, $key
+ bl _vpaes_encrypt_preheat
+ tst x17, #16
+ b.eq .Lecb_enc_loop
+
+ ld1 {v7.16b}, [$inp],#16
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [$out],#16
+ subs x17, x17, #16
+ b.ls .Lecb_enc_done
+
+.align 4
+.Lecb_enc_loop:
+ ld1 {v14.16b,v15.16b}, [$inp], #32
+ bl _vpaes_encrypt_2x
+ st1 {v0.16b,v1.16b}, [$out], #32
+ subs x17, x17, #32
+ b.hi .Lecb_enc_loop
+
+.Lecb_enc_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
+
+.globl vpaes_ecb_decrypt
+.type vpaes_ecb_decrypt,%function
+.align 4
+vpaes_ecb_decrypt:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ mov x17, $len
+ mov x2, $key
+ bl _vpaes_decrypt_preheat
+ tst x17, #16
+ b.eq .Lecb_dec_loop
+
+ ld1 {v7.16b}, [$inp],#16
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [$out],#16
+ subs x17, x17, #16
+ b.ls .Lecb_dec_done
+
+.align 4
+.Lecb_dec_loop:
+ ld1 {v14.16b,v15.16b}, [$inp], #32
+ bl _vpaes_decrypt_2x
+ st1 {v0.16b,v1.16b}, [$out], #32
+ subs x17, x17, #32
+ b.hi .Lecb_dec_loop
+
+.Lecb_dec_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
+___
+} }
+print $code;
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/vpaes-ppc.pl b/openssl-1.1.0h/crypto/aes/asm/vpaes-ppc.pl
new file mode 100644
index 0000000..bb38fbe
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/vpaes-ppc.pl
@@ -0,0 +1,1594 @@
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+# CBC encrypt/decrypt performance in cycles per byte processed with
+# 128-bit key.
+#
+# aes-ppc.pl this
+# PPC74x0/G4e 35.5/52.1/(23.8) 11.9(*)/15.4
+# PPC970/G5 37.9/55.0/(28.5) 22.2/28.5
+# POWER6 42.7/54.3/(28.2) 63.0/92.8(**)
+# POWER7 32.3/42.9/(18.4) 18.5/23.3
+#
+# (*) This is ~10% worse than reported in paper. The reason is
+# twofold. This module doesn't make any assumption about
+# key schedule (or data for that matter) alignment and handles
+# it in-line. Secondly it, being transliterated from
+# vpaes-x86_64.pl, relies on "nested inversion" better suited
+# for Intel CPUs.
+# (**) Inadequate POWER6 performance is due to astronomic AltiVec
+# latency, 9 cycles per simple logical operation.
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+ $UCMP ="cmpld";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T =4;
+ $LRSAVE =$SIZE_T;
+ $STU ="stwu";
+ $POP ="lwz";
+ $PUSH ="stw";
+ $UCMP ="cmplw";
+} else { die "nonsense $flavour"; }
+
+$sp="r1";
+$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$code.=<<___;
+.machine "any"
+
+.text
+
+.align 7 # totally strategic alignment
+_vpaes_consts:
+Lk_mc_forward: # mc_forward
+ .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
+ .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
+ .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
+ .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
+Lk_mc_backward: # mc_backward
+ .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
+ .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
+ .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
+ .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
+Lk_sr: # sr
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
+ .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
+ .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
+ .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
+
+##
+## "Hot" constants
+##
+Lk_inv: # inv, inva
+ .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
+ .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
+Lk_ipt: # input transform (lo, hi)
+ .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
+ .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
+Lk_sbo: # sbou, sbot
+ .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
+ .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
+Lk_sb1: # sb1u, sb1t
+ .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
+ .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
+Lk_sb2: # sb2u, sb2t
+ .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
+ .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
+
+##
+## Decryption stuff
+##
+Lk_dipt: # decryption input transform
+ .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
+ .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
+Lk_dsbo: # decryption sbox final output
+ .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
+ .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
+Lk_dsb9: # decryption sbox output *9*u, *9*t
+ .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
+ .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
+Lk_dsbd: # decryption sbox output *D*u, *D*t
+ .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
+ .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
+Lk_dsbb: # decryption sbox output *B*u, *B*t
+ .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
+ .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
+Lk_dsbe: # decryption sbox output *E*u, *E*t
+ .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
+ .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
+
+##
+## Key schedule constants
+##
+Lk_dksd: # decryption key schedule: invskew x*D
+ .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
+ .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
+Lk_dksb: # decryption key schedule: invskew x*B
+ .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
+ .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
+Lk_dkse: # decryption key schedule: invskew x*E + 0x63
+ .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
+ .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
+Lk_dks9: # decryption key schedule: invskew x*9
+ .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
+ .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
+
+Lk_rcon: # rcon
+ .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
+Lk_s63:
+ .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
+
+Lk_opt: # output transform
+ .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
+ .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
+Lk_deskew: # deskew tables: inverts the sbox's "skew"
+ .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
+ .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
+.align 5
+Lconsts:
+ mflr r0
+ bcl 20,31,\$+4
+ mflr r12 #vvvvv "distance between . and _vpaes_consts
+ addi r12,r12,-0x308
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
+.align 6
+___
+
+my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
+{
+my ($inp,$out,$key) = map("r$_",(3..5));
+
+my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
+my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
+
+$code.=<<___;
+##
+## _aes_preheat
+##
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below.
+##
+.align 4
+_vpaes_encrypt_preheat:
+ mflr r8
+ bl Lconsts
+ mtlr r8
+ li r11, 0xc0 # Lk_inv
+ li r10, 0xd0
+ li r9, 0xe0 # Lk_ipt
+ li r8, 0xf0
+ vxor v7, v7, v7 # 0x00..00
+ vspltisb v8,4 # 0x04..04
+ vspltisb v9,0x0f # 0x0f..0f
+ lvx $invlo, r12, r11
+ li r11, 0x100
+ lvx $invhi, r12, r10
+ li r10, 0x110
+ lvx $iptlo, r12, r9
+ li r9, 0x120
+ lvx $ipthi, r12, r8
+ li r8, 0x130
+ lvx $sbou, r12, r11
+ li r11, 0x140
+ lvx $sbot, r12, r10
+ li r10, 0x150
+ lvx $sb1u, r12, r9
+ lvx $sb1t, r12, r8
+ lvx $sb2u, r12, r11
+ lvx $sb2t, r12, r10
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
+##
+##
+.align 5
+_vpaes_encrypt_core:
+ lwz r8, 240($key) # pull rounds
+ li r9, 16
+ lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
+ li r11, 0x10
+ lvx v6, r9, $key
+ addi r9, r9, 16
+ ?vperm v5, v5, v6, $keyperm # align round key
+ addi r10, r11, 0x40
+ vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
+ vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
+ vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
+ vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
+ vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
+ mtctr r8
+ b Lenc_entry
+
+.align 4
+Lenc_loop:
+ # middle of middle round
+ vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ addi r11, r11, 16
+ vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
+ vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ addi r10, r11, 0x40
+ vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+
+Lenc_entry:
+ # top of round
+ vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
+ vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vand v0, v0, v9
+ vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ vmr v5, v6
+ lvx v6, r9, $key # vmovdqu (%r9), %xmm5
+ vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ addi r9, r9, 16
+ vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ ?vperm v5, v5, v6, $keyperm # align round key
+ vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ bdnz Lenc_loop
+
+ # middle of last round
+ addi r10, r11, 0x80
+ # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.globl .vpaes_encrypt
+.align 5
+.vpaes_encrypt:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mflr r6
+ mfspr r7, 256 # save vrsave
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r7,`$FRAME-4`($sp) # save vrsave
+ li r0, -1
+ $PUSH r6,`$FRAME+$LRSAVE`($sp)
+ mtspr 256, r0 # preserve all AltiVec registers
+
+ bl _vpaes_encrypt_preheat
+
+ ?lvsl $inpperm, 0, $inp # prepare for unaligned access
+ lvx v0, 0, $inp
+ addi $inp, $inp, 15 # 15 is not a typo
+ ?lvsr $outperm, 0, $out
+ ?lvsl $keyperm, 0, $key # prepare for unaligned access
+ lvx $inptail, 0, $inp # redundant in aligned case
+ ?vperm v0, v0, $inptail, $inpperm
+
+ bl _vpaes_encrypt_core
+
+ andi. r8, $out, 15
+ li r9, 16
+ beq Lenc_out_aligned
+
+ vperm v0, v0, v0, $outperm # rotate right/left
+ mtctr r9
+Lenc_out_unaligned:
+ stvebx v0, 0, $out
+ addi $out, $out, 1
+ bdnz Lenc_out_unaligned
+ b Lenc_done
+
+.align 4
+Lenc_out_aligned:
+ stvx v0, 0, $out
+Lenc_done:
+
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtlr r6
+ mtspr 256, r7 # restore vrsave
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,0,3,0
+ .long 0
+.size .vpaes_encrypt,.-.vpaes_encrypt
+
+.align 4
+_vpaes_decrypt_preheat:
+ mflr r8
+ bl Lconsts
+ mtlr r8
+ li r11, 0xc0 # Lk_inv
+ li r10, 0xd0
+ li r9, 0x160 # Ldipt
+ li r8, 0x170
+ vxor v7, v7, v7 # 0x00..00
+ vspltisb v8,4 # 0x04..04
+ vspltisb v9,0x0f # 0x0f..0f
+ lvx $invlo, r12, r11
+ li r11, 0x180
+ lvx $invhi, r12, r10
+ li r10, 0x190
+ lvx $iptlo, r12, r9
+ li r9, 0x1a0
+ lvx $ipthi, r12, r8
+ li r8, 0x1b0
+ lvx $sbou, r12, r11
+ li r11, 0x1c0
+ lvx $sbot, r12, r10
+ li r10, 0x1d0
+ lvx $sb9u, r12, r9
+ li r9, 0x1e0
+ lvx $sb9t, r12, r8
+ li r8, 0x1f0
+ lvx $sbdu, r12, r11
+ li r11, 0x200
+ lvx $sbdt, r12, r10
+ li r10, 0x210
+ lvx $sbbu, r12, r9
+ lvx $sbbt, r12, r8
+ lvx $sbeu, r12, r11
+ lvx $sbet, r12, r10
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.align 4
+_vpaes_decrypt_core:
+ lwz r8, 240($key) # pull rounds
+ li r9, 16
+ lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
+ li r11, 0x30
+ lvx v6, r9, $key
+ addi r9, r9, 16
+ ?vperm v5, v5, v6, $keyperm # align round key
+ vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
+ vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
+ vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
+ vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
+ vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
+ mtctr r8
+ b Ldec_entry
+
+.align 4
+Ldec_loop:
+#
+# Inverse mix columns
+#
+ lvx v0, r12, r11 # v5 and v0 are flipped
+ # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ subi r11, r11, 16
+ vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ andi. r11, r11, 0x30
+ vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
+ # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+
+Ldec_entry:
+ # top of round
+ vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
+ vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vand v0, v0, v9
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ vmr v5, v6
+ lvx v6, r9, $key # vmovdqu (%r9), %xmm0
+ vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ addi r9, r9, 16
+ vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ ?vperm v5, v5, v6, $keyperm # align round key
+ vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ bdnz Ldec_loop
+
+ # middle of last round
+ addi r10, r11, 0x80
+ # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.globl .vpaes_decrypt
+.align 5
+.vpaes_decrypt:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mflr r6
+ mfspr r7, 256 # save vrsave
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r7,`$FRAME-4`($sp) # save vrsave
+ li r0, -1
+ $PUSH r6,`$FRAME+$LRSAVE`($sp)
+ mtspr 256, r0 # preserve all AltiVec registers
+
+ bl _vpaes_decrypt_preheat
+
+ ?lvsl $inpperm, 0, $inp # prepare for unaligned access
+ lvx v0, 0, $inp
+ addi $inp, $inp, 15 # 15 is not a typo
+ ?lvsr $outperm, 0, $out
+ ?lvsl $keyperm, 0, $key
+ lvx $inptail, 0, $inp # redundant in aligned case
+ ?vperm v0, v0, $inptail, $inpperm
+
+ bl _vpaes_decrypt_core
+
+ andi. r8, $out, 15
+ li r9, 16
+ beq Ldec_out_aligned
+
+ vperm v0, v0, v0, $outperm # rotate right/left
+ mtctr r9
+Ldec_out_unaligned:
+ stvebx v0, 0, $out
+ addi $out, $out, 1
+ bdnz Ldec_out_unaligned
+ b Ldec_done
+
+.align 4
+Ldec_out_aligned:
+ stvx v0, 0, $out
+Ldec_done:
+
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtlr r6
+ mtspr 256, r7 # restore vrsave
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,0,3,0
+ .long 0
+.size .vpaes_decrypt,.-.vpaes_decrypt
+
+.globl .vpaes_cbc_encrypt
+.align 5
+.vpaes_cbc_encrypt:
+ ${UCMP}i r5,16
+ bltlr-
+
+ $STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
+ mflr r0
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mfspr r12, 256
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r12,`$FRAME-4`($sp) # save vrsave
+ $PUSH r30,`$FRAME+$SIZE_T*0`($sp)
+ $PUSH r31,`$FRAME+$SIZE_T*1`($sp)
+ li r9, -16
+ $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
+
+ and r30, r5, r9 # copy length&-16
+ andi. r9, $out, 15 # is $out aligned?
+ mr r5, r6 # copy pointer to key
+ mr r31, r7 # copy pointer to iv
+ li r6, -1
+ mcrf cr1, cr0 # put aside $out alignment flag
+ mr r7, r12 # copy vrsave
+ mtspr 256, r6 # preserve all AltiVec registers
+
+ lvx v24, 0, r31 # load [potentially unaligned] iv
+ li r9, 15
+ ?lvsl $inpperm, 0, r31
+ lvx v25, r9, r31
+ ?vperm v24, v24, v25, $inpperm
+
+ cmpwi r8, 0 # test direction
+ neg r8, $inp # prepare for unaligned access
+ vxor v7, v7, v7
+ ?lvsl $keyperm, 0, $key
+ ?lvsr $outperm, 0, $out
+ ?lvsr $inpperm, 0, r8 # -$inp
+ vnor $outmask, v7, v7 # 0xff..ff
+ lvx $inptail, 0, $inp
+ ?vperm $outmask, v7, $outmask, $outperm
+ addi $inp, $inp, 15 # 15 is not a typo
+
+ beq Lcbc_decrypt
+
+ bl _vpaes_encrypt_preheat
+ li r0, 16
+
+ beq cr1, Lcbc_enc_loop # $out is aligned
+
+ vmr v0, $inptail
+ lvx $inptail, 0, $inp
+ addi $inp, $inp, 16
+ ?vperm v0, v0, $inptail, $inpperm
+ vxor v0, v0, v24 # ^= iv
+
+ bl _vpaes_encrypt_core
+
+ andi. r8, $out, 15
+ vmr v24, v0 # put aside iv
+ sub r9, $out, r8
+ vperm $outhead, v0, v0, $outperm # rotate right/left
+
+Lcbc_enc_head:
+ stvebx $outhead, r8, r9
+ cmpwi r8, 15
+ addi r8, r8, 1
+ bne Lcbc_enc_head
+
+ sub. r30, r30, r0 # len -= 16
+ addi $out, $out, 16
+ beq Lcbc_unaligned_done
+
+Lcbc_enc_loop:
+ vmr v0, $inptail
+ lvx $inptail, 0, $inp
+ addi $inp, $inp, 16
+ ?vperm v0, v0, $inptail, $inpperm
+ vxor v0, v0, v24 # ^= iv
+
+ bl _vpaes_encrypt_core
+
+ vmr v24, v0 # put aside iv
+ sub. r30, r30, r0 # len -= 16
+ vperm v0, v0, v0, $outperm # rotate right/left
+ vsel v1, $outhead, v0, $outmask
+ vmr $outhead, v0
+ stvx v1, 0, $out
+ addi $out, $out, 16
+ bne Lcbc_enc_loop
+
+ b Lcbc_done
+
+.align 5
+Lcbc_decrypt:
+ bl _vpaes_decrypt_preheat
+ li r0, 16
+
+ beq cr1, Lcbc_dec_loop # $out is aligned
+
+ vmr v0, $inptail
+ lvx $inptail, 0, $inp
+ addi $inp, $inp, 16
+ ?vperm v0, v0, $inptail, $inpperm
+ vmr v25, v0 # put aside input
+
+ bl _vpaes_decrypt_core
+
+ andi. r8, $out, 15
+ vxor v0, v0, v24 # ^= iv
+ vmr v24, v25
+ sub r9, $out, r8
+ vperm $outhead, v0, v0, $outperm # rotate right/left
+
+Lcbc_dec_head:
+ stvebx $outhead, r8, r9
+ cmpwi r8, 15
+ addi r8, r8, 1
+ bne Lcbc_dec_head
+
+ sub. r30, r30, r0 # len -= 16
+ addi $out, $out, 16
+ beq Lcbc_unaligned_done
+
+Lcbc_dec_loop:
+ vmr v0, $inptail
+ lvx $inptail, 0, $inp
+ addi $inp, $inp, 16
+ ?vperm v0, v0, $inptail, $inpperm
+ vmr v25, v0 # put aside input
+
+ bl _vpaes_decrypt_core
+
+ vxor v0, v0, v24 # ^= iv
+ vmr v24, v25
+ sub. r30, r30, r0 # len -= 16
+ vperm v0, v0, v0, $outperm # rotate right/left
+ vsel v1, $outhead, v0, $outmask
+ vmr $outhead, v0
+ stvx v1, 0, $out
+ addi $out, $out, 16
+ bne Lcbc_dec_loop
+
+Lcbc_done:
+ beq cr1, Lcbc_write_iv # $out is aligned
+
+Lcbc_unaligned_done:
+ andi. r8, $out, 15
+ sub $out, $out, r8
+ li r9, 0
+Lcbc_tail:
+ stvebx $outhead, r9, $out
+ addi r9, r9, 1
+ cmpw r9, r8
+ bne Lcbc_tail
+
+Lcbc_write_iv:
+ neg r8, r31 # write [potentially unaligned] iv
+ li r10, 4
+ ?lvsl $outperm, 0, r8
+ li r11, 8
+ li r12, 12
+ vperm v24, v24, v24, $outperm # rotate right/left
+ stvewx v24, 0, r31 # ivp is at least 32-bit aligned
+ stvewx v24, r10, r31
+ stvewx v24, r11, r31
+ stvewx v24, r12, r31
+
+ mtspr 256, r7 # restore vrsave
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+Lcbc_abort:
+ $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
+ $POP r30,`$FRAME+$SIZE_T*0`($sp)
+ $POP r31,`$FRAME+$SIZE_T*1`($sp)
+ mtlr r0
+ addi $sp,$sp,`$FRAME+$SIZE_T*2`
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,2,6,0
+ .long 0
+.size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
+___
+}
+{
+my ($inp,$bits,$out)=map("r$_",(3..5));
+my $dir="cr1";
+my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
+
+$code.=<<___;
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.align 4
+_vpaes_key_preheat:
+ mflr r8
+ bl Lconsts
+ mtlr r8
+ li r11, 0xc0 # Lk_inv
+ li r10, 0xd0
+ li r9, 0xe0 # L_ipt
+ li r8, 0xf0
+
+ vspltisb v8,4 # 0x04..04
+ vxor v9,v9,v9 # 0x00..00
+ lvx $invlo, r12, r11 # Lk_inv
+ li r11, 0x120
+ lvx $invhi, r12, r10
+ li r10, 0x130
+ lvx $iptlo, r12, r9 # Lk_ipt
+ li r9, 0x220
+ lvx $ipthi, r12, r8
+ li r8, 0x230
+
+ lvx v14, r12, r11 # Lk_sb1
+ li r11, 0x240
+ lvx v15, r12, r10
+ li r10, 0x250
+
+ lvx v16, r12, r9 # Lk_dksd
+ li r9, 0x260
+ lvx v17, r12, r8
+ li r8, 0x270
+ lvx v18, r12, r11 # Lk_dksb
+ li r11, 0x280
+ lvx v19, r12, r10
+ li r10, 0x290
+ lvx v20, r12, r9 # Lk_dkse
+ li r9, 0x2a0
+ lvx v21, r12, r8
+ li r8, 0x2b0
+ lvx v22, r12, r11 # Lk_dks9
+ lvx v23, r12, r10
+
+ lvx v24, r12, r9 # Lk_rcon
+ lvx v25, 0, r12 # Lk_mc_forward[0]
+ lvx v26, r12, r8 # Lks63
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.align 4
+_vpaes_schedule_core:
+ mflr r7
+
+ bl _vpaes_key_preheat # load the tables
+
+ #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
+ neg r8, $inp # prepare for unaligned access
+ lvx v0, 0, $inp
+ addi $inp, $inp, 15 # 15 is not typo
+ ?lvsr $inpperm, 0, r8 # -$inp
+ lvx v6, 0, $inp # v6 serves as inptail
+ addi $inp, $inp, 8
+ ?vperm v0, v0, v6, $inpperm
+
+ # input transform
+ vmr v3, v0 # vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ vmr v7, v0 # vmovdqa %xmm0, %xmm7
+
+ bne $dir, Lschedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ li r8, 0x30 # mov \$0x30,%r8d
+ li r9, 4
+ li r10, 8
+ li r11, 12
+
+ ?lvsr $outperm, 0, $out # prepare for unaligned access
+ vnor $outmask, v9, v9 # 0xff..ff
+ ?vperm $outmask, v9, $outmask, $outperm
+
+ #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
+ vperm $outhead, v0, v0, $outperm # rotate right/left
+ stvewx $outhead, 0, $out # some are superfluous
+ stvewx $outhead, r9, $out
+ stvewx $outhead, r10, $out
+ addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
+ stvewx $outhead, r11, $out
+ b Lschedule_go
+
+Lschedule_am_decrypting:
+ srwi r8, $bits, 1 # shr \$1,%r8d
+ andi. r8, r8, 32 # and \$32,%r8d
+ xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
+ addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
+ # decrypting, output zeroth round key after shiftrows
+ lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
+ li r9, 4
+ li r10, 8
+ li r11, 12
+ vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
+
+ neg r0, $out # prepare for unaligned access
+ ?lvsl $outperm, 0, r0
+ vnor $outmask, v9, v9 # 0xff..ff
+ ?vperm $outmask, $outmask, v9, $outperm
+
+ #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
+ vperm $outhead, v4, v4, $outperm # rotate right/left
+ stvewx $outhead, 0, $out # some are superfluous
+ stvewx $outhead, r9, $out
+ stvewx $outhead, r10, $out
+ addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
+ stvewx $outhead, r11, $out
+ addi $out, $out, 15 # 15 is not typo
+ xori r8, r8, 0x30 # xor \$0x30, %r8
+
+Lschedule_go:
+ cmplwi $bits, 192 # cmp \$192, %esi
+ bgt Lschedule_256
+ beq Lschedule_192
+ # 128: fall though
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+Lschedule_128:
+ li r0, 10 # mov \$10, %esi
+ mtctr r0
+
+Loop_schedule_128:
+ bl _vpaes_schedule_round
+ bdz Lschedule_mangle_last # dec %esi
+ bl _vpaes_schedule_mangle # write output
+ b Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 4
+Lschedule_192:
+ li r0, 4 # mov \$4, %esi
+ lvx v0, 0, $inp
+ ?vperm v0, v6, v0, $inpperm
+ ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform # input transform
+ ?vsldoi v6, v0, v9, 8
+ ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
+ mtctr r0
+
+Loop_schedule_192:
+ bl _vpaes_schedule_round
+ ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle # save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle # save key n+1
+ bl _vpaes_schedule_round
+ bdz Lschedule_mangle_last # dec %esi
+ bl _vpaes_schedule_mangle # save key n+2
+ bl _vpaes_schedule_192_smear
+ b Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 4
+Lschedule_256:
+ li r0, 7 # mov \$7, %esi
+ addi $inp, $inp, 8
+ lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ ?vperm v0, v6, v0, $inpperm
+ bl _vpaes_schedule_transform # input transform
+ mtctr r0
+
+Loop_schedule_256:
+ bl _vpaes_schedule_mangle # output low result
+ vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ # high round
+ bl _vpaes_schedule_round
+ bdz Lschedule_mangle_last # dec %esi
+ bl _vpaes_schedule_mangle
+
+ # low round. swap xmm7 and xmm6
+ ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
+ vmr v5, v7 # vmovdqa %xmm7, %xmm5
+ vmr v7, v6 # vmovdqa %xmm6, %xmm7
+ bl _vpaes_schedule_low_round
+ vmr v7, v5 # vmovdqa %xmm5, %xmm7
+
+ b Loop_schedule_256
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 4
+Lschedule_mangle_last:
+ # schedule last round key from xmm0
+ li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
+ li r9, 0x2f0
+ bne $dir, Lschedule_mangle_last_dec
+
+ # encrypting
+ lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
+ li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
+ li r9, 0x2d0 # prepare to output transform
+ vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
+
+ lvx $iptlo, r11, r12 # reload $ipt
+ lvx $ipthi, r9, r12
+ addi $out, $out, 16 # add \$16, %rdx
+ vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform # output transform
+
+ #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
+ vperm v0, v0, v0, $outperm # rotate right/left
+ li r10, 4
+ vsel v2, $outhead, v0, $outmask
+ li r11, 8
+ stvx v2, 0, $out
+ li r12, 12
+ stvewx v0, 0, $out # some (or all) are redundant
+ stvewx v0, r10, $out
+ stvewx v0, r11, $out
+ stvewx v0, r12, $out
+ b Lschedule_mangle_done
+
+.align 4
+Lschedule_mangle_last_dec:
+ lvx $iptlo, r11, r12 # reload $ipt
+ lvx $ipthi, r9, r12
+ addi $out, $out, -16 # add \$-16, %rdx
+ vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform # output transform
+
+ #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
+ addi r9, $out, -15 # -15 is not typo
+ vperm v0, v0, v0, $outperm # rotate right/left
+ li r10, 4
+ vsel v2, $outhead, v0, $outmask
+ li r11, 8
+ stvx v2, 0, $out
+ li r12, 12
+ stvewx v0, 0, r9 # some (or all) are redundant
+ stvewx v0, r10, r9
+ stvewx v0, r11, r9
+ stvewx v0, r12, r9
+
+
+Lschedule_mangle_done:
+ mtlr r7
+ # cleanup
+ vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
+ vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
+ vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
+ vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
+ vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
+ vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
+ vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
+ vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
+
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.align 4
+_vpaes_schedule_192_smear:
+ ?vspltw v0, v7, 3
+ ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
+ vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
+ vmr v0, v6
+ ?vsldoi v6, v6, v9, 8
+ ?vsldoi v6, v9, v6, 8 # clobber low side with zeros
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.align 4
+_vpaes_schedule_round:
+ # extract rcon from xmm8
+ #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
+ ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
+ ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
+ vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
+
+ # rotate
+ ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
+ ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ # smear xmm7
+ ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
+ vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
+ vspltisb v1, 0x0f # 0x0f..0f
+ ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
+
+ # subbytes
+ vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
+ vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
+ vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
+ vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
+ vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
+ vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
+ vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
+ vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
+ vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
+ vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
+
+ # add in smeared stuff
+ vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
+ vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm2
+##
+.align 4
+_vpaes_schedule_transform:
+ #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
+ vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
+ # vmovdqa (%r11), %xmm2 # lo
+ vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
+ # vmovdqa 16(%r11), %xmm1 # hi
+ vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
+ vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.align 4
+_vpaes_schedule_mangle:
+ #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ # vmovdqa .Lk_mc_forward(%rip),%xmm5
+ bne $dir, Lschedule_mangle_dec
+
+ # encrypting
+ vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
+ addi $out, $out, 16 # add \$16, %rdx
+ vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
+ vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
+ vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
+ vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
+ lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
+ vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
+
+ vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
+ addi r8, r8, -16 # add \$-16, %r8
+ andi. r8, r8, 0x30 # and \$0x30, %r8
+
+ #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
+ vperm v1, v3, v3, $outperm # rotate right/left
+ vsel v2, $outhead, v1, $outmask
+ vmr $outhead, v1
+ stvx v2, 0, $out
+ blr
+
+.align 4
+Lschedule_mangle_dec:
+ # inverse mix columns
+ # lea .Lk_dksd(%rip),%r11
+ vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
+ #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
+
+ # vmovdqa 0x00(%r11), %xmm2
+ vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
+ # vmovdqa 0x10(%r11), %xmm3
+ vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
+ vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
+
+ # vmovdqa 0x20(%r11), %xmm2
+ vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
+ vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
+ # vmovdqa 0x30(%r11), %xmm3
+ vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
+ vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
+
+ # vmovdqa 0x40(%r11), %xmm2
+ vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
+ vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
+ # vmovdqa 0x50(%r11), %xmm3
+ vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
+ vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
+
+ # vmovdqa 0x60(%r11), %xmm2
+ vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
+ vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
+ # vmovdqa 0x70(%r11), %xmm4
+ vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
+ lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
+ vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
+ vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
+
+ addi $out, $out, -16 # add \$-16, %rdx
+
+ vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
+ addi r8, r8, -16 # add \$-16, %r8
+ andi. r8, r8, 0x30 # and \$0x30, %r8
+
+ #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
+ vperm v1, v3, v3, $outperm # rotate right/left
+ vsel v2, $outhead, v1, $outmask
+ vmr $outhead, v1
+ stvx v2, 0, $out
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.globl .vpaes_set_encrypt_key
+.align 5
+.vpaes_set_encrypt_key:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mflr r0
+ mfspr r6, 256 # save vrsave
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r6,`$FRAME-4`($sp) # save vrsave
+ li r7, -1
+ $PUSH r0, `$FRAME+$LRSAVE`($sp)
+ mtspr 256, r7 # preserve all AltiVec registers
+
+ srwi r9, $bits, 5 # shr \$5,%eax
+ addi r9, r9, 6 # add \$5,%eax
+ stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ cmplw $dir, $bits, $bits # set encrypt direction
+ li r8, 0x30 # mov \$0x30,%r8d
+ bl _vpaes_schedule_core
+
+ $POP r0, `$FRAME+$LRSAVE`($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtspr 256, r6 # restore vrsave
+ mtlr r0
+ xor r3, r3, r3
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,0,3,0
+ .long 0
+.size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
+
+.globl .vpaes_set_decrypt_key
+.align 4
+.vpaes_set_decrypt_key:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mflr r0
+ mfspr r6, 256 # save vrsave
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r6,`$FRAME-4`($sp) # save vrsave
+ li r7, -1
+ $PUSH r0, `$FRAME+$LRSAVE`($sp)
+ mtspr 256, r7 # preserve all AltiVec registers
+
+ srwi r9, $bits, 5 # shr \$5,%eax
+ addi r9, r9, 6 # add \$5,%eax
+ stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ slwi r9, r9, 4 # shl \$4,%eax
+ add $out, $out, r9 # lea (%rdx,%rax),%rdx
+
+ cmplwi $dir, $bits, 0 # set decrypt direction
+ srwi r8, $bits, 1 # shr \$1,%r8d
+ andi. r8, r8, 32 # and \$32,%r8d
+ xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+ $POP r0, `$FRAME+$LRSAVE`($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtspr 256, r6 # restore vrsave
+ mtlr r0
+ xor r3, r3, r3
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,0,3,0
+ .long 0
+.size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
+___
+}
+
+my $consts=1;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ # constants table endian-specific conversion
+ if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
+ my $conv=$2;
+ my @bytes=();
+
+ # convert to endian-agnostic format
+ foreach (split(/,\s+/,$1)) {
+ my $l = /^0/?oct:int;
+ push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+ }
+
+ # little-endian conversion
+ if ($flavour =~ /le$/o) {
+ SWITCH: for($conv) {
+ /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
+ /\?rev/ && do { @bytes=reverse(@bytes); last; };
+ }
+ }
+
+ #emit
+ print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+ next;
+ }
+ $consts=0 if (m/Lconsts:/o); # end of table
+
+ # instructions prefixed with '?' are endian-specific and need
+ # to be adjusted accordingly...
+ if ($flavour =~ /le$/o) { # little-endian
+ s/\?lvsr/lvsl/o or
+ s/\?lvsl/lvsr/o or
+ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+ s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+ s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+ } else { # big-endian
+ s/\?([a-z]+)/$1/o;
+ }
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/vpaes-x86.pl b/openssl-1.1.0h/crypto/aes/asm/vpaes-x86.pl
new file mode 100644
index 0000000..47615c0
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/vpaes-x86.pl
@@ -0,0 +1,916 @@
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
+# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original nor does it make assumption
+# about its alignment...
+#
+# Performance summary. aes-586.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86.pl column - [also
+# large-block CBC] encrypt/decrypt.
+#
+# aes-586.pl vpaes-x86.pl
+#
+# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
+# Nehalem 27.9/40.4/18.1 10.2/11.9
+# Atom 70.7/92.1/60.1 61.1/75.4(***)
+# Silvermont 45.4/62.9/24.1 49.2/61.1(***)
+#
+# (*) "Hyper-threading" in the context refers rather to cache shared
+# among multiple cores, than to specifically Intel HTT. As vast
+# majority of contemporary cores share cache, slower code path
+# is common place. In other words "with-hyper-threading-off"
+# results are presented mostly for reference purposes.
+#
+# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***) Less impressive improvement on Core 2 and Atom is due to slow
+# pshufb, yet it's respectable +28%/64% improvement on Core 2
+# and +15% on Atom (as implied, over "hyper-threading-safe"
+# code path).
+#
+# <appro@openssl.org>
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+$output = pop;
+open OUT,">$output";
+*STDOUT=*OUT;
+
+&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$PREFIX="vpaes";
+
+my ($round, $base, $magic, $key, $const, $inp, $out)=
+ ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
+
+&static_label("_vpaes_consts");
+&static_label("_vpaes_schedule_low_round");
+
+&set_label("_vpaes_consts",64);
+$k_inv=-0x30; # inv, inva
+ &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
+ &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
+
+$k_s0F=-0x10; # s0F
+ &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
+
+$k_ipt=0x00; # input transform (lo, hi)
+ &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
+ &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
+
+$k_sb1=0x20; # sb1u, sb1t
+ &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
+ &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
+$k_sb2=0x40; # sb2u, sb2t
+ &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
+ &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
+$k_sbo=0x60; # sbou, sbot
+ &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
+ &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
+
+$k_mc_forward=0x80; # mc_forward
+ &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
+ &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
+ &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
+ &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
+
+$k_mc_backward=0xc0; # mc_backward
+ &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
+ &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
+ &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
+ &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
+
+$k_sr=0x100; # sr
+ &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
+ &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
+ &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
+ &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
+
+$k_rcon=0x140; # rcon
+ &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
+
+$k_s63=0x150; # s63: all equal to 0x63 transformed
+ &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
+
+$k_opt=0x160; # output transform
+ &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
+ &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
+
+$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
+ &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
+ &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
+##
+## Decryption stuff
+## Key schedule constants
+##
+$k_dksd=0x1a0; # decryption key schedule: invskew x*D
+ &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
+ &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
+$k_dksb=0x1c0; # decryption key schedule: invskew x*B
+ &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
+ &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
+$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
+ &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
+ &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
+$k_dks9=0x200; # decryption key schedule: invskew x*9
+ &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
+ &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
+
+##
+## Decryption stuff
+## Round function constants
+##
+$k_dipt=0x220; # decryption input transform
+ &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
+ &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
+
+$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
+ &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
+ &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
+$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
+ &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
+ &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
+$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
+ &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
+ &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
+$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
+ &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
+ &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
+$k_dsbo=0x2c0; # decryption sbox final output
+ &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
+ &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
+&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
+&align (64);
+
+&function_begin_B("_vpaes_preheat");
+ &add ($const,&DWP(0,"esp"));
+ &movdqa ("xmm7",&QWP($k_inv,$const));
+ &movdqa ("xmm6",&QWP($k_s0F,$const));
+ &ret ();
+&function_end_B("_vpaes_preheat");
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm6-%xmm7 as in _vpaes_preheat
+## (%edx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
+##
+##
+&function_begin_B("_vpaes_encrypt_core");
+ &mov ($magic,16);
+ &mov ($round,&DWP(240,$key));
+ &movdqa ("xmm1","xmm6")
+ &movdqa ("xmm2",&QWP($k_ipt,$const));
+ &pandn ("xmm1","xmm0");
+ &pand ("xmm0","xmm6");
+ &movdqu ("xmm5",&QWP(0,$key));
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP($k_ipt+16,$const));
+ &pxor ("xmm2","xmm5");
+ &psrld ("xmm1",4);
+ &add ($key,16);
+ &pshufb ("xmm0","xmm1");
+ &lea ($base,&DWP($k_mc_backward,$const));
+ &pxor ("xmm0","xmm2");
+ &jmp (&label("enc_entry"));
+
+
+&set_label("enc_loop",16);
+ # middle of middle round
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
+ &pshufb ("xmm4","xmm2"); # 4 = sb1u
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
+ &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
+ &pshufb ("xmm5","xmm2"); # 4 = sb2u
+ &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
+ &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
+ &pshufb ("xmm2","xmm3"); # 2 = sb2t
+ &movdqa ("xmm3","xmm0"); # 3 = A
+ &pxor ("xmm2","xmm5"); # 2 = 2A
+ &pshufb ("xmm0","xmm1"); # 0 = B
+ &add ($key,16); # next key
+ &pxor ("xmm0","xmm2"); # 0 = 2A+B
+ &pshufb ("xmm3","xmm4"); # 3 = D
+ &add ($magic,16); # next mc
+ &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
+ &pshufb ("xmm0","xmm1"); # 0 = 2B+C
+ &and ($magic,0x30); # ... mod 4
+ &sub ($round,1); # nr--
+ &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
+
+&set_label("enc_entry");
+ # top of round
+ &movdqa ("xmm1","xmm6"); # 1 : i
+ &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm6"); # 0 = k
+ &pshufb ("xmm5","xmm0"); # 2 = a/k
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &movdqu ("xmm5",&QWP(0,$key));
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &jnz (&label("enc_loop"));
+
+ # middle of last round
+ &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
+ &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &pshufb ("xmm0","xmm1");
+ &ret ();
+&function_end_B("_vpaes_encrypt_core");
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+&function_begin_B("_vpaes_decrypt_core");
+ &lea ($base,&DWP($k_dsbd,$const));
+ &mov ($round,&DWP(240,$key));
+ &movdqa ("xmm1","xmm6");
+ &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
+ &pandn ("xmm1","xmm0");
+ &mov ($magic,$round);
+ &psrld ("xmm1",4)
+ &movdqu ("xmm5",&QWP(0,$key));
+ &shl ($magic,4);
+ &pand ("xmm0","xmm6");
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
+ &xor ($magic,0x30);
+ &pshufb ("xmm0","xmm1");
+ &and ($magic,0x30);
+ &pxor ("xmm2","xmm5");
+ &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
+ &pxor ("xmm0","xmm2");
+ &add ($key,16);
+ &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
+ &jmp (&label("dec_entry"));
+
+&set_label("dec_loop",16);
+##
+## Inverse mix columns
+##
+ &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
+ &movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t
+ &pshufb ("xmm4","xmm2"); # 4 = sb9u
+ &pshufb ("xmm1","xmm3"); # 0 = sb9t
+ &pxor ("xmm0","xmm4");
+ &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
+ &pxor ("xmm0","xmm1"); # 0 = ch
+ &movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt
+
+ &pshufb ("xmm4","xmm2"); # 4 = sbdu
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &pshufb ("xmm1","xmm3"); # 0 = sbdt
+ &pxor ("xmm0","xmm4"); # 4 = ch
+ &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
+ &pxor ("xmm0","xmm1"); # 0 = ch
+ &movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt
+
+ &pshufb ("xmm4","xmm2"); # 4 = sbbu
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &pshufb ("xmm1","xmm3"); # 0 = sbbt
+ &pxor ("xmm0","xmm4"); # 4 = ch
+ &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
+ &pxor ("xmm0","xmm1"); # 0 = ch
+ &movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet
+
+ &pshufb ("xmm4","xmm2"); # 4 = sbeu
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &pshufb ("xmm1","xmm3"); # 0 = sbet
+ &pxor ("xmm0","xmm4"); # 4 = ch
+ &add ($key,16); # next round key
+ &palignr("xmm5","xmm5",12);
+ &pxor ("xmm0","xmm1"); # 0 = ch
+ &sub ($round,1); # nr--
+
+&set_label("dec_entry");
+ # top of round
+ &movdqa ("xmm1","xmm6"); # 1 : i
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
+ &pand ("xmm0","xmm6"); # 0 = k
+ &psrld ("xmm1",4); # 1 = i
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &movdqu ("xmm0",&QWP(0,$key));
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &jnz (&label("dec_loop"));
+
+ # middle of last round
+ &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &pxor ("xmm4","xmm0"); # 4 = sb1u + k
+ &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
+ &movdqa ("xmm2",&QWP(0,$magic));
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &pshufb ("xmm0","xmm2");
+ &ret ();
+&function_end_B("_vpaes_decrypt_core");
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+&function_begin_B("_vpaes_schedule_core");
+ &add ($const,&DWP(0,"esp"));
+ &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
+ &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
+
+ # input transform
+ &movdqa ("xmm3","xmm0");
+ &lea ($base,&DWP($k_ipt,$const));
+ &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
+ &call ("_vpaes_schedule_transform");
+ &movdqa ("xmm7","xmm0");
+
+ &test ($out,$out);
+ &jnz (&label("schedule_am_decrypting"));
+
+ # encrypting, output zeroth round key after transform
+ &movdqu (&QWP(0,$key),"xmm0");
+ &jmp (&label("schedule_go"));
+
+&set_label("schedule_am_decrypting");
+ # decrypting, output zeroth round key after shiftrows
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm3","xmm1");
+ &movdqu (&QWP(0,$key),"xmm3");
+ &xor ($magic,0x30);
+
+&set_label("schedule_go");
+ &cmp ($round,192);
+ &ja (&label("schedule_256"));
+ &je (&label("schedule_192"));
+ # 128: fall though
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+&set_label("schedule_128");
+ &mov ($round,10);
+
+&set_label("loop_schedule_128");
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle"); # write output
+ &jmp (&label("loop_schedule_128"));
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+&set_label("schedule_192",16);
+ &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
+ &call ("_vpaes_schedule_transform"); # input transform
+ &movdqa ("xmm6","xmm0"); # save short part
+ &pxor ("xmm4","xmm4"); # clear 4
+ &movhlps("xmm6","xmm4"); # clobber low side with zeros
+ &mov ($round,4);
+
+&set_label("loop_schedule_192");
+ &call ("_vpaes_schedule_round");
+ &palignr("xmm0","xmm6",8);
+ &call ("_vpaes_schedule_mangle"); # save key n
+ &call ("_vpaes_schedule_192_smear");
+ &call ("_vpaes_schedule_mangle"); # save key n+1
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle"); # save key n+2
+ &call ("_vpaes_schedule_192_smear");
+ &jmp (&label("loop_schedule_192"));
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+&set_label("schedule_256",16);
+ &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
+ &call ("_vpaes_schedule_transform"); # input transform
+ &mov ($round,7);
+
+&set_label("loop_schedule_256");
+ &call ("_vpaes_schedule_mangle"); # output low result
+ &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
+
+ # high round
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle");
+
+ # low round. swap xmm7 and xmm6
+ &pshufd ("xmm0","xmm0",0xFF);
+ &movdqa (&QWP(20,"esp"),"xmm7");
+ &movdqa ("xmm7","xmm6");
+ &call ("_vpaes_schedule_low_round");
+ &movdqa ("xmm7",&QWP(20,"esp"));
+
+ &jmp (&label("loop_schedule_256"));
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+&set_label("schedule_mangle_last",16);
+ # schedule last round key from xmm0
+ &lea ($base,&DWP($k_deskew,$const));
+ &test ($out,$out);
+ &jnz (&label("schedule_mangle_last_dec"));
+
+ # encrypting
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm0","xmm1"); # output permute
+ &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
+ &add ($key,32);
+
+&set_label("schedule_mangle_last_dec");
+ &add ($key,-16);
+ &pxor ("xmm0",&QWP($k_s63,$const));
+ &call ("_vpaes_schedule_transform"); # output transform
+ &movdqu (&QWP(0,$key),"xmm0"); # save last key
+
+ # cleanup
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
+ &ret ();
+&function_end_B("_vpaes_schedule_core");
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+&function_begin_B("_vpaes_schedule_192_smear");
+ &pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0
+ &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
+ &pxor ("xmm6","xmm1"); # -> c+d c 0 0
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
+ &movdqa ("xmm0","xmm6");
+ &movhlps("xmm6","xmm1"); # clobber low side with zeros
+ &ret ();
+&function_end_B("_vpaes_schedule_192_smear");
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm5.
+##
+&function_begin_B("_vpaes_schedule_round");
+ # extract rcon from xmm8
+ &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
+ &pxor ("xmm1","xmm1");
+ &palignr("xmm1","xmm2",15);
+ &palignr("xmm2","xmm2",15);
+ &pxor ("xmm7","xmm1");
+
+ # rotate
+ &pshufd ("xmm0","xmm0",0xFF);
+ &palignr("xmm0","xmm0",1);
+
+ # fall through...
+ &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
+
+ # low round: same as high round, but no rotation and no rcon.
+&set_label("_vpaes_schedule_low_round");
+ # smear xmm7
+ &movdqa ("xmm1","xmm7");
+ &pslldq ("xmm7",4);
+ &pxor ("xmm7","xmm1");
+ &movdqa ("xmm1","xmm7");
+ &pslldq ("xmm7",8);
+ &pxor ("xmm7","xmm1");
+ &pxor ("xmm7",&QWP($k_s63,$const));
+
+ # subbyte
+ &movdqa ("xmm4",&QWP($k_s0F,$const));
+ &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
+ &movdqa ("xmm1","xmm4");
+ &pandn ("xmm1","xmm0");
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm4"); # 0 = k
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm5"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm5"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm5"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm5"); # 3 : 1/jak
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = sbox output
+
+ # add in smeared stuff
+ &pxor ("xmm0","xmm7");
+ &movdqa ("xmm7","xmm0");
+ &ret ();
+&function_end_B("_vpaes_schedule_round");
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%ebx)
+##
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+&function_begin_B("_vpaes_schedule_transform");
+ &movdqa ("xmm2",&QWP($k_s0F,$const));
+ &movdqa ("xmm1","xmm2");
+ &pandn ("xmm1","xmm0");
+ &psrld ("xmm1",4);
+ &pand ("xmm0","xmm2");
+ &movdqa ("xmm2",&QWP(0,$base));
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP(16,$base));
+ &pshufb ("xmm0","xmm1");
+ &pxor ("xmm0","xmm2");
+ &ret ();
+&function_end_B("_vpaes_schedule_transform");
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%edx), and increments or decrements it
+## Keeps track of round number mod 4 in %ecx
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+&function_begin_B("_vpaes_schedule_mangle");
+ &movdqa ("xmm4","xmm0"); # save xmm0 for later
+ &movdqa ("xmm5",&QWP($k_mc_forward,$const));
+ &test ($out,$out);
+ &jnz (&label("schedule_mangle_dec"));
+
+ # encrypting
+ &add ($key,16);
+ &pxor ("xmm4",&QWP($k_s63,$const));
+ &pshufb ("xmm4","xmm5");
+ &movdqa ("xmm3","xmm4");
+ &pshufb ("xmm4","xmm5");
+ &pxor ("xmm3","xmm4");
+ &pshufb ("xmm4","xmm5");
+ &pxor ("xmm3","xmm4");
+
+ &jmp (&label("schedule_mangle_both"));
+
+&set_label("schedule_mangle_dec",16);
+ # inverse mix columns
+ &movdqa ("xmm2",&QWP($k_s0F,$const));
+ &lea ($inp,&DWP($k_dksd,$const));
+ &movdqa ("xmm1","xmm2");
+ &pandn ("xmm1","xmm4");
+ &psrld ("xmm1",4); # 1 = hi
+ &pand ("xmm4","xmm2"); # 4 = lo
+
+ &movdqa ("xmm2",&QWP(0,$inp));
+ &pshufb ("xmm2","xmm4");
+ &movdqa ("xmm3",&QWP(0x10,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x20,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x30,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x40,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x50,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x60,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x70,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+
+ &add ($key,-16);
+
+&set_label("schedule_mangle_both");
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm3","xmm1");
+ &add ($magic,-16);
+ &and ($magic,0x30);
+ &movdqu (&QWP(0,$key),"xmm3");
+ &ret ();
+&function_end_B("_vpaes_schedule_mangle");
+
+#
+# Interface to OpenSSL
+#
+&function_begin("${PREFIX}_set_encrypt_key");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($round,&wparam(1)); # bits
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov ($base,$round);
+ &shr ($base,5);
+ &add ($base,5);
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
+ &mov ($magic,0x30);
+ &mov ($out,0);
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_schedule_core");
+&set_label("pic_point");
+
+ &mov ("esp",&DWP(48,"esp"));
+ &xor ("eax","eax");
+&function_end("${PREFIX}_set_encrypt_key");
+
+&function_begin("${PREFIX}_set_decrypt_key");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($round,&wparam(1)); # bits
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov ($base,$round);
+ &shr ($base,5);
+ &add ($base,5);
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
+ &shl ($base,4);
+ &lea ($key,&DWP(16,$key,$base));
+
+ &mov ($out,1);
+ &mov ($magic,$round);
+ &shr ($magic,1);
+ &and ($magic,32);
+ &xor ($magic,32); # nbist==192?0:32;
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_schedule_core");
+&set_label("pic_point");
+
+ &mov ("esp",&DWP(48,"esp"));
+ &xor ("eax","eax");
+&function_end("${PREFIX}_set_decrypt_key");
+
+&function_begin("${PREFIX}_encrypt");
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($out,&wparam(1)); # out
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &movdqu ("xmm0",&QWP(0,$inp));
+ &call ("_vpaes_encrypt_core");
+ &movdqu (&QWP(0,$out),"xmm0");
+
+ &mov ("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_encrypt");
+
+&function_begin("${PREFIX}_decrypt");
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($out,&wparam(1)); # out
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &movdqu ("xmm0",&QWP(0,$inp));
+ &call ("_vpaes_decrypt_core");
+ &movdqu (&QWP(0,$out),"xmm0");
+
+ &mov ("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_decrypt");
+
+&function_begin("${PREFIX}_cbc_encrypt");
+ &mov ($inp,&wparam(0)); # inp
+ &mov ($out,&wparam(1)); # out
+ &mov ($round,&wparam(2)); # len
+ &mov ($key,&wparam(3)); # key
+ &sub ($round,16);
+ &jc (&label("cbc_abort"));
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($const,&wparam(4)); # ivp
+ &and ($base,-16);
+ &mov ($magic,&wparam(5)); # enc
+ &xchg ($base,"esp"); # alloca
+ &movdqu ("xmm1",&QWP(0,$const)); # load IV
+ &sub ($out,$inp);
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov (&DWP(0,"esp"),$out); # save out
+ &mov (&DWP(4,"esp"),$key) # save key
+ &mov (&DWP(8,"esp"),$const); # save ivp
+ &mov ($out,$round); # $out works as $len
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &cmp ($magic,0);
+ &je (&label("cbc_dec_loop"));
+ &jmp (&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+ &movdqu ("xmm0",&QWP(0,$inp)); # load input
+ &pxor ("xmm0","xmm1"); # inp^=iv
+ &call ("_vpaes_encrypt_core");
+ &mov ($base,&DWP(0,"esp")); # restore out
+ &mov ($key,&DWP(4,"esp")); # restore key
+ &movdqa ("xmm1","xmm0");
+ &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
+ &lea ($inp,&DWP(16,$inp));
+ &sub ($out,16);
+ &jnc (&label("cbc_enc_loop"));
+ &jmp (&label("cbc_done"));
+
+&set_label("cbc_dec_loop",16);
+ &movdqu ("xmm0",&QWP(0,$inp)); # load input
+ &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
+ &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
+ &call ("_vpaes_decrypt_core");
+ &mov ($base,&DWP(0,"esp")); # restore out
+ &mov ($key,&DWP(4,"esp")); # restore key
+ &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
+ &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
+ &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
+ &lea ($inp,&DWP(16,$inp));
+ &sub ($out,16);
+ &jnc (&label("cbc_dec_loop"));
+
+&set_label("cbc_done");
+ &mov ($base,&DWP(8,"esp")); # restore ivp
+ &mov ("esp",&DWP(48,"esp"));
+ &movdqu (&QWP(0,$base),"xmm1"); # write IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+&asm_finish();
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/asm/vpaes-x86_64.pl b/openssl-1.1.0h/crypto/aes/asm/vpaes-x86_64.pl
new file mode 100644
index 0000000..422e8ee
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/asm/vpaes-x86_64.pl
@@ -0,0 +1,1215 @@
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Interface to OpenSSL as "almost" drop-in replacement for
+# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original nor does it make assumption
+# about its alignment...
+#
+# Performance summary. aes-x86_64.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86_64.pl column -
+# [also large-block CBC] encrypt/decrypt.
+#
+# aes-x86_64.pl vpaes-x86_64.pl
+#
+# Core 2(**) 29.6/41.1/14.3 21.9/25.2(***)
+# Nehalem 29.6/40.3/14.6 10.0/11.8
+# Atom 57.3/74.2/32.1 60.9/77.2(***)
+# Silvermont 52.7/64.0/19.5 48.8/60.8(***)
+# Goldmont 38.9/49.0/17.8 10.6/12.6
+#
+# (*) "Hyper-threading" in the context refers rather to cache shared
+# among multiple cores, than to specifically Intel HTT. As vast
+# majority of contemporary cores share cache, slower code path
+# is common place. In other words "with-hyper-threading-off"
+# results are presented mostly for reference purposes.
+#
+# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***) Less impressive improvement on Core 2 and Atom is due to slow
+# pshufb, yet it's respectable +36%/62% improvement on Core 2
+# (as implied, over "hyper-threading-safe" code path).
+#
+# <appro@openssl.org>
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$PREFIX="vpaes";
+
+$code.=<<___;
+.text
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type _vpaes_encrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_encrypt_core:
+ mov %rdx, %r9
+ mov \$16, %r11
+ mov 240(%rdx),%eax
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ psrld \$4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor %xmm5, %xmm2
+ add \$16, %r9
+ pxor %xmm2, %xmm0
+ lea .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+ # middle of middle round
+ movdqa %xmm13, %xmm4 # 4 : sb1u
+ movdqa %xmm12, %xmm0 # 0 : sb1t
+ pshufb %xmm2, %xmm4 # 4 = sb1u
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ movdqa %xmm15, %xmm5 # 4 : sb2u
+ pxor %xmm4, %xmm0 # 0 = A
+ movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ pshufb %xmm2, %xmm5 # 4 = sb2u
+ movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ movdqa %xmm14, %xmm2 # 2 : sb2t
+ pshufb %xmm3, %xmm2 # 2 = sb2t
+ movdqa %xmm0, %xmm3 # 3 = A
+ pxor %xmm5, %xmm2 # 2 = 2A
+ pshufb %xmm1, %xmm0 # 0 = B
+ add \$16, %r9 # next key
+ pxor %xmm2, %xmm0 # 0 = 2A+B
+ pshufb %xmm4, %xmm3 # 3 = D
+ add \$16, %r11 # next mc
+ pxor %xmm0, %xmm3 # 3 = 2A+B+D
+ pshufb %xmm1, %xmm0 # 0 = 2B+C
+ and \$0x30, %r11 # ... mod 4
+ sub \$1,%rax # nr--
+ pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
+
+.Lenc_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ movdqa %xmm11, %xmm5 # 2 : a/k
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ pshufb %xmm0, %xmm5 # 2 = a/k
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pxor %xmm1, %xmm0 # 0 = j
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pxor %xmm0, %xmm2 # 2 = io
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ movdqu (%r9), %xmm5
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Lenc_loop
+
+ # middle of last round
+ movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm1, %xmm0
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.type _vpaes_decrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_decrypt_core:
+ mov %rdx, %r9 # load key
+ mov 240(%rdx),%eax
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ mov %rax, %r11
+ psrld \$4, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ shl \$4, %r11
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
+ xor \$0x30, %r11
+ lea .Lk_dsbd(%rip),%r10
+ pshufb %xmm1, %xmm0
+ and \$0x30, %r11
+ pxor %xmm5, %xmm2
+ movdqa .Lk_mc_forward+48(%rip), %xmm5
+ pxor %xmm2, %xmm0
+ add \$16, %r9
+ add %r10, %r11
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+##
+## Inverse mix columns
+##
+ movdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ movdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ pshufb %xmm2, %xmm4 # 4 = sb9u
+ pshufb %xmm3, %xmm1 # 0 = sb9t
+ pxor %xmm4, %xmm0
+ movdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ pxor %xmm1, %xmm0 # 0 = ch
+ movdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ pshufb %xmm2, %xmm4 # 4 = sbdu
+ pshufb %xmm5, %xmm0 # MC ch
+ pshufb %xmm3, %xmm1 # 0 = sbdt
+ pxor %xmm4, %xmm0 # 4 = ch
+ movdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ pxor %xmm1, %xmm0 # 0 = ch
+ movdqa 0x30(%r10),%xmm1 # 0 : sbbt
+
+ pshufb %xmm2, %xmm4 # 4 = sbbu
+ pshufb %xmm5, %xmm0 # MC ch
+ pshufb %xmm3, %xmm1 # 0 = sbbt
+ pxor %xmm4, %xmm0 # 4 = ch
+ movdqa 0x40(%r10),%xmm4 # 4 : sbeu
+ pxor %xmm1, %xmm0 # 0 = ch
+ movdqa 0x50(%r10),%xmm1 # 0 : sbet
+
+ pshufb %xmm2, %xmm4 # 4 = sbeu
+ pshufb %xmm5, %xmm0 # MC ch
+ pshufb %xmm3, %xmm1 # 0 = sbet
+ pxor %xmm4, %xmm0 # 4 = ch
+ add \$16, %r9 # next round key
+ palignr \$12, %xmm5, %xmm5
+ pxor %xmm1, %xmm0 # 0 = ch
+ sub \$1,%rax # nr--
+
+.Ldec_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pxor %xmm1, %xmm0 # 0 = j
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pxor %xmm0, %xmm2 # 2 = io
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ movdqu (%r9), %xmm0
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Ldec_loop
+
+ # middle of last round
+ movdqa 0x60(%r10), %xmm4 # 3 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm0, %xmm4 # 4 = sb1u + k
+ movdqa 0x70(%r10), %xmm0 # 0 : sbot
+ movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm2, %xmm0
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.type _vpaes_schedule_core,\@abi-omnipotent
+.align 16
+_vpaes_schedule_core:
+ # rdi = key
+ # rsi = size in bits
+ # rdx = buffer
+ # rcx = direction. 0=encrypt, 1=decrypt
+
+ call _vpaes_preheat # load the tables
+ movdqa .Lk_rcon(%rip), %xmm8 # load rcon
+ movdqu (%rdi), %xmm0 # load key (unaligned)
+
+ # input transform
+ movdqa %xmm0, %xmm3
+ lea .Lk_ipt(%rip), %r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0, %xmm7
+
+ lea .Lk_sr(%rip),%r10
+ test %rcx, %rcx
+ jnz .Lschedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ movdqu %xmm0, (%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+ # decrypting, output zeroth round key after shiftrows
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm3
+ movdqu %xmm3, (%rdx)
+ xor \$0x30, %r8
+
+.Lschedule_go:
+ cmp \$192, %esi
+ ja .Lschedule_256
+ je .Lschedule_192
+ # 128: fall though
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+ mov \$10, %esi
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle # write output
+ jmp .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ call _vpaes_schedule_transform # input transform
+ movdqa %xmm0, %xmm6 # save short part
+ pxor %xmm4, %xmm4 # clear 4
+ movhlps %xmm4, %xmm6 # clobber low side with zeros
+ mov \$4, %esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+ palignr \$8,%xmm6,%xmm0
+ call _vpaes_schedule_mangle # save key n
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle # save key n+1
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle # save key n+2
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ call _vpaes_schedule_transform # input transform
+ mov \$7, %esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle # output low result
+ movdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ # high round
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+ # low round. swap xmm7 and xmm6
+ pshufd \$0xFF, %xmm0, %xmm0
+ movdqa %xmm7, %xmm5
+ movdqa %xmm6, %xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5, %xmm7
+
+ jmp .Loop_schedule_256
+
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 16
+.Lschedule_mangle_last:
+ # schedule last round key from xmm0
+ lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_last_dec
+
+ # encrypting
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm0 # output permute
+ lea .Lk_opt(%rip), %r11 # prepare to output transform
+ add \$32, %rdx
+
+.Lschedule_mangle_last_dec:
+ add \$-16, %rdx
+ pxor .Lk_s63(%rip), %xmm0
+ call _vpaes_schedule_transform # output transform
+ movdqu %xmm0, (%rdx) # save last key
+
+ # cleanup
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.type _vpaes_schedule_192_smear,\@abi-omnipotent
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ pxor %xmm1, %xmm6 # -> c+d c 0 0
+ pxor %xmm1, %xmm1
+ pxor %xmm0, %xmm6 # -> b+c+d b+c b a
+ movdqa %xmm6, %xmm0
+ movhlps %xmm1, %xmm6 # clobber low side with zeros
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.type _vpaes_schedule_round,\@abi-omnipotent
+.align 16
+_vpaes_schedule_round:
+ # extract rcon from xmm8
+ pxor %xmm1, %xmm1
+ palignr \$15, %xmm8, %xmm1
+ palignr \$15, %xmm8, %xmm8
+ pxor %xmm1, %xmm7
+
+ # rotate
+ pshufd \$0xFF, %xmm0, %xmm0
+ palignr \$1, %xmm0, %xmm0
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ # smear xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$4, %xmm7
+ pxor %xmm1, %xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$8, %xmm7
+ pxor %xmm1, %xmm7
+ pxor .Lk_s63(%rip), %xmm7
+
+ # subbytes
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqa %xmm13, %xmm4 # 4 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ movdqa %xmm12, %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = sbox output
+
+ # add in smeared stuff
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, %xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.type _vpaes_schedule_transform,\@abi-omnipotent
+.align 16
+_vpaes_schedule_transform:
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld \$4, %xmm1
+ pand %xmm9, %xmm0
+ movdqa (%r11), %xmm2 # lo
+ pshufb %xmm0, %xmm2
+ movdqa 16(%r11), %xmm0 # hi
+ pshufb %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.type _vpaes_schedule_mangle,\@abi-omnipotent
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0, %xmm4 # save xmm0 for later
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_dec
+
+ # encrypting
+ add \$16, %rdx
+ pxor .Lk_s63(%rip),%xmm4
+ pshufb %xmm5, %xmm4
+ movdqa %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+ # inverse mix columns
+ lea .Lk_dksd(%rip),%r11
+ movdqa %xmm9, %xmm1
+ pandn %xmm4, %xmm1
+ psrld \$4, %xmm1 # 1 = hi
+ pand %xmm9, %xmm4 # 4 = lo
+
+ movdqa 0x00(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ movdqa 0x10(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x20(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x30(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x40(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x50(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x60(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x70(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+
+ add \$-16, %rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1,%xmm3
+ add \$-16, %r8
+ and \$0x30, %r8
+ movdqu %xmm3, (%rdx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+#
+# Interface to OpenSSL
+#
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@function,3
+.align 16
+${PREFIX}_set_encrypt_key:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov \$0,%ecx
+ mov \$0x30,%r8d
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@function,3
+.align 16
+${PREFIX}_set_decrypt_key:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ shl \$4,%eax
+ lea 16(%rdx,%rax),%rdx
+
+ mov \$1,%ecx
+ mov %esi,%r8d
+ shr \$1,%r8d
+ and \$32,%r8d
+ xor \$32,%r8d # nbits==192?0:32
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax
+ ret
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@function,3
+.align 16
+${PREFIX}_encrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@function,3
+.align 16
+${PREFIX}_decrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
+___
+{
+my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# void AES_cbc_encrypt (const void char *inp, unsigned char *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+ xchg $key,$len
+___
+($len,$key)=($key,$len);
+$code.=<<___;
+ sub \$16,$len
+ jc .Lcbc_abort
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lcbc_body:
+___
+$code.=<<___;
+ movdqu ($ivp),%xmm6 # load IV
+ sub $inp,$out
+ call _vpaes_preheat
+ cmp \$0,${enc}d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu ($inp),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu ($inp),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_dec_loop
+.Lcbc_done:
+ movdqu %xmm6,($ivp) # save IV
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lcbc_epilogue:
+___
+$code.=<<___;
+.Lcbc_abort:
+ ret
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
+$code.=<<___;
+##
+## _aes_preheat
+##
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below.
+##
+.type _vpaes_preheat,\@abi-omnipotent
+.align 16
+_vpaes_preheat:
+ lea .Lk_s0F(%rip), %r10
+ movdqa -0x20(%r10), %xmm10 # .Lk_inv
+ movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
+ movdqa 0x00(%r10), %xmm9 # .Lk_s0F
+ movdqa 0x30(%r10), %xmm13 # .Lk_sb1
+ movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
+ movdqa 0x50(%r10), %xmm15 # .Lk_sb2
+ movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+########################################################
+## ##
+## Constants ##
+## ##
+########################################################
+.type _vpaes_consts,\@object
+.align 64
+_vpaes_consts:
+.Lk_inv: # inv, inva
+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02
+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F: # s0F
+ .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt: # input transform (lo, hi)
+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1: # sb1u, sb1t
+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2: # sb2u, sb2t
+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo: # sbou, sbot
+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward: # mc_forward
+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D
+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:# mc_backward
+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407
+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr: # sr
+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08
+ .quad 0x0F060D040B020900, 0x070E050C030A0108
+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon: # rcon
+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63: # s63: all equal to 0x63 transformed
+ .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt: # output transform
+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew: # deskew tables: inverts the sbox's "skew"
+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+##
+## Decryption stuff
+## Key schedule constants
+##
+.Lk_dksd: # decryption key schedule: invskew x*D
+ .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+ .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb: # decryption key schedule: invskew x*B
+ .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+ .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
+ .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+ .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9: # decryption key schedule: invskew x*9
+ .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+ .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+##
+## Decryption stuff
+## Round function constants
+##
+.Lk_dipt: # decryption input transform
+ .quad 0x0F505B040B545F00, 0x154A411E114E451A
+ .quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9: # decryption sbox output *9*u, *9*t
+ .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd: # decryption sbox output *D*u, *D*t
+ .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb: # decryption sbox output *B*u, *B*t
+ .quad 0xD022649296B44200, 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe: # decryption sbox output *E*u, *E*t
+ .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo: # decryption sbox final output
+ .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ lea 16(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xb8(%rax),%rax # adjust stack pointer
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_${PREFIX}_set_encrypt_key
+ .rva .LSEH_info_${PREFIX}_set_encrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_${PREFIX}_set_decrypt_key
+ .rva .LSEH_info_${PREFIX}_set_decrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_encrypt
+ .rva .LSEH_end_${PREFIX}_encrypt
+ .rva .LSEH_info_${PREFIX}_encrypt
+
+ .rva .LSEH_begin_${PREFIX}_decrypt
+ .rva .LSEH_end_${PREFIX}_decrypt
+ .rva .LSEH_info_${PREFIX}_decrypt
+
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_${PREFIX}_cbc_encrypt
+
+.section .xdata
+.align 8
+.LSEH_info_${PREFIX}_set_encrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_set_decrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_decrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_cbc_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/aes/build.info b/openssl-1.1.0h/crypto/aes/build.info
new file mode 100644
index 0000000..cf6cb5e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/aes/build.info
@@ -0,0 +1,57 @@
+LIBS=../../libcrypto
+SOURCE[../../libcrypto]=\
+ aes_misc.c aes_ecb.c aes_cfb.c aes_ofb.c \
+ aes_ige.c aes_wrap.c {- $target{aes_asm_src} -}
+
+GENERATE[aes-ia64.s]=asm/aes-ia64.S
+
+GENERATE[aes-586.s]=asm/aes-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+DEPEND[aes-586.s]=../perlasm/x86asm.pl
+GENERATE[vpaes-x86.s]=asm/vpaes-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+DEPEND[vpaes-586.s]=../perlasm/x86asm.pl
+GENERATE[aesni-x86.s]=asm/aesni-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+DEPEND[aesni-586.s]=../perlasm/x86asm.pl
+
+GENERATE[aes-x86_64.s]=asm/aes-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[vpaes-x86_64.s]=asm/vpaes-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[bsaes-x86_64.s]=asm/bsaes-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-x86_64.s]=asm/aesni-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-sha1-x86_64.s]=asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-sha256-x86_64.s]=asm/aesni-sha256-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[aesni-mb-x86_64.s]=asm/aesni-mb-x86_64.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-sparcv9.S]=asm/aes-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[aes-sparcv9.o]=..
+GENERATE[aest4-sparcv9.S]=asm/aest4-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[aest4-sparcv9.o]=..
+DEPEND[aest4-sparcv9.S]=../perlasm/sparcv9_modes.pl
+GENERATE[aesfx-sparcv9.S]=asm/aesfx-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[aesfx-sparcv9.o]=..
+
+GENERATE[aes-ppc.s]=asm/aes-ppc.pl $(PERLASM_SCHEME)
+GENERATE[vpaes-ppc.s]=asm/vpaes-ppc.pl $(PERLASM_SCHEME)
+GENERATE[aesp8-ppc.s]=asm/aesp8-ppc.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-parisc.s]=asm/aes-parisc.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-mips.S]=asm/aes-mips.pl $(PERLASM_SCHEME)
+
+GENERATE[aesv8-armx.S]=asm/aesv8-armx.pl $(PERLASM_SCHEME)
+INCLUDE[aesv8-armx.o]=..
+GENERATE[vpaes-armv8.S]=asm/vpaes-armv8.pl $(PERLASM_SCHEME)
+
+GENERATE[aes-armv4.S]=asm/aes-armv4.pl $(PERLASM_SCHEME)
+INCLUDE[aes-armv4.o]=..
+GENERATE[bsaes-armv7.S]=asm/bsaes-armv7.pl $(PERLASM_SCHEME)
+INCLUDE[bsaes-armv7.o]=..
+
+BEGINRAW[Makefile]
+##### AES assembler implementations
+
+# GNU make "catch all"
+{- $builddir -}/aes-%.S: {- $sourcedir -}/asm/aes-%.pl
+ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+{- $builddir -}/bsaes-%.S: {- $sourcedir -}/asm/bsaes-%.pl
+ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+
+ENDRAW[Makefile]
diff --git a/openssl-1.1.0h/crypto/alphacpuid.pl b/openssl-1.1.0h/crypto/alphacpuid.pl
new file mode 100644
index 0000000..6c7fd4c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/alphacpuid.pl
@@ -0,0 +1,257 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+$output = pop;
+open STDOUT,">$output";
+
+print <<'___';
+.text
+
+.set noat
+
+.globl OPENSSL_cpuid_setup
+.ent OPENSSL_cpuid_setup
+OPENSSL_cpuid_setup:
+ .frame $30,0,$26
+ .prologue 0
+ ret ($26)
+.end OPENSSL_cpuid_setup
+
+.globl OPENSSL_wipe_cpu
+.ent OPENSSL_wipe_cpu
+OPENSSL_wipe_cpu:
+ .frame $30,0,$26
+ .prologue 0
+ clr $1
+ clr $2
+ clr $3
+ clr $4
+ clr $5
+ clr $6
+ clr $7
+ clr $8
+ clr $16
+ clr $17
+ clr $18
+ clr $19
+ clr $20
+ clr $21
+ clr $22
+ clr $23
+ clr $24
+ clr $25
+ clr $27
+ clr $at
+ clr $29
+ fclr $f0
+ fclr $f1
+ fclr $f10
+ fclr $f11
+ fclr $f12
+ fclr $f13
+ fclr $f14
+ fclr $f15
+ fclr $f16
+ fclr $f17
+ fclr $f18
+ fclr $f19
+ fclr $f20
+ fclr $f21
+ fclr $f22
+ fclr $f23
+ fclr $f24
+ fclr $f25
+ fclr $f26
+ fclr $f27
+ fclr $f28
+ fclr $f29
+ fclr $f30
+ mov $sp,$0
+ ret ($26)
+.end OPENSSL_wipe_cpu
+
+.globl OPENSSL_atomic_add
+.ent OPENSSL_atomic_add
+OPENSSL_atomic_add:
+ .frame $30,0,$26
+ .prologue 0
+1: ldl_l $0,0($16)
+ addl $0,$17,$1
+ stl_c $1,0($16)
+ beq $1,1b
+ addl $0,$17,$0
+ ret ($26)
+.end OPENSSL_atomic_add
+
+.globl OPENSSL_rdtsc
+.ent OPENSSL_rdtsc
+OPENSSL_rdtsc:
+ .frame $30,0,$26
+ .prologue 0
+ rpcc $0
+ ret ($26)
+.end OPENSSL_rdtsc
+
+.globl OPENSSL_cleanse
+.ent OPENSSL_cleanse
+OPENSSL_cleanse:
+ .frame $30,0,$26
+ .prologue 0
+ beq $17,.Ldone
+ and $16,7,$0
+ bic $17,7,$at
+ beq $at,.Little
+ beq $0,.Laligned
+
+.Little:
+ subq $0,8,$0
+ ldq_u $1,0($16)
+ mov $16,$2
+.Lalign:
+ mskbl $1,$16,$1
+ lda $16,1($16)
+ subq $17,1,$17
+ addq $0,1,$0
+ beq $17,.Lout
+ bne $0,.Lalign
+.Lout: stq_u $1,0($2)
+ beq $17,.Ldone
+ bic $17,7,$at
+ beq $at,.Little
+
+.Laligned:
+ stq $31,0($16)
+ subq $17,8,$17
+ lda $16,8($16)
+ bic $17,7,$at
+ bne $at,.Laligned
+ bne $17,.Little
+.Ldone: ret ($26)
+.end OPENSSL_cleanse
+
+.globl CRYPTO_memcmp
+.ent CRYPTO_memcmp
+CRYPTO_memcmp:
+ .frame $30,0,$26
+ .prologue 0
+ xor $0,$0,$0
+ beq $18,.Lno_data
+
+ xor $1,$1,$1
+ nop
+.Loop_cmp:
+ ldq_u $2,0($16)
+ subq $18,1,$18
+ ldq_u $3,0($17)
+ extbl $2,$16,$2
+ lda $16,1($16)
+ extbl $3,$17,$3
+ lda $17,1($17)
+ xor $3,$2,$2
+ or $2,$0,$0
+ bne $18,.Loop_cmp
+
+ subq $31,$0,$0
+ srl $0,63,$0
+.Lno_data:
+ ret ($26)
+.end CRYPTO_memcmp
+___
+{
+my ($out,$cnt,$max)=("\$16","\$17","\$18");
+my ($tick,$lasttick)=("\$19","\$20");
+my ($diff,$lastdiff)=("\$21","\$22");
+my ($v0,$ra,$sp,$zero)=("\$0","\$26","\$30","\$31");
+
+print <<___;
+.globl OPENSSL_instrument_bus
+.ent OPENSSL_instrument_bus
+OPENSSL_instrument_bus:
+ .frame $sp,0,$ra
+ .prologue 0
+ mov $cnt,$v0
+
+ rpcc $lasttick
+ mov 0,$diff
+
+ ecb ($out)
+ ldl_l $tick,0($out)
+ addl $diff,$tick,$tick
+ mov $tick,$diff
+ stl_c $tick,0($out)
+ stl $diff,0($out)
+
+.Loop: rpcc $tick
+ subq $tick,$lasttick,$diff
+ mov $tick,$lasttick
+
+ ecb ($out)
+ ldl_l $tick,0($out)
+ addl $diff,$tick,$tick
+ mov $tick,$diff
+ stl_c $tick,0($out)
+ stl $diff,0($out)
+
+ subl $cnt,1,$cnt
+ lda $out,4($out)
+ bne $cnt,.Loop
+
+ ret ($ra)
+.end OPENSSL_instrument_bus
+
+.globl OPENSSL_instrument_bus2
+.ent OPENSSL_instrument_bus2
+OPENSSL_instrument_bus2:
+ .frame $sp,0,$ra
+ .prologue 0
+ mov $cnt,$v0
+
+ rpcc $lasttick
+ mov 0,$diff
+
+ ecb ($out)
+ ldl_l $tick,0($out)
+ addl $diff,$tick,$tick
+ mov $tick,$diff
+ stl_c $tick,0($out)
+ stl $diff,0($out)
+
+ rpcc $tick
+ subq $tick,$lasttick,$diff
+ mov $tick,$lasttick
+ mov $diff,$lastdiff
+.Loop2:
+ ecb ($out)
+ ldl_l $tick,0($out)
+ addl $diff,$tick,$tick
+ mov $tick,$diff
+ stl_c $tick,0($out)
+ stl $diff,0($out)
+
+ subl $max,1,$max
+ beq $max,.Ldone2
+
+ rpcc $tick
+ subq $tick,$lasttick,$diff
+ mov $tick,$lasttick
+ subq $lastdiff,$diff,$tick
+ mov $diff,$lastdiff
+ cmovne $tick,1,$tick
+ subl $cnt,$tick,$cnt
+ s4addq $tick,$out,$out
+ bne $cnt,.Loop2
+
+.Ldone2:
+ subl $v0,$cnt,$v0
+ ret ($ra)
+.end OPENSSL_instrument_bus2
+___
+}
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/arm64cpuid.pl b/openssl-1.1.0h/crypto/arm64cpuid.pl
new file mode 100755
index 0000000..caa3387
--- /dev/null
+++ b/openssl-1.1.0h/crypto/arm64cpuid.pl
@@ -0,0 +1,126 @@
+#! /usr/bin/env perl
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+$flavour = shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+.arch armv8-a+crypto
+
+.align 5
+.globl _armv7_neon_probe
+.type _armv7_neon_probe,%function
+_armv7_neon_probe:
+ orr v15.16b, v15.16b, v15.16b
+ ret
+.size _armv7_neon_probe,.-_armv7_neon_probe
+
+.globl _armv7_tick
+.type _armv7_tick,%function
+_armv7_tick:
+#ifdef __APPLE__
+ mrs x0, CNTPCT_EL0
+#else
+ mrs x0, CNTVCT_EL0
+#endif
+ ret
+.size _armv7_tick,.-_armv7_tick
+
+.globl _armv8_aes_probe
+.type _armv8_aes_probe,%function
+_armv8_aes_probe:
+ aese v0.16b, v0.16b
+ ret
+.size _armv8_aes_probe,.-_armv8_aes_probe
+
+.globl _armv8_sha1_probe
+.type _armv8_sha1_probe,%function
+_armv8_sha1_probe:
+ sha1h s0, s0
+ ret
+.size _armv8_sha1_probe,.-_armv8_sha1_probe
+
+.globl _armv8_sha256_probe
+.type _armv8_sha256_probe,%function
+_armv8_sha256_probe:
+ sha256su0 v0.4s, v0.4s
+ ret
+.size _armv8_sha256_probe,.-_armv8_sha256_probe
+.globl _armv8_pmull_probe
+.type _armv8_pmull_probe,%function
+_armv8_pmull_probe:
+ pmull v0.1q, v0.1d, v0.1d
+ ret
+.size _armv8_pmull_probe,.-_armv8_pmull_probe
+
+.globl OPENSSL_cleanse
+.type OPENSSL_cleanse,%function
+.align 5
+OPENSSL_cleanse:
+ cbz x1,.Lret // len==0?
+ cmp x1,#15
+ b.hi .Lot // len>15
+ nop
+.Little:
+ strb wzr,[x0],#1 // store byte-by-byte
+ subs x1,x1,#1
+ b.ne .Little
+.Lret: ret
+
+.align 4
+.Lot: tst x0,#7
+ b.eq .Laligned // inp is aligned
+ strb wzr,[x0],#1 // store byte-by-byte
+ sub x1,x1,#1
+ b .Lot
+
+.align 4
+.Laligned:
+ str xzr,[x0],#8 // store word-by-word
+ sub x1,x1,#8
+ tst x1,#-8
+ b.ne .Laligned // len>=8
+ cbnz x1,.Little // len!=0?
+ ret
+.size OPENSSL_cleanse,.-OPENSSL_cleanse
+
+.globl CRYPTO_memcmp
+.type CRYPTO_memcmp,%function
+.align 4
+CRYPTO_memcmp:
+ eor w3,w3,w3
+ cbz x2,.Lno_data // len==0?
+.Loop_cmp:
+ ldrb w4,[x0],#1
+ ldrb w5,[x1],#1
+ eor w4,w4,w5
+ orr w3,w3,w4
+ subs x2,x2,#1
+ b.ne .Loop_cmp
+
+.Lno_data:
+ neg w0,w3
+ lsr w0,w0,#31
+ ret
+.size CRYPTO_memcmp,.-CRYPTO_memcmp
+___
+
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/arm_arch.h b/openssl-1.1.0h/crypto/arm_arch.h
new file mode 100644
index 0000000..3fc9e69
--- /dev/null
+++ b/openssl-1.1.0h/crypto/arm_arch.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#ifndef __ARM_ARCH_H__
+# define __ARM_ARCH_H__
+
+# if !defined(__ARM_ARCH__)
+# if defined(__CC_ARM)
+# define __ARM_ARCH__ __TARGET_ARCH_ARM
+# if defined(__BIG_ENDIAN)
+# define __ARMEB__
+# else
+# define __ARMEL__
+# endif
+# elif defined(__GNUC__)
+# if defined(__aarch64__)
+# define __ARM_ARCH__ 8
+# if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
+# define __ARMEB__
+# else
+# define __ARMEL__
+# endif
+ /*
+ * Why doesn't gcc define __ARM_ARCH__? Instead it defines
+ * bunch of below macros. See all_architectires[] table in
+ * gcc/config/arm/arm.c. On a side note it defines
+ * __ARMEL__/__ARMEB__ for little-/big-endian.
+ */
+# elif defined(__ARM_ARCH)
+# define __ARM_ARCH__ __ARM_ARCH
+# elif defined(__ARM_ARCH_8A__)
+# define __ARM_ARCH__ 8
+# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
+ defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
+ defined(__ARM_ARCH_7EM__)
+# define __ARM_ARCH__ 7
+# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+ defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \
+ defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \
+ defined(__ARM_ARCH_6T2__)
+# define __ARM_ARCH__ 6
+# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
+ defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \
+ defined(__ARM_ARCH_5TEJ__)
+# define __ARM_ARCH__ 5
+# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+# define __ARM_ARCH__ 4
+# else
+# error "unsupported ARM architecture"
+# endif
+# endif
+# endif
+
+# if !defined(__ARM_MAX_ARCH__)
+# define __ARM_MAX_ARCH__ __ARM_ARCH__
+# endif
+
+# if __ARM_MAX_ARCH__<__ARM_ARCH__
+# error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__"
+# elif __ARM_MAX_ARCH__!=__ARM_ARCH__
+# if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__)
+# error "can't build universal big-endian binary"
+# endif
+# endif
+
+# if !__ASSEMBLER__
+extern unsigned int OPENSSL_armcap_P;
+# endif
+
+# define ARMV7_NEON (1<<0)
+# define ARMV7_TICK (1<<1)
+# define ARMV8_AES (1<<2)
+# define ARMV8_SHA1 (1<<3)
+# define ARMV8_SHA256 (1<<4)
+# define ARMV8_PMULL (1<<5)
+
+#endif
diff --git a/openssl-1.1.0h/crypto/armcap.c b/openssl-1.1.0h/crypto/armcap.c
new file mode 100644
index 0000000..432a06c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/armcap.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2011-2017 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <openssl/crypto.h>
+
+#include "arm_arch.h"
+
+unsigned int OPENSSL_armcap_P = 0;
+
+#if __ARM_MAX_ARCH__<7
+void OPENSSL_cpuid_setup(void)
+{
+}
+
+unsigned long OPENSSL_rdtsc(void)
+{
+ return 0;
+}
+#else
+static sigset_t all_masked;
+
+static sigjmp_buf ill_jmp;
+static void ill_handler(int sig)
+{
+ siglongjmp(ill_jmp, sig);
+}
+
+/*
+ * Following subroutines could have been inlined, but it's not all
+ * ARM compilers support inline assembler...
+ */
+void _armv7_neon_probe(void);
+void _armv8_aes_probe(void);
+void _armv8_sha1_probe(void);
+void _armv8_sha256_probe(void);
+void _armv8_pmull_probe(void);
+unsigned long _armv7_tick(void);
+
+unsigned long OPENSSL_rdtsc(void)
+{
+ if (OPENSSL_armcap_P & ARMV7_TICK)
+ return _armv7_tick();
+ else
+ return 0;
+}
+
+# if defined(__GNUC__) && __GNUC__>=2
+void OPENSSL_cpuid_setup(void) __attribute__ ((constructor));
+# endif
+/*
+ * Use a weak reference to getauxval() so we can use it if it is available but
+ * don't break the build if it is not.
+ */
+# if defined(__GNUC__) && __GNUC__>=2 && defined(__ELF__)
+extern unsigned long getauxval(unsigned long type) __attribute__ ((weak));
+# else
+static unsigned long (*getauxval) (unsigned long) = NULL;
+# endif
+
+/*
+ * ARM puts the feature bits for Crypto Extensions in AT_HWCAP2, whereas
+ * AArch64 used AT_HWCAP.
+ */
+# if defined(__arm__) || defined (__arm)
+# define HWCAP 16
+ /* AT_HWCAP */
+# define HWCAP_NEON (1 << 12)
+
+# define HWCAP_CE 26
+ /* AT_HWCAP2 */
+# define HWCAP_CE_AES (1 << 0)
+# define HWCAP_CE_PMULL (1 << 1)
+# define HWCAP_CE_SHA1 (1 << 2)
+# define HWCAP_CE_SHA256 (1 << 3)
+# elif defined(__aarch64__)
+# define HWCAP 16
+ /* AT_HWCAP */
+# define HWCAP_NEON (1 << 1)
+
+# define HWCAP_CE HWCAP
+# define HWCAP_CE_AES (1 << 3)
+# define HWCAP_CE_PMULL (1 << 4)
+# define HWCAP_CE_SHA1 (1 << 5)
+# define HWCAP_CE_SHA256 (1 << 6)
+# endif
+
+void OPENSSL_cpuid_setup(void)
+{
+ char *e;
+ struct sigaction ill_oact, ill_act;
+ sigset_t oset;
+ static int trigger = 0;
+
+ if (trigger)
+ return;
+ trigger = 1;
+
+ if ((e = getenv("OPENSSL_armcap"))) {
+ OPENSSL_armcap_P = (unsigned int)strtoul(e, NULL, 0);
+ return;
+ }
+
+# if defined(__APPLE__) && !defined(__aarch64__)
+ /*
+ * Capability probing by catching SIGILL appears to be problematic
+ * on iOS. But since Apple universe is "monocultural", it's actually
+ * possible to simply set pre-defined processor capability mask.
+ */
+ if (1) {
+ OPENSSL_armcap_P = ARMV7_NEON;
+ return;
+ }
+ /*
+ * One could do same even for __aarch64__ iOS builds. It's not done
+ * exclusively for reasons of keeping code unified across platforms.
+ * Unified code works because it never triggers SIGILL on Apple
+ * devices...
+ */
+# endif
+
+ sigfillset(&all_masked);
+ sigdelset(&all_masked, SIGILL);
+ sigdelset(&all_masked, SIGTRAP);
+ sigdelset(&all_masked, SIGFPE);
+ sigdelset(&all_masked, SIGBUS);
+ sigdelset(&all_masked, SIGSEGV);
+
+ OPENSSL_armcap_P = 0;
+
+ memset(&ill_act, 0, sizeof(ill_act));
+ ill_act.sa_handler = ill_handler;
+ ill_act.sa_mask = all_masked;
+
+ sigprocmask(SIG_SETMASK, &ill_act.sa_mask, &oset);
+ sigaction(SIGILL, &ill_act, &ill_oact);
+
+ if (getauxval != NULL) {
+ if (getauxval(HWCAP) & HWCAP_NEON) {
+ unsigned long hwcap = getauxval(HWCAP_CE);
+
+ OPENSSL_armcap_P |= ARMV7_NEON;
+
+ if (hwcap & HWCAP_CE_AES)
+ OPENSSL_armcap_P |= ARMV8_AES;
+
+ if (hwcap & HWCAP_CE_PMULL)
+ OPENSSL_armcap_P |= ARMV8_PMULL;
+
+ if (hwcap & HWCAP_CE_SHA1)
+ OPENSSL_armcap_P |= ARMV8_SHA1;
+
+ if (hwcap & HWCAP_CE_SHA256)
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+ }
+ } else if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv7_neon_probe();
+ OPENSSL_armcap_P |= ARMV7_NEON;
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_pmull_probe();
+ OPENSSL_armcap_P |= ARMV8_PMULL | ARMV8_AES;
+ } else if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_aes_probe();
+ OPENSSL_armcap_P |= ARMV8_AES;
+ }
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_sha1_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA1;
+ }
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_sha256_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+ }
+ }
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv7_tick();
+ OPENSSL_armcap_P |= ARMV7_TICK;
+ }
+
+ sigaction(SIGILL, &ill_oact, NULL);
+ sigprocmask(SIG_SETMASK, &oset, NULL);
+}
+#endif
diff --git a/openssl-1.1.0h/crypto/armv4cpuid.pl b/openssl-1.1.0h/crypto/armv4cpuid.pl
new file mode 100644
index 0000000..f7d31a6
--- /dev/null
+++ b/openssl-1.1.0h/crypto/armv4cpuid.pl
@@ -0,0 +1,296 @@
+#! /usr/bin/env perl
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+$flavour = shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax unified
+.thumb
+#else
+.code 32
+#undef __thumb2__
+#endif
+
+.align 5
+.global OPENSSL_atomic_add
+.type OPENSSL_atomic_add,%function
+OPENSSL_atomic_add:
+#if __ARM_ARCH__>=6
+.Ladd: ldrex r2,[r0]
+ add r3,r2,r1
+ strex r2,r3,[r0]
+ cmp r2,#0
+ bne .Ladd
+ mov r0,r3
+ bx lr
+#else
+ stmdb sp!,{r4-r6,lr}
+ ldr r2,.Lspinlock
+ adr r3,.Lspinlock
+ mov r4,r0
+ mov r5,r1
+ add r6,r3,r2 @ &spinlock
+ b .+8
+.Lspin: bl sched_yield
+ mov r0,#-1
+ swp r0,r0,[r6]
+ cmp r0,#0
+ bne .Lspin
+
+ ldr r2,[r4]
+ add r2,r2,r5
+ str r2,[r4]
+ str r0,[r6] @ release spinlock
+ ldmia sp!,{r4-r6,lr}
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+.global OPENSSL_cleanse
+.type OPENSSL_cleanse,%function
+OPENSSL_cleanse:
+ eor ip,ip,ip
+ cmp r1,#7
+#ifdef __thumb2__
+ itt hs
+#endif
+ subhs r1,r1,#4
+ bhs .Lot
+ cmp r1,#0
+ beq .Lcleanse_done
+.Little:
+ strb ip,[r0],#1
+ subs r1,r1,#1
+ bhi .Little
+ b .Lcleanse_done
+
+.Lot: tst r0,#3
+ beq .Laligned
+ strb ip,[r0],#1
+ sub r1,r1,#1
+ b .Lot
+.Laligned:
+ str ip,[r0],#4
+ subs r1,r1,#4
+ bhs .Laligned
+ adds r1,r1,#4
+ bne .Little
+.Lcleanse_done:
+#if __ARM_ARCH__>=5
+ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_cleanse,.-OPENSSL_cleanse
+
+.global CRYPTO_memcmp
+.type CRYPTO_memcmp,%function
+.align 4
+CRYPTO_memcmp:
+ eor ip,ip,ip
+ cmp r2,#0
+ beq .Lno_data
+ stmdb sp!,{r4,r5}
+
+.Loop_cmp:
+ ldrb r4,[r0],#1
+ ldrb r5,[r1],#1
+ eor r4,r4,r5
+ orr ip,ip,r4
+ subs r2,r2,#1
+ bne .Loop_cmp
+
+ ldmia sp!,{r4,r5}
+.Lno_data:
+ neg r0,ip
+ mov r0,r0,lsr#31
+#if __ARM_ARCH__>=5
+ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size CRYPTO_memcmp,.-CRYPTO_memcmp
+
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.align 5
+.global _armv7_neon_probe
+.type _armv7_neon_probe,%function
+_armv7_neon_probe:
+ vorr q0,q0,q0
+ bx lr
+.size _armv7_neon_probe,.-_armv7_neon_probe
+
+.global _armv7_tick
+.type _armv7_tick,%function
+_armv7_tick:
+#ifdef __APPLE__
+ mrrc p15,0,r0,r1,c14 @ CNTPCT
+#else
+ mrrc p15,1,r0,r1,c14 @ CNTVCT
+#endif
+ bx lr
+.size _armv7_tick,.-_armv7_tick
+
+.global _armv8_aes_probe
+.type _armv8_aes_probe,%function
+_armv8_aes_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+ .byte 0xb0,0xff,0x00,0x03 @ aese.8 q0,q0
+#else
+ .byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0
+#endif
+ bx lr
+.size _armv8_aes_probe,.-_armv8_aes_probe
+
+.global _armv8_sha1_probe
+.type _armv8_sha1_probe,%function
+_armv8_sha1_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+ .byte 0x00,0xef,0x40,0x0c @ sha1c.32 q0,q0,q0
+#else
+ .byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0
+#endif
+ bx lr
+.size _armv8_sha1_probe,.-_armv8_sha1_probe
+
+.global _armv8_sha256_probe
+.type _armv8_sha256_probe,%function
+_armv8_sha256_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+ .byte 0x00,0xff,0x40,0x0c @ sha256h.32 q0,q0,q0
+#else
+ .byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0
+#endif
+ bx lr
+.size _armv8_sha256_probe,.-_armv8_sha256_probe
+.global _armv8_pmull_probe
+.type _armv8_pmull_probe,%function
+_armv8_pmull_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+ .byte 0xa0,0xef,0x00,0x0e @ vmull.p64 q0,d0,d0
+#else
+ .byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0
+#endif
+ bx lr
+.size _armv8_pmull_probe,.-_armv8_pmull_probe
+#endif
+
+.global OPENSSL_wipe_cpu
+.type OPENSSL_wipe_cpu,%function
+OPENSSL_wipe_cpu:
+#if __ARM_MAX_ARCH__>=7
+ ldr r0,.LOPENSSL_armcap
+ adr r1,.LOPENSSL_armcap
+ ldr r0,[r1,r0]
+#ifdef __APPLE__
+ ldr r0,[r0]
+#endif
+#endif
+ eor r2,r2,r2
+ eor r3,r3,r3
+ eor ip,ip,ip
+#if __ARM_MAX_ARCH__>=7
+ tst r0,#1
+ beq .Lwipe_done
+ veor q0, q0, q0
+ veor q1, q1, q1
+ veor q2, q2, q2
+ veor q3, q3, q3
+ veor q8, q8, q8
+ veor q9, q9, q9
+ veor q10, q10, q10
+ veor q11, q11, q11
+ veor q12, q12, q12
+ veor q13, q13, q13
+ veor q14, q14, q14
+ veor q15, q15, q15
+.Lwipe_done:
+#endif
+ mov r0,sp
+#if __ARM_ARCH__>=5
+ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.global OPENSSL_instrument_bus
+.type OPENSSL_instrument_bus,%function
+OPENSSL_instrument_bus:
+ eor r0,r0,r0
+#if __ARM_ARCH__>=5
+ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.global OPENSSL_instrument_bus2
+.type OPENSSL_instrument_bus2,%function
+OPENSSL_instrument_bus2:
+ eor r0,r0,r0
+#if __ARM_ARCH__>=5
+ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
+.align 5
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.
+#endif
+#if __ARM_ARCH__>=6
+.align 5
+#else
+.Lspinlock:
+.word atomic_add_spinlock-.Lspinlock
+.align 5
+
+.data
+.align 2
+atomic_add_spinlock:
+.word 0
+#endif
+
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+___
+
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/asn1/a_bitstr.c b/openssl-1.1.0h/crypto/asn1/a_bitstr.c
new file mode 100644
index 0000000..b2e0fb6
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_bitstr.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+#include "asn1_locl.h"
+
+int ASN1_BIT_STRING_set(ASN1_BIT_STRING *x, unsigned char *d, int len)
+{
+ return ASN1_STRING_set(x, d, len);
+}
+
+int i2c_ASN1_BIT_STRING(ASN1_BIT_STRING *a, unsigned char **pp)
+{
+ int ret, j, bits, len;
+ unsigned char *p, *d;
+
+ if (a == NULL)
+ return (0);
+
+ len = a->length;
+
+ if (len > 0) {
+ if (a->flags & ASN1_STRING_FLAG_BITS_LEFT) {
+ bits = (int)a->flags & 0x07;
+ } else {
+ for (; len > 0; len--) {
+ if (a->data[len - 1])
+ break;
+ }
+ j = a->data[len - 1];
+ if (j & 0x01)
+ bits = 0;
+ else if (j & 0x02)
+ bits = 1;
+ else if (j & 0x04)
+ bits = 2;
+ else if (j & 0x08)
+ bits = 3;
+ else if (j & 0x10)
+ bits = 4;
+ else if (j & 0x20)
+ bits = 5;
+ else if (j & 0x40)
+ bits = 6;
+ else if (j & 0x80)
+ bits = 7;
+ else
+ bits = 0; /* should not happen */
+ }
+ } else
+ bits = 0;
+
+ ret = 1 + len;
+ if (pp == NULL)
+ return (ret);
+
+ p = *pp;
+
+ *(p++) = (unsigned char)bits;
+ d = a->data;
+ if (len > 0) {
+ memcpy(p, d, len);
+ p += len;
+ p[-1] &= (0xff << bits);
+ }
+ *pp = p;
+ return (ret);
+}
+
+ASN1_BIT_STRING *c2i_ASN1_BIT_STRING(ASN1_BIT_STRING **a,
+ const unsigned char **pp, long len)
+{
+ ASN1_BIT_STRING *ret = NULL;
+ const unsigned char *p;
+ unsigned char *s;
+ int i;
+
+ if (len < 1) {
+ i = ASN1_R_STRING_TOO_SHORT;
+ goto err;
+ }
+
+ if (len > INT_MAX) {
+ i = ASN1_R_STRING_TOO_LONG;
+ goto err;
+ }
+
+ if ((a == NULL) || ((*a) == NULL)) {
+ if ((ret = ASN1_BIT_STRING_new()) == NULL)
+ return (NULL);
+ } else
+ ret = (*a);
+
+ p = *pp;
+ i = *(p++);
+ if (i > 7) {
+ i = ASN1_R_INVALID_BIT_STRING_BITS_LEFT;
+ goto err;
+ }
+ /*
+ * We do this to preserve the settings. If we modify the settings, via
+ * the _set_bit function, we will recalculate on output
+ */
+ ret->flags &= ~(ASN1_STRING_FLAG_BITS_LEFT | 0x07); /* clear */
+ ret->flags |= (ASN1_STRING_FLAG_BITS_LEFT | i); /* set */
+
+ if (len-- > 1) { /* using one because of the bits left byte */
+ s = OPENSSL_malloc((int)len);
+ if (s == NULL) {
+ i = ERR_R_MALLOC_FAILURE;
+ goto err;
+ }
+ memcpy(s, p, (int)len);
+ s[len - 1] &= (0xff << i);
+ p += len;
+ } else
+ s = NULL;
+
+ ret->length = (int)len;
+ OPENSSL_free(ret->data);
+ ret->data = s;
+ ret->type = V_ASN1_BIT_STRING;
+ if (a != NULL)
+ (*a) = ret;
+ *pp = p;
+ return (ret);
+ err:
+ ASN1err(ASN1_F_C2I_ASN1_BIT_STRING, i);
+ if ((a == NULL) || (*a != ret))
+ ASN1_BIT_STRING_free(ret);
+ return (NULL);
+}
+
+/*
+ * These next 2 functions from Goetz Babin-Ebell <babinebell@trustcenter.de>
+ */
+int ASN1_BIT_STRING_set_bit(ASN1_BIT_STRING *a, int n, int value)
+{
+ int w, v, iv;
+ unsigned char *c;
+
+ w = n / 8;
+ v = 1 << (7 - (n & 0x07));
+ iv = ~v;
+ if (!value)
+ v = 0;
+
+ if (a == NULL)
+ return 0;
+
+ a->flags &= ~(ASN1_STRING_FLAG_BITS_LEFT | 0x07); /* clear, set on write */
+
+ if ((a->length < (w + 1)) || (a->data == NULL)) {
+ if (!value)
+ return (1); /* Don't need to set */
+ c = OPENSSL_clear_realloc(a->data, a->length, w + 1);
+ if (c == NULL) {
+ ASN1err(ASN1_F_ASN1_BIT_STRING_SET_BIT, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ if (w + 1 - a->length > 0)
+ memset(c + a->length, 0, w + 1 - a->length);
+ a->data = c;
+ a->length = w + 1;
+ }
+ a->data[w] = ((a->data[w]) & iv) | v;
+ while ((a->length > 0) && (a->data[a->length - 1] == 0))
+ a->length--;
+ return (1);
+}
+
+int ASN1_BIT_STRING_get_bit(const ASN1_BIT_STRING *a, int n)
+{
+ int w, v;
+
+ w = n / 8;
+ v = 1 << (7 - (n & 0x07));
+ if ((a == NULL) || (a->length < (w + 1)) || (a->data == NULL))
+ return (0);
+ return ((a->data[w] & v) != 0);
+}
+
+/*
+ * Checks if the given bit string contains only bits specified by
+ * the flags vector. Returns 0 if there is at least one bit set in 'a'
+ * which is not specified in 'flags', 1 otherwise.
+ * 'len' is the length of 'flags'.
+ */
+int ASN1_BIT_STRING_check(const ASN1_BIT_STRING *a,
+ const unsigned char *flags, int flags_len)
+{
+ int i, ok;
+ /* Check if there is one bit set at all. */
+ if (!a || !a->data)
+ return 1;
+
+ /*
+ * Check each byte of the internal representation of the bit string.
+ */
+ ok = 1;
+ for (i = 0; i < a->length && ok; ++i) {
+ unsigned char mask = i < flags_len ? ~flags[i] : 0xff;
+ /* We are done if there is an unneeded bit set. */
+ ok = (a->data[i] & mask) == 0;
+ }
+ return ok;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_d2i_fp.c b/openssl-1.1.0h/crypto/asn1/a_d2i_fp.c
new file mode 100644
index 0000000..e5c1d0e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_d2i_fp.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <limits.h>
+#include "internal/cryptlib.h"
+#include "internal/numbers.h"
+#include <openssl/buffer.h>
+#include <openssl/asn1.h>
+
+static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb);
+
+#ifndef NO_OLD_ASN1
+# ifndef OPENSSL_NO_STDIO
+
+void *ASN1_d2i_fp(void *(*xnew) (void), d2i_of_void *d2i, FILE *in, void **x)
+{
+ BIO *b;
+ void *ret;
+
+ if ((b = BIO_new(BIO_s_file())) == NULL) {
+ ASN1err(ASN1_F_ASN1_D2I_FP, ERR_R_BUF_LIB);
+ return (NULL);
+ }
+ BIO_set_fp(b, in, BIO_NOCLOSE);
+ ret = ASN1_d2i_bio(xnew, d2i, b, x);
+ BIO_free(b);
+ return (ret);
+}
+# endif
+
+void *ASN1_d2i_bio(void *(*xnew) (void), d2i_of_void *d2i, BIO *in, void **x)
+{
+ BUF_MEM *b = NULL;
+ const unsigned char *p;
+ void *ret = NULL;
+ int len;
+
+ len = asn1_d2i_read_bio(in, &b);
+ if (len < 0)
+ goto err;
+
+ p = (unsigned char *)b->data;
+ ret = d2i(x, &p, len);
+ err:
+ BUF_MEM_free(b);
+ return (ret);
+}
+
+#endif
+
+void *ASN1_item_d2i_bio(const ASN1_ITEM *it, BIO *in, void *x)
+{
+ BUF_MEM *b = NULL;
+ const unsigned char *p;
+ void *ret = NULL;
+ int len;
+
+ len = asn1_d2i_read_bio(in, &b);
+ if (len < 0)
+ goto err;
+
+ p = (const unsigned char *)b->data;
+ ret = ASN1_item_d2i(x, &p, len, it);
+ err:
+ BUF_MEM_free(b);
+ return (ret);
+}
+
+#ifndef OPENSSL_NO_STDIO
+void *ASN1_item_d2i_fp(const ASN1_ITEM *it, FILE *in, void *x)
+{
+ BIO *b;
+ char *ret;
+
+ if ((b = BIO_new(BIO_s_file())) == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_D2I_FP, ERR_R_BUF_LIB);
+ return (NULL);
+ }
+ BIO_set_fp(b, in, BIO_NOCLOSE);
+ ret = ASN1_item_d2i_bio(it, b, x);
+ BIO_free(b);
+ return (ret);
+}
+#endif
+
+#define HEADER_SIZE 8
+#define ASN1_CHUNK_INITIAL_SIZE (16 * 1024)
+static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
+{
+ BUF_MEM *b;
+ unsigned char *p;
+ int i;
+ size_t want = HEADER_SIZE;
+ uint32_t eos = 0;
+ size_t off = 0;
+ size_t len = 0;
+
+ const unsigned char *q;
+ long slen;
+ int inf, tag, xclass;
+
+ b = BUF_MEM_new();
+ if (b == NULL) {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO, ERR_R_MALLOC_FAILURE);
+ return -1;
+ }
+
+ ERR_clear_error();
+ for (;;) {
+ if (want >= (len - off)) {
+ want -= (len - off);
+
+ if (len + want < len || !BUF_MEM_grow_clean(b, len + want)) {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ i = BIO_read(in, &(b->data[len]), want);
+ if ((i < 0) && ((len - off) == 0)) {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO, ASN1_R_NOT_ENOUGH_DATA);
+ goto err;
+ }
+ if (i > 0) {
+ if (len + i < len) {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO, ASN1_R_TOO_LONG);
+ goto err;
+ }
+ len += i;
+ }
+ }
+ /* else data already loaded */
+
+ p = (unsigned char *)&(b->data[off]);
+ q = p;
+ inf = ASN1_get_object(&q, &slen, &tag, &xclass, len - off);
+ if (inf & 0x80) {
+ unsigned long e;
+
+ e = ERR_GET_REASON(ERR_peek_error());
+ if (e != ASN1_R_TOO_LONG)
+ goto err;
+ else
+ ERR_clear_error(); /* clear error */
+ }
+ i = q - p; /* header length */
+ off += i; /* end of data */
+
+ if (inf & 1) {
+ /* no data body so go round again */
+ if (eos == UINT32_MAX) {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO, ASN1_R_HEADER_TOO_LONG);
+ goto err;
+ }
+ eos++;
+ want = HEADER_SIZE;
+ } else if (eos && (slen == 0) && (tag == V_ASN1_EOC)) {
+ /* eos value, so go back and read another header */
+ eos--;
+ if (eos == 0)
+ break;
+ else
+ want = HEADER_SIZE;
+ } else {
+ /* suck in slen bytes of data */
+ want = slen;
+ if (want > (len - off)) {
+ size_t chunk_max = ASN1_CHUNK_INITIAL_SIZE;
+
+ want -= (len - off);
+ if (want > INT_MAX /* BIO_read takes an int length */ ||
+ len + want < len) {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO, ASN1_R_TOO_LONG);
+ goto err;
+ }
+ while (want > 0) {
+ /*
+ * Read content in chunks of increasing size
+ * so we can return an error for EOF without
+ * having to allocate the entire content length
+ * in one go.
+ */
+ size_t chunk = want > chunk_max ? chunk_max : want;
+
+ if (!BUF_MEM_grow_clean(b, len + chunk)) {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ want -= chunk;
+ while (chunk > 0) {
+ i = BIO_read(in, &(b->data[len]), chunk);
+ if (i <= 0) {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,
+ ASN1_R_NOT_ENOUGH_DATA);
+ goto err;
+ }
+ /*
+ * This can't overflow because |len+want| didn't
+ * overflow.
+ */
+ len += i;
+ chunk -= i;
+ }
+ if (chunk_max < INT_MAX/2)
+ chunk_max *= 2;
+ }
+ }
+ if (off + slen < off) {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO, ASN1_R_TOO_LONG);
+ goto err;
+ }
+ off += slen;
+ if (eos == 0) {
+ break;
+ } else
+ want = HEADER_SIZE;
+ }
+ }
+
+ if (off > INT_MAX) {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO, ASN1_R_TOO_LONG);
+ goto err;
+ }
+
+ *pb = b;
+ return off;
+ err:
+ BUF_MEM_free(b);
+ return -1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_digest.c b/openssl-1.1.0h/crypto/asn1/a_digest.c
new file mode 100644
index 0000000..c84ecc9
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_digest.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include <sys/types.h>
+
+#include "internal/cryptlib.h"
+
+#include <openssl/err.h>
+#include <openssl/evp.h>
+#include <openssl/buffer.h>
+#include <openssl/x509.h>
+
+#ifndef NO_ASN1_OLD
+
+int ASN1_digest(i2d_of_void *i2d, const EVP_MD *type, char *data,
+ unsigned char *md, unsigned int *len)
+{
+ int i;
+ unsigned char *str, *p;
+
+ i = i2d(data, NULL);
+ if ((str = OPENSSL_malloc(i)) == NULL) {
+ ASN1err(ASN1_F_ASN1_DIGEST, ERR_R_MALLOC_FAILURE);
+ return (0);
+ }
+ p = str;
+ i2d(data, &p);
+
+ if (!EVP_Digest(str, i, md, len, type, NULL)) {
+ OPENSSL_free(str);
+ return 0;
+ }
+ OPENSSL_free(str);
+ return (1);
+}
+
+#endif
+
+int ASN1_item_digest(const ASN1_ITEM *it, const EVP_MD *type, void *asn,
+ unsigned char *md, unsigned int *len)
+{
+ int i;
+ unsigned char *str = NULL;
+
+ i = ASN1_item_i2d(asn, &str, it);
+ if (!str)
+ return (0);
+
+ if (!EVP_Digest(str, i, md, len, type, NULL)) {
+ OPENSSL_free(str);
+ return 0;
+ }
+ OPENSSL_free(str);
+ return (1);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_dup.c b/openssl-1.1.0h/crypto/asn1/a_dup.c
new file mode 100644
index 0000000..d9a57b2
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_dup.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+
+#ifndef NO_OLD_ASN1
+
+void *ASN1_dup(i2d_of_void *i2d, d2i_of_void *d2i, void *x)
+{
+ unsigned char *b, *p;
+ const unsigned char *p2;
+ int i;
+ char *ret;
+
+ if (x == NULL)
+ return (NULL);
+
+ i = i2d(x, NULL);
+ b = OPENSSL_malloc(i + 10);
+ if (b == NULL) {
+ ASN1err(ASN1_F_ASN1_DUP, ERR_R_MALLOC_FAILURE);
+ return (NULL);
+ }
+ p = b;
+ i = i2d(x, &p);
+ p2 = b;
+ ret = d2i(NULL, &p2, i);
+ OPENSSL_free(b);
+ return (ret);
+}
+
+#endif
+
+/*
+ * ASN1_ITEM version of dup: this follows the model above except we don't
+ * need to allocate the buffer. At some point this could be rewritten to
+ * directly dup the underlying structure instead of doing and encode and
+ * decode.
+ */
+
+void *ASN1_item_dup(const ASN1_ITEM *it, void *x)
+{
+ unsigned char *b = NULL;
+ const unsigned char *p;
+ long i;
+ void *ret;
+
+ if (x == NULL)
+ return (NULL);
+
+ i = ASN1_item_i2d(x, &b, it);
+ if (b == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_DUP, ERR_R_MALLOC_FAILURE);
+ return (NULL);
+ }
+ p = b;
+ ret = ASN1_item_d2i(NULL, &p, i, it);
+ OPENSSL_free(b);
+ return (ret);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_gentm.c b/openssl-1.1.0h/crypto/asn1/a_gentm.c
new file mode 100644
index 0000000..ff1b695
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_gentm.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * GENERALIZEDTIME implementation. Based on UTCTIME
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+#include "asn1_locl.h"
+
+int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d)
+{
+ static const int min[9] = { 0, 0, 1, 1, 0, 0, 0, 0, 0 };
+ static const int max[9] = { 99, 99, 12, 31, 23, 59, 59, 12, 59 };
+ char *a;
+ int n, i, l, o;
+
+ if (d->type != V_ASN1_GENERALIZEDTIME)
+ return (0);
+ l = d->length;
+ a = (char *)d->data;
+ o = 0;
+ /*
+ * GENERALIZEDTIME is similar to UTCTIME except the year is represented
+ * as YYYY. This stuff treats everything as a two digit field so make
+ * first two fields 00 to 99
+ */
+ if (l < 13)
+ goto err;
+ for (i = 0; i < 7; i++) {
+ if ((i == 6) && ((a[o] == 'Z') || (a[o] == '+') || (a[o] == '-'))) {
+ i++;
+ if (tm)
+ tm->tm_sec = 0;
+ break;
+ }
+ if ((a[o] < '0') || (a[o] > '9'))
+ goto err;
+ n = a[o] - '0';
+ if (++o > l)
+ goto err;
+
+ if ((a[o] < '0') || (a[o] > '9'))
+ goto err;
+ n = (n * 10) + a[o] - '0';
+ if (++o > l)
+ goto err;
+
+ if ((n < min[i]) || (n > max[i]))
+ goto err;
+ if (tm) {
+ switch (i) {
+ case 0:
+ tm->tm_year = n * 100 - 1900;
+ break;
+ case 1:
+ tm->tm_year += n;
+ break;
+ case 2:
+ tm->tm_mon = n - 1;
+ break;
+ case 3:
+ tm->tm_mday = n;
+ break;
+ case 4:
+ tm->tm_hour = n;
+ break;
+ case 5:
+ tm->tm_min = n;
+ break;
+ case 6:
+ tm->tm_sec = n;
+ break;
+ }
+ }
+ }
+ /*
+ * Optional fractional seconds: decimal point followed by one or more
+ * digits.
+ */
+ if (a[o] == '.') {
+ if (++o > l)
+ goto err;
+ i = o;
+ while ((a[o] >= '0') && (a[o] <= '9') && (o <= l))
+ o++;
+ /* Must have at least one digit after decimal point */
+ if (i == o)
+ goto err;
+ }
+
+ if (a[o] == 'Z')
+ o++;
+ else if ((a[o] == '+') || (a[o] == '-')) {
+ int offsign = a[o] == '-' ? 1 : -1, offset = 0;
+ o++;
+ if (o + 4 > l)
+ goto err;
+ for (i = 7; i < 9; i++) {
+ if ((a[o] < '0') || (a[o] > '9'))
+ goto err;
+ n = a[o] - '0';
+ o++;
+ if ((a[o] < '0') || (a[o] > '9'))
+ goto err;
+ n = (n * 10) + a[o] - '0';
+ if ((n < min[i]) || (n > max[i]))
+ goto err;
+ if (tm) {
+ if (i == 7)
+ offset = n * 3600;
+ else if (i == 8)
+ offset += n * 60;
+ }
+ o++;
+ }
+ if (offset && !OPENSSL_gmtime_adj(tm, 0, offset * offsign))
+ return 0;
+ } else if (a[o]) {
+ /* Missing time zone information. */
+ goto err;
+ }
+ return (o == l);
+ err:
+ return (0);
+}
+
+int ASN1_GENERALIZEDTIME_check(const ASN1_GENERALIZEDTIME *d)
+{
+ return asn1_generalizedtime_to_tm(NULL, d);
+}
+
+int ASN1_GENERALIZEDTIME_set_string(ASN1_GENERALIZEDTIME *s, const char *str)
+{
+ ASN1_GENERALIZEDTIME t;
+
+ t.type = V_ASN1_GENERALIZEDTIME;
+ t.length = strlen(str);
+ t.data = (unsigned char *)str;
+ if (ASN1_GENERALIZEDTIME_check(&t)) {
+ if (s != NULL) {
+ if (!ASN1_STRING_set((ASN1_STRING *)s, str, t.length))
+ return 0;
+ s->type = V_ASN1_GENERALIZEDTIME;
+ }
+ return (1);
+ } else
+ return (0);
+}
+
+ASN1_GENERALIZEDTIME *ASN1_GENERALIZEDTIME_set(ASN1_GENERALIZEDTIME *s,
+ time_t t)
+{
+ return ASN1_GENERALIZEDTIME_adj(s, t, 0, 0);
+}
+
+ASN1_GENERALIZEDTIME *ASN1_GENERALIZEDTIME_adj(ASN1_GENERALIZEDTIME *s,
+ time_t t, int offset_day,
+ long offset_sec)
+{
+ char *p;
+ struct tm *ts;
+ struct tm data;
+ size_t len = 20;
+ ASN1_GENERALIZEDTIME *tmps = NULL;
+
+ if (s == NULL)
+ tmps = ASN1_GENERALIZEDTIME_new();
+ else
+ tmps = s;
+ if (tmps == NULL)
+ return NULL;
+
+ ts = OPENSSL_gmtime(&t, &data);
+ if (ts == NULL)
+ goto err;
+
+ if (offset_day || offset_sec) {
+ if (!OPENSSL_gmtime_adj(ts, offset_day, offset_sec))
+ goto err;
+ }
+
+ p = (char *)tmps->data;
+ if ((p == NULL) || ((size_t)tmps->length < len)) {
+ p = OPENSSL_malloc(len);
+ if (p == NULL) {
+ ASN1err(ASN1_F_ASN1_GENERALIZEDTIME_ADJ, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ OPENSSL_free(tmps->data);
+ tmps->data = (unsigned char *)p;
+ }
+
+ BIO_snprintf(p, len, "%04d%02d%02d%02d%02d%02dZ", ts->tm_year + 1900,
+ ts->tm_mon + 1, ts->tm_mday, ts->tm_hour, ts->tm_min,
+ ts->tm_sec);
+ tmps->length = strlen(p);
+ tmps->type = V_ASN1_GENERALIZEDTIME;
+#ifdef CHARSET_EBCDIC_not
+ ebcdic2ascii(tmps->data, tmps->data, tmps->length);
+#endif
+ return tmps;
+ err:
+ if (s == NULL)
+ ASN1_GENERALIZEDTIME_free(tmps);
+ return NULL;
+}
+
+const char *_asn1_mon[12] = {
+ "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
+};
+
+int ASN1_GENERALIZEDTIME_print(BIO *bp, const ASN1_GENERALIZEDTIME *tm)
+{
+ char *v;
+ int gmt = 0;
+ int i;
+ int y = 0, M = 0, d = 0, h = 0, m = 0, s = 0;
+ char *f = NULL;
+ int f_len = 0;
+
+ i = tm->length;
+ v = (char *)tm->data;
+
+ if (i < 12)
+ goto err;
+ if (v[i - 1] == 'Z')
+ gmt = 1;
+ for (i = 0; i < 12; i++)
+ if ((v[i] > '9') || (v[i] < '0'))
+ goto err;
+ y = (v[0] - '0') * 1000 + (v[1] - '0') * 100
+ + (v[2] - '0') * 10 + (v[3] - '0');
+ M = (v[4] - '0') * 10 + (v[5] - '0');
+ if ((M > 12) || (M < 1))
+ goto err;
+ d = (v[6] - '0') * 10 + (v[7] - '0');
+ h = (v[8] - '0') * 10 + (v[9] - '0');
+ m = (v[10] - '0') * 10 + (v[11] - '0');
+ if (tm->length >= 14 &&
+ (v[12] >= '0') && (v[12] <= '9') &&
+ (v[13] >= '0') && (v[13] <= '9')) {
+ s = (v[12] - '0') * 10 + (v[13] - '0');
+ /* Check for fractions of seconds. */
+ if (tm->length >= 15 && v[14] == '.') {
+ int l = tm->length;
+ f = &v[14]; /* The decimal point. */
+ f_len = 1;
+ while (14 + f_len < l && f[f_len] >= '0' && f[f_len] <= '9')
+ ++f_len;
+ }
+ }
+
+ if (BIO_printf(bp, "%s %2d %02d:%02d:%02d%.*s %d%s",
+ _asn1_mon[M - 1], d, h, m, s, f_len, f, y,
+ (gmt) ? " GMT" : "") <= 0)
+ return (0);
+ else
+ return (1);
+ err:
+ BIO_write(bp, "Bad time value", 14);
+ return (0);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_i2d_fp.c b/openssl-1.1.0h/crypto/asn1/a_i2d_fp.c
new file mode 100644
index 0000000..3b3f713
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_i2d_fp.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/buffer.h>
+#include <openssl/asn1.h>
+
+#ifndef NO_OLD_ASN1
+
+# ifndef OPENSSL_NO_STDIO
+int ASN1_i2d_fp(i2d_of_void *i2d, FILE *out, void *x)
+{
+ BIO *b;
+ int ret;
+
+ if ((b = BIO_new(BIO_s_file())) == NULL) {
+ ASN1err(ASN1_F_ASN1_I2D_FP, ERR_R_BUF_LIB);
+ return (0);
+ }
+ BIO_set_fp(b, out, BIO_NOCLOSE);
+ ret = ASN1_i2d_bio(i2d, b, x);
+ BIO_free(b);
+ return (ret);
+}
+# endif
+
+int ASN1_i2d_bio(i2d_of_void *i2d, BIO *out, unsigned char *x)
+{
+ char *b;
+ unsigned char *p;
+ int i, j = 0, n, ret = 1;
+
+ n = i2d(x, NULL);
+ if (n <= 0)
+ return 0;
+
+ b = OPENSSL_malloc(n);
+ if (b == NULL) {
+ ASN1err(ASN1_F_ASN1_I2D_BIO, ERR_R_MALLOC_FAILURE);
+ return (0);
+ }
+
+ p = (unsigned char *)b;
+ i2d(x, &p);
+
+ for (;;) {
+ i = BIO_write(out, &(b[j]), n);
+ if (i == n)
+ break;
+ if (i <= 0) {
+ ret = 0;
+ break;
+ }
+ j += i;
+ n -= i;
+ }
+ OPENSSL_free(b);
+ return (ret);
+}
+
+#endif
+
+#ifndef OPENSSL_NO_STDIO
+int ASN1_item_i2d_fp(const ASN1_ITEM *it, FILE *out, void *x)
+{
+ BIO *b;
+ int ret;
+
+ if ((b = BIO_new(BIO_s_file())) == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_I2D_FP, ERR_R_BUF_LIB);
+ return (0);
+ }
+ BIO_set_fp(b, out, BIO_NOCLOSE);
+ ret = ASN1_item_i2d_bio(it, b, x);
+ BIO_free(b);
+ return (ret);
+}
+#endif
+
+int ASN1_item_i2d_bio(const ASN1_ITEM *it, BIO *out, void *x)
+{
+ unsigned char *b = NULL;
+ int i, j = 0, n, ret = 1;
+
+ n = ASN1_item_i2d(x, &b, it);
+ if (b == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_I2D_BIO, ERR_R_MALLOC_FAILURE);
+ return (0);
+ }
+
+ for (;;) {
+ i = BIO_write(out, &(b[j]), n);
+ if (i == n)
+ break;
+ if (i <= 0) {
+ ret = 0;
+ break;
+ }
+ j += i;
+ n -= i;
+ }
+ OPENSSL_free(b);
+ return (ret);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_int.c b/openssl-1.1.0h/crypto/asn1/a_int.c
new file mode 100644
index 0000000..217650a
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_int.c
@@ -0,0 +1,630 @@
+/*
+ * Copyright 1995-2017 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include "internal/numbers.h"
+#include <limits.h>
+#include <openssl/asn1.h>
+#include <openssl/bn.h>
+#include "asn1_locl.h"
+
+ASN1_INTEGER *ASN1_INTEGER_dup(const ASN1_INTEGER *x)
+{
+ return ASN1_STRING_dup(x);
+}
+
+int ASN1_INTEGER_cmp(const ASN1_INTEGER *x, const ASN1_INTEGER *y)
+{
+ int neg, ret;
+ /* Compare signs */
+ neg = x->type & V_ASN1_NEG;
+ if (neg != (y->type & V_ASN1_NEG)) {
+ if (neg)
+ return -1;
+ else
+ return 1;
+ }
+
+ ret = ASN1_STRING_cmp(x, y);
+
+ if (neg)
+ return -ret;
+ else
+ return ret;
+}
+
+/*-
+ * This converts a big endian buffer and sign into its content encoding.
+ * This is used for INTEGER and ENUMERATED types.
+ * The internal representation is an ASN1_STRING whose data is a big endian
+ * representation of the value, ignoring the sign. The sign is determined by
+ * the type: if type & V_ASN1_NEG is true it is negative, otherwise positive.
+ *
+ * Positive integers are no problem: they are almost the same as the DER
+ * encoding, except if the first byte is >= 0x80 we need to add a zero pad.
+ *
+ * Negative integers are a bit trickier...
+ * The DER representation of negative integers is in 2s complement form.
+ * The internal form is converted by complementing each octet and finally
+ * adding one to the result. This can be done less messily with a little trick.
+ * If the internal form has trailing zeroes then they will become FF by the
+ * complement and 0 by the add one (due to carry) so just copy as many trailing
+ * zeros to the destination as there are in the source. The carry will add one
+ * to the last none zero octet: so complement this octet and add one and finally
+ * complement any left over until you get to the start of the string.
+ *
+ * Padding is a little trickier too. If the first bytes is > 0x80 then we pad
+ * with 0xff. However if the first byte is 0x80 and one of the following bytes
+ * is non-zero we pad with 0xff. The reason for this distinction is that 0x80
+ * followed by optional zeros isn't padded.
+ */
+
+/*
+ * If |pad| is zero, the operation is effectively reduced to memcpy,
+ * and if |pad| is 0xff, then it performs two's complement, ~dst + 1.
+ * Note that in latter case sequence of zeros yields itself, and so
+ * does 0x80 followed by any number of zeros. These properties are
+ * used elsewhere below...
+ */
+static void twos_complement(unsigned char *dst, const unsigned char *src,
+ size_t len, unsigned char pad)
+{
+ unsigned int carry = pad & 1;
+
+ /* Begin at the end of the encoding */
+ dst += len;
+ src += len;
+ /* two's complement value: ~value + 1 */
+ while (len-- != 0) {
+ *(--dst) = (unsigned char)(carry += *(--src) ^ pad);
+ carry >>= 8;
+ }
+}
+
+static size_t i2c_ibuf(const unsigned char *b, size_t blen, int neg,
+ unsigned char **pp)
+{
+ unsigned int pad = 0;
+ size_t ret, i;
+ unsigned char *p, pb = 0;
+
+ if (b != NULL && blen) {
+ ret = blen;
+ i = b[0];
+ if (!neg && (i > 127)) {
+ pad = 1;
+ pb = 0;
+ } else if (neg) {
+ pb = 0xFF;
+ if (i > 128) {
+ pad = 1;
+ } else if (i == 128) {
+ /*
+ * Special case [of minimal negative for given length]:
+ * if any other bytes non zero we pad, otherwise we don't.
+ */
+ for (pad = 0, i = 1; i < blen; i++)
+ pad |= b[i];
+ pb = pad != 0 ? 0xffU : 0;
+ pad = pb & 1;
+ }
+ }
+ ret += pad;
+ } else {
+ ret = 1;
+ blen = 0; /* reduce '(b == NULL || blen == 0)' to '(blen == 0)' */
+ }
+
+ if (pp == NULL || (p = *pp) == NULL)
+ return ret;
+
+ /*
+ * This magically handles all corner cases, such as '(b == NULL ||
+ * blen == 0)', non-negative value, "negative" zero, 0x80 followed
+ * by any number of zeros...
+ */
+ *p = pb;
+ p += pad; /* yes, p[0] can be written twice, but it's little
+ * price to pay for eliminated branches */
+ twos_complement(p, b, blen, pb);
+
+ *pp += ret;
+ return ret;
+}
+
+/*
+ * convert content octets into a big endian buffer. Returns the length
+ * of buffer or 0 on error: for malformed INTEGER. If output buffer is
+ * NULL just return length.
+ */
+
+static size_t c2i_ibuf(unsigned char *b, int *pneg,
+ const unsigned char *p, size_t plen)
+{
+ int neg, pad;
+ /* Zero content length is illegal */
+ if (plen == 0) {
+ ASN1err(ASN1_F_C2I_IBUF, ASN1_R_ILLEGAL_ZERO_CONTENT);
+ return 0;
+ }
+ neg = p[0] & 0x80;
+ if (pneg)
+ *pneg = neg;
+ /* Handle common case where length is 1 octet separately */
+ if (plen == 1) {
+ if (b != NULL) {
+ if (neg)
+ b[0] = (p[0] ^ 0xFF) + 1;
+ else
+ b[0] = p[0];
+ }
+ return 1;
+ }
+
+ pad = 0;
+ if (p[0] == 0) {
+ pad = 1;
+ } else if (p[0] == 0xFF) {
+ size_t i;
+
+ /*
+ * Special case [of "one less minimal negative" for given length]:
+ * if any other bytes non zero it was padded, otherwise not.
+ */
+ for (pad = 0, i = 1; i < plen; i++)
+ pad |= p[i];
+ pad = pad != 0 ? 1 : 0;
+ }
+ /* reject illegal padding: first two octets MSB can't match */
+ if (pad && (neg == (p[1] & 0x80))) {
+ ASN1err(ASN1_F_C2I_IBUF, ASN1_R_ILLEGAL_PADDING);
+ return 0;
+ }
+
+ /* skip over pad */
+ p += pad;
+ plen -= pad;
+
+ if (b != NULL)
+ twos_complement(b, p, plen, neg ? 0xffU : 0);
+
+ return plen;
+}
+
+int i2c_ASN1_INTEGER(ASN1_INTEGER *a, unsigned char **pp)
+{
+ return i2c_ibuf(a->data, a->length, a->type & V_ASN1_NEG, pp);
+}
+
+/* Convert big endian buffer into uint64_t, return 0 on error */
+static int asn1_get_uint64(uint64_t *pr, const unsigned char *b, size_t blen)
+{
+ size_t i;
+ uint64_t r;
+
+ if (blen > sizeof(*pr)) {
+ ASN1err(ASN1_F_ASN1_GET_UINT64, ASN1_R_TOO_LARGE);
+ return 0;
+ }
+ if (b == NULL)
+ return 0;
+ for (r = 0, i = 0; i < blen; i++) {
+ r <<= 8;
+ r |= b[i];
+ }
+ *pr = r;
+ return 1;
+}
+
+/*
+ * Write uint64_t to big endian buffer and return offset to first
+ * written octet. In other words it returns offset in range from 0
+ * to 7, with 0 denoting 8 written octets and 7 - one.
+ */
+static size_t asn1_put_uint64(unsigned char b[sizeof(uint64_t)], uint64_t r)
+{
+ size_t off = sizeof(uint64_t);
+
+ do {
+ b[--off] = (unsigned char)r;
+ } while (r >>= 8);
+
+ return off;
+}
+
+/*
+ * Absolute value of INT64_MIN: we can't just use -INT64_MIN as gcc produces
+ * overflow warnings.
+ */
+#define ABS_INT64_MIN ((uint64_t)INT64_MAX + (-(INT64_MIN + INT64_MAX)))
+
+/* signed version of asn1_get_uint64 */
+static int asn1_get_int64(int64_t *pr, const unsigned char *b, size_t blen,
+ int neg)
+{
+ uint64_t r;
+ if (asn1_get_uint64(&r, b, blen) == 0)
+ return 0;
+ if (neg) {
+ if (r <= INT64_MAX) {
+ /* Most significant bit is guaranteed to be clear, negation
+ * is guaranteed to be meaningful in platform-neutral sense. */
+ *pr = -(int64_t)r;
+ } else if (r == ABS_INT64_MIN) {
+ /* This never happens if INT64_MAX == ABS_INT64_MIN, e.g.
+ * on ones'-complement system. */
+ *pr = (int64_t)(0 - r);
+ } else {
+ ASN1err(ASN1_F_ASN1_GET_INT64, ASN1_R_TOO_SMALL);
+ return 0;
+ }
+ } else {
+ if (r <= INT64_MAX) {
+ *pr = (int64_t)r;
+ } else {
+ ASN1err(ASN1_F_ASN1_GET_INT64, ASN1_R_TOO_LARGE);
+ return 0;
+ }
+ }
+ return 1;
+}
+
+/* Convert ASN1 INTEGER content octets to ASN1_INTEGER structure */
+ASN1_INTEGER *c2i_ASN1_INTEGER(ASN1_INTEGER **a, const unsigned char **pp,
+ long len)
+{
+ ASN1_INTEGER *ret = NULL;
+ size_t r;
+ int neg;
+
+ r = c2i_ibuf(NULL, NULL, *pp, len);
+
+ if (r == 0)
+ return NULL;
+
+ if ((a == NULL) || ((*a) == NULL)) {
+ ret = ASN1_INTEGER_new();
+ if (ret == NULL)
+ return NULL;
+ ret->type = V_ASN1_INTEGER;
+ } else
+ ret = *a;
+
+ if (ASN1_STRING_set(ret, NULL, r) == 0)
+ goto err;
+
+ c2i_ibuf(ret->data, &neg, *pp, len);
+
+ if (neg)
+ ret->type |= V_ASN1_NEG;
+
+ *pp += len;
+ if (a != NULL)
+ (*a) = ret;
+ return ret;
+ err:
+ ASN1err(ASN1_F_C2I_ASN1_INTEGER, ERR_R_MALLOC_FAILURE);
+ if ((a == NULL) || (*a != ret))
+ ASN1_INTEGER_free(ret);
+ return NULL;
+}
+
+static int asn1_string_get_int64(int64_t *pr, const ASN1_STRING *a, int itype)
+{
+ if (a == NULL) {
+ ASN1err(ASN1_F_ASN1_STRING_GET_INT64, ERR_R_PASSED_NULL_PARAMETER);
+ return 0;
+ }
+ if ((a->type & ~V_ASN1_NEG) != itype) {
+ ASN1err(ASN1_F_ASN1_STRING_GET_INT64, ASN1_R_WRONG_INTEGER_TYPE);
+ return 0;
+ }
+ return asn1_get_int64(pr, a->data, a->length, a->type & V_ASN1_NEG);
+}
+
+static int asn1_string_set_int64(ASN1_STRING *a, int64_t r, int itype)
+{
+ unsigned char tbuf[sizeof(r)];
+ size_t off;
+
+ a->type = itype;
+ if (r < 0) {
+ /* Most obvious '-r' triggers undefined behaviour for most
+ * common INT64_MIN. Even though below '0 - (uint64_t)r' can
+ * appear two's-complement centric, it does produce correct/
+ * expected result even on one's-complement. This is because
+ * cast to unsigned has to change bit pattern... */
+ off = asn1_put_uint64(tbuf, 0 - (uint64_t)r);
+ a->type |= V_ASN1_NEG;
+ } else {
+ off = asn1_put_uint64(tbuf, r);
+ a->type &= ~V_ASN1_NEG;
+ }
+ return ASN1_STRING_set(a, tbuf + off, sizeof(tbuf) - off);
+}
+
+static int asn1_string_get_uint64(uint64_t *pr, const ASN1_STRING *a,
+ int itype)
+{
+ if (a == NULL) {
+ ASN1err(ASN1_F_ASN1_STRING_GET_UINT64, ERR_R_PASSED_NULL_PARAMETER);
+ return 0;
+ }
+ if ((a->type & ~V_ASN1_NEG) != itype) {
+ ASN1err(ASN1_F_ASN1_STRING_GET_UINT64, ASN1_R_WRONG_INTEGER_TYPE);
+ return 0;
+ }
+ if (a->type & V_ASN1_NEG) {
+ ASN1err(ASN1_F_ASN1_STRING_GET_UINT64, ASN1_R_ILLEGAL_NEGATIVE_VALUE);
+ return 0;
+ }
+ return asn1_get_uint64(pr, a->data, a->length);
+}
+
+static int asn1_string_set_uint64(ASN1_STRING *a, uint64_t r, int itype)
+{
+ unsigned char tbuf[sizeof(r)];
+ size_t off;
+
+ a->type = itype;
+ off = asn1_put_uint64(tbuf, r);
+ return ASN1_STRING_set(a, tbuf + off, sizeof(tbuf) - off);
+}
+
+/*
+ * This is a version of d2i_ASN1_INTEGER that ignores the sign bit of ASN1
+ * integers: some broken software can encode a positive INTEGER with its MSB
+ * set as negative (it doesn't add a padding zero).
+ */
+
+ASN1_INTEGER *d2i_ASN1_UINTEGER(ASN1_INTEGER **a, const unsigned char **pp,
+ long length)
+{
+ ASN1_INTEGER *ret = NULL;
+ const unsigned char *p;
+ unsigned char *s;
+ long len;
+ int inf, tag, xclass;
+ int i;
+
+ if ((a == NULL) || ((*a) == NULL)) {
+ if ((ret = ASN1_INTEGER_new()) == NULL)
+ return (NULL);
+ ret->type = V_ASN1_INTEGER;
+ } else
+ ret = (*a);
+
+ p = *pp;
+ inf = ASN1_get_object(&p, &len, &tag, &xclass, length);
+ if (inf & 0x80) {
+ i = ASN1_R_BAD_OBJECT_HEADER;
+ goto err;
+ }
+
+ if (tag != V_ASN1_INTEGER) {
+ i = ASN1_R_EXPECTING_AN_INTEGER;
+ goto err;
+ }
+
+ /*
+ * We must OPENSSL_malloc stuff, even for 0 bytes otherwise it signifies
+ * a missing NULL parameter.
+ */
+ s = OPENSSL_malloc((int)len + 1);
+ if (s == NULL) {
+ i = ERR_R_MALLOC_FAILURE;
+ goto err;
+ }
+ ret->type = V_ASN1_INTEGER;
+ if (len) {
+ if ((*p == 0) && (len != 1)) {
+ p++;
+ len--;
+ }
+ memcpy(s, p, (int)len);
+ p += len;
+ }
+
+ OPENSSL_free(ret->data);
+ ret->data = s;
+ ret->length = (int)len;
+ if (a != NULL)
+ (*a) = ret;
+ *pp = p;
+ return (ret);
+ err:
+ ASN1err(ASN1_F_D2I_ASN1_UINTEGER, i);
+ if ((a == NULL) || (*a != ret))
+ ASN1_INTEGER_free(ret);
+ return (NULL);
+}
+
+static ASN1_STRING *bn_to_asn1_string(const BIGNUM *bn, ASN1_STRING *ai,
+ int atype)
+{
+ ASN1_INTEGER *ret;
+ int len;
+
+ if (ai == NULL) {
+ ret = ASN1_STRING_type_new(atype);
+ } else {
+ ret = ai;
+ ret->type = atype;
+ }
+
+ if (ret == NULL) {
+ ASN1err(ASN1_F_BN_TO_ASN1_STRING, ERR_R_NESTED_ASN1_ERROR);
+ goto err;
+ }
+
+ if (BN_is_negative(bn) && !BN_is_zero(bn))
+ ret->type |= V_ASN1_NEG_INTEGER;
+
+ len = BN_num_bytes(bn);
+
+ if (len == 0)
+ len = 1;
+
+ if (ASN1_STRING_set(ret, NULL, len) == 0) {
+ ASN1err(ASN1_F_BN_TO_ASN1_STRING, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ /* Correct zero case */
+ if (BN_is_zero(bn))
+ ret->data[0] = 0;
+ else
+ len = BN_bn2bin(bn, ret->data);
+ ret->length = len;
+ return ret;
+ err:
+ if (ret != ai)
+ ASN1_INTEGER_free(ret);
+ return (NULL);
+}
+
+static BIGNUM *asn1_string_to_bn(const ASN1_INTEGER *ai, BIGNUM *bn,
+ int itype)
+{
+ BIGNUM *ret;
+
+ if ((ai->type & ~V_ASN1_NEG) != itype) {
+ ASN1err(ASN1_F_ASN1_STRING_TO_BN, ASN1_R_WRONG_INTEGER_TYPE);
+ return NULL;
+ }
+
+ ret = BN_bin2bn(ai->data, ai->length, bn);
+ if (ret == NULL) {
+ ASN1err(ASN1_F_ASN1_STRING_TO_BN, ASN1_R_BN_LIB);
+ return NULL;
+ }
+ if (ai->type & V_ASN1_NEG)
+ BN_set_negative(ret, 1);
+ return ret;
+}
+
+int ASN1_INTEGER_get_int64(int64_t *pr, const ASN1_INTEGER *a)
+{
+ return asn1_string_get_int64(pr, a, V_ASN1_INTEGER);
+}
+
+int ASN1_INTEGER_set_int64(ASN1_INTEGER *a, int64_t r)
+{
+ return asn1_string_set_int64(a, r, V_ASN1_INTEGER);
+}
+
+int ASN1_INTEGER_get_uint64(uint64_t *pr, const ASN1_INTEGER *a)
+{
+ return asn1_string_get_uint64(pr, a, V_ASN1_INTEGER);
+}
+
+int ASN1_INTEGER_set_uint64(ASN1_INTEGER *a, uint64_t r)
+{
+ return asn1_string_set_uint64(a, r, V_ASN1_INTEGER);
+}
+
+int ASN1_INTEGER_set(ASN1_INTEGER *a, long v)
+{
+ return ASN1_INTEGER_set_int64(a, v);
+}
+
+long ASN1_INTEGER_get(const ASN1_INTEGER *a)
+{
+ int i;
+ int64_t r;
+ if (a == NULL)
+ return 0;
+ i = ASN1_INTEGER_get_int64(&r, a);
+ if (i == 0)
+ return -1;
+ if (r > LONG_MAX || r < LONG_MIN)
+ return -1;
+ return (long)r;
+}
+
+ASN1_INTEGER *BN_to_ASN1_INTEGER(const BIGNUM *bn, ASN1_INTEGER *ai)
+{
+ return bn_to_asn1_string(bn, ai, V_ASN1_INTEGER);
+}
+
+BIGNUM *ASN1_INTEGER_to_BN(const ASN1_INTEGER *ai, BIGNUM *bn)
+{
+ return asn1_string_to_bn(ai, bn, V_ASN1_INTEGER);
+}
+
+int ASN1_ENUMERATED_get_int64(int64_t *pr, const ASN1_ENUMERATED *a)
+{
+ return asn1_string_get_int64(pr, a, V_ASN1_ENUMERATED);
+}
+
+int ASN1_ENUMERATED_set_int64(ASN1_ENUMERATED *a, int64_t r)
+{
+ return asn1_string_set_int64(a, r, V_ASN1_ENUMERATED);
+}
+
+int ASN1_ENUMERATED_set(ASN1_ENUMERATED *a, long v)
+{
+ return ASN1_ENUMERATED_set_int64(a, v);
+}
+
+long ASN1_ENUMERATED_get(const ASN1_ENUMERATED *a)
+{
+ int i;
+ int64_t r;
+ if (a == NULL)
+ return 0;
+ if ((a->type & ~V_ASN1_NEG) != V_ASN1_ENUMERATED)
+ return -1;
+ if (a->length > (int)sizeof(long))
+ return 0xffffffffL;
+ i = ASN1_ENUMERATED_get_int64(&r, a);
+ if (i == 0)
+ return -1;
+ if (r > LONG_MAX || r < LONG_MIN)
+ return -1;
+ return (long)r;
+}
+
+ASN1_ENUMERATED *BN_to_ASN1_ENUMERATED(const BIGNUM *bn, ASN1_ENUMERATED *ai)
+{
+ return bn_to_asn1_string(bn, ai, V_ASN1_ENUMERATED);
+}
+
+BIGNUM *ASN1_ENUMERATED_to_BN(const ASN1_ENUMERATED *ai, BIGNUM *bn)
+{
+ return asn1_string_to_bn(ai, bn, V_ASN1_ENUMERATED);
+}
+
+/* Internal functions used by x_int64.c */
+int c2i_uint64_int(uint64_t *ret, int *neg, const unsigned char **pp, long len)
+{
+ unsigned char buf[sizeof(uint64_t)];
+ size_t buflen;
+
+ buflen = c2i_ibuf(NULL, NULL, *pp, len);
+ if (buflen == 0)
+ return 0;
+ if (buflen > sizeof(uint64_t)) {
+ ASN1err(ASN1_F_C2I_UINT64_INT, ASN1_R_TOO_LARGE);
+ return 0;
+ }
+ (void)c2i_ibuf(buf, neg, *pp, len);
+ return asn1_get_uint64(ret, buf, buflen);
+}
+
+int i2c_uint64_int(unsigned char *p, uint64_t r, int neg)
+{
+ unsigned char buf[sizeof(uint64_t)];
+ size_t off;
+
+ off = asn1_put_uint64(buf, r);
+ return i2c_ibuf(buf + off, sizeof(buf) - off, neg, &p);
+}
+
diff --git a/openssl-1.1.0h/crypto/asn1/a_mbstr.c b/openssl-1.1.0h/crypto/asn1/a_mbstr.c
new file mode 100644
index 0000000..7a035af
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_mbstr.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+
+static int traverse_string(const unsigned char *p, int len, int inform,
+ int (*rfunc) (unsigned long value, void *in),
+ void *arg);
+static int in_utf8(unsigned long value, void *arg);
+static int out_utf8(unsigned long value, void *arg);
+static int type_str(unsigned long value, void *arg);
+static int cpy_asc(unsigned long value, void *arg);
+static int cpy_bmp(unsigned long value, void *arg);
+static int cpy_univ(unsigned long value, void *arg);
+static int cpy_utf8(unsigned long value, void *arg);
+static int is_numeric(unsigned long value);
+static int is_printable(unsigned long value);
+
+/*
+ * These functions take a string in UTF8, ASCII or multibyte form and a mask
+ * of permissible ASN1 string types. It then works out the minimal type
+ * (using the order Numeric < Printable < IA5 < T61 < BMP < Universal < UTF8)
+ * and creates a string of the correct type with the supplied data. Yes this is
+ * horrible: it has to be :-( The 'ncopy' form checks minimum and maximum
+ * size limits too.
+ */
+
+int ASN1_mbstring_copy(ASN1_STRING **out, const unsigned char *in, int len,
+ int inform, unsigned long mask)
+{
+ return ASN1_mbstring_ncopy(out, in, len, inform, mask, 0, 0);
+}
+
+int ASN1_mbstring_ncopy(ASN1_STRING **out, const unsigned char *in, int len,
+ int inform, unsigned long mask,
+ long minsize, long maxsize)
+{
+ int str_type;
+ int ret;
+ char free_out;
+ int outform, outlen = 0;
+ ASN1_STRING *dest;
+ unsigned char *p;
+ int nchar;
+ char strbuf[32];
+ int (*cpyfunc) (unsigned long, void *) = NULL;
+ if (len == -1)
+ len = strlen((const char *)in);
+ if (!mask)
+ mask = DIRSTRING_TYPE;
+
+ /* First do a string check and work out the number of characters */
+ switch (inform) {
+
+ case MBSTRING_BMP:
+ if (len & 1) {
+ ASN1err(ASN1_F_ASN1_MBSTRING_NCOPY,
+ ASN1_R_INVALID_BMPSTRING_LENGTH);
+ return -1;
+ }
+ nchar = len >> 1;
+ break;
+
+ case MBSTRING_UNIV:
+ if (len & 3) {
+ ASN1err(ASN1_F_ASN1_MBSTRING_NCOPY,
+ ASN1_R_INVALID_UNIVERSALSTRING_LENGTH);
+ return -1;
+ }
+ nchar = len >> 2;
+ break;
+
+ case MBSTRING_UTF8:
+ nchar = 0;
+ /* This counts the characters and does utf8 syntax checking */
+ ret = traverse_string(in, len, MBSTRING_UTF8, in_utf8, &nchar);
+ if (ret < 0) {
+ ASN1err(ASN1_F_ASN1_MBSTRING_NCOPY, ASN1_R_INVALID_UTF8STRING);
+ return -1;
+ }
+ break;
+
+ case MBSTRING_ASC:
+ nchar = len;
+ break;
+
+ default:
+ ASN1err(ASN1_F_ASN1_MBSTRING_NCOPY, ASN1_R_UNKNOWN_FORMAT);
+ return -1;
+ }
+
+ if ((minsize > 0) && (nchar < minsize)) {
+ ASN1err(ASN1_F_ASN1_MBSTRING_NCOPY, ASN1_R_STRING_TOO_SHORT);
+ BIO_snprintf(strbuf, sizeof(strbuf), "%ld", minsize);
+ ERR_add_error_data(2, "minsize=", strbuf);
+ return -1;
+ }
+
+ if ((maxsize > 0) && (nchar > maxsize)) {
+ ASN1err(ASN1_F_ASN1_MBSTRING_NCOPY, ASN1_R_STRING_TOO_LONG);
+ BIO_snprintf(strbuf, sizeof(strbuf), "%ld", maxsize);
+ ERR_add_error_data(2, "maxsize=", strbuf);
+ return -1;
+ }
+
+ /* Now work out minimal type (if any) */
+ if (traverse_string(in, len, inform, type_str, &mask) < 0) {
+ ASN1err(ASN1_F_ASN1_MBSTRING_NCOPY, ASN1_R_ILLEGAL_CHARACTERS);
+ return -1;
+ }
+
+ /* Now work out output format and string type */
+ outform = MBSTRING_ASC;
+ if (mask & B_ASN1_NUMERICSTRING)
+ str_type = V_ASN1_NUMERICSTRING;
+ else if (mask & B_ASN1_PRINTABLESTRING)
+ str_type = V_ASN1_PRINTABLESTRING;
+ else if (mask & B_ASN1_IA5STRING)
+ str_type = V_ASN1_IA5STRING;
+ else if (mask & B_ASN1_T61STRING)
+ str_type = V_ASN1_T61STRING;
+ else if (mask & B_ASN1_BMPSTRING) {
+ str_type = V_ASN1_BMPSTRING;
+ outform = MBSTRING_BMP;
+ } else if (mask & B_ASN1_UNIVERSALSTRING) {
+ str_type = V_ASN1_UNIVERSALSTRING;
+ outform = MBSTRING_UNIV;
+ } else {
+ str_type = V_ASN1_UTF8STRING;
+ outform = MBSTRING_UTF8;
+ }
+ if (!out)
+ return str_type;
+ if (*out) {
+ free_out = 0;
+ dest = *out;
+ OPENSSL_free(dest->data);
+ dest->data = NULL;
+ dest->length = 0;
+ dest->type = str_type;
+ } else {
+ free_out = 1;
+ dest = ASN1_STRING_type_new(str_type);
+ if (dest == NULL) {
+ ASN1err(ASN1_F_ASN1_MBSTRING_NCOPY, ERR_R_MALLOC_FAILURE);
+ return -1;
+ }
+ *out = dest;
+ }
+ /* If both the same type just copy across */
+ if (inform == outform) {
+ if (!ASN1_STRING_set(dest, in, len)) {
+ ASN1err(ASN1_F_ASN1_MBSTRING_NCOPY, ERR_R_MALLOC_FAILURE);
+ return -1;
+ }
+ return str_type;
+ }
+
+ /* Work out how much space the destination will need */
+ switch (outform) {
+ case MBSTRING_ASC:
+ outlen = nchar;
+ cpyfunc = cpy_asc;
+ break;
+
+ case MBSTRING_BMP:
+ outlen = nchar << 1;
+ cpyfunc = cpy_bmp;
+ break;
+
+ case MBSTRING_UNIV:
+ outlen = nchar << 2;
+ cpyfunc = cpy_univ;
+ break;
+
+ case MBSTRING_UTF8:
+ outlen = 0;
+ traverse_string(in, len, inform, out_utf8, &outlen);
+ cpyfunc = cpy_utf8;
+ break;
+ }
+ if ((p = OPENSSL_malloc(outlen + 1)) == NULL) {
+ if (free_out)
+ ASN1_STRING_free(dest);
+ ASN1err(ASN1_F_ASN1_MBSTRING_NCOPY, ERR_R_MALLOC_FAILURE);
+ return -1;
+ }
+ dest->length = outlen;
+ dest->data = p;
+ p[outlen] = 0;
+ traverse_string(in, len, inform, cpyfunc, &p);
+ return str_type;
+}
+
+/*
+ * This function traverses a string and passes the value of each character to
+ * an optional function along with a void * argument.
+ */
+
+static int traverse_string(const unsigned char *p, int len, int inform,
+ int (*rfunc) (unsigned long value, void *in),
+ void *arg)
+{
+ unsigned long value;
+ int ret;
+ while (len) {
+ if (inform == MBSTRING_ASC) {
+ value = *p++;
+ len--;
+ } else if (inform == MBSTRING_BMP) {
+ value = *p++ << 8;
+ value |= *p++;
+ len -= 2;
+ } else if (inform == MBSTRING_UNIV) {
+ value = ((unsigned long)*p++) << 24;
+ value |= ((unsigned long)*p++) << 16;
+ value |= *p++ << 8;
+ value |= *p++;
+ len -= 4;
+ } else {
+ ret = UTF8_getc(p, len, &value);
+ if (ret < 0)
+ return -1;
+ len -= ret;
+ p += ret;
+ }
+ if (rfunc) {
+ ret = rfunc(value, arg);
+ if (ret <= 0)
+ return ret;
+ }
+ }
+ return 1;
+}
+
+/* Various utility functions for traverse_string */
+
+/* Just count number of characters */
+
+static int in_utf8(unsigned long value, void *arg)
+{
+ int *nchar;
+ nchar = arg;
+ (*nchar)++;
+ return 1;
+}
+
+/* Determine size of output as a UTF8 String */
+
+static int out_utf8(unsigned long value, void *arg)
+{
+ int *outlen;
+ outlen = arg;
+ *outlen += UTF8_putc(NULL, -1, value);
+ return 1;
+}
+
+/*
+ * Determine the "type" of a string: check each character against a supplied
+ * "mask".
+ */
+
+static int type_str(unsigned long value, void *arg)
+{
+ unsigned long types;
+ types = *((unsigned long *)arg);
+ if ((types & B_ASN1_NUMERICSTRING) && !is_numeric(value))
+ types &= ~B_ASN1_NUMERICSTRING;
+ if ((types & B_ASN1_PRINTABLESTRING) && !is_printable(value))
+ types &= ~B_ASN1_PRINTABLESTRING;
+ if ((types & B_ASN1_IA5STRING) && (value > 127))
+ types &= ~B_ASN1_IA5STRING;
+ if ((types & B_ASN1_T61STRING) && (value > 0xff))
+ types &= ~B_ASN1_T61STRING;
+ if ((types & B_ASN1_BMPSTRING) && (value > 0xffff))
+ types &= ~B_ASN1_BMPSTRING;
+ if (!types)
+ return -1;
+ *((unsigned long *)arg) = types;
+ return 1;
+}
+
+/* Copy one byte per character ASCII like strings */
+
+static int cpy_asc(unsigned long value, void *arg)
+{
+ unsigned char **p, *q;
+ p = arg;
+ q = *p;
+ *q = (unsigned char)value;
+ (*p)++;
+ return 1;
+}
+
+/* Copy two byte per character BMPStrings */
+
+static int cpy_bmp(unsigned long value, void *arg)
+{
+ unsigned char **p, *q;
+ p = arg;
+ q = *p;
+ *q++ = (unsigned char)((value >> 8) & 0xff);
+ *q = (unsigned char)(value & 0xff);
+ *p += 2;
+ return 1;
+}
+
+/* Copy four byte per character UniversalStrings */
+
+static int cpy_univ(unsigned long value, void *arg)
+{
+ unsigned char **p, *q;
+ p = arg;
+ q = *p;
+ *q++ = (unsigned char)((value >> 24) & 0xff);
+ *q++ = (unsigned char)((value >> 16) & 0xff);
+ *q++ = (unsigned char)((value >> 8) & 0xff);
+ *q = (unsigned char)(value & 0xff);
+ *p += 4;
+ return 1;
+}
+
+/* Copy to a UTF8String */
+
+static int cpy_utf8(unsigned long value, void *arg)
+{
+ unsigned char **p;
+ int ret;
+ p = arg;
+ /* We already know there is enough room so pass 0xff as the length */
+ ret = UTF8_putc(*p, 0xff, value);
+ *p += ret;
+ return 1;
+}
+
+/* Return 1 if the character is permitted in a PrintableString */
+static int is_printable(unsigned long value)
+{
+ int ch;
+ if (value > 0x7f)
+ return 0;
+ ch = (int)value;
+ /*
+ * Note: we can't use 'isalnum' because certain accented characters may
+ * count as alphanumeric in some environments.
+ */
+#ifndef CHARSET_EBCDIC
+ if ((ch >= 'a') && (ch <= 'z'))
+ return 1;
+ if ((ch >= 'A') && (ch <= 'Z'))
+ return 1;
+ if ((ch >= '0') && (ch <= '9'))
+ return 1;
+ if ((ch == ' ') || strchr("'()+,-./:=?", ch))
+ return 1;
+#else /* CHARSET_EBCDIC */
+ if ((ch >= os_toascii['a']) && (ch <= os_toascii['z']))
+ return 1;
+ if ((ch >= os_toascii['A']) && (ch <= os_toascii['Z']))
+ return 1;
+ if ((ch >= os_toascii['0']) && (ch <= os_toascii['9']))
+ return 1;
+ if ((ch == os_toascii[' ']) || strchr("'()+,-./:=?", os_toebcdic[ch]))
+ return 1;
+#endif /* CHARSET_EBCDIC */
+ return 0;
+}
+
+/* Return 1 if the character is a digit or space */
+static int is_numeric(unsigned long value)
+{
+ int ch;
+ if (value > 0x7f)
+ return 0;
+ ch = (int)value;
+#ifndef CHARSET_EBCDIC
+ if (!isdigit(ch) && ch != ' ')
+ return 0;
+#else
+ if (ch > os_toascii['9'])
+ return 0;
+ if (ch < os_toascii['0'] && ch != os_toascii[' '])
+ return 0;
+#endif
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_object.c b/openssl-1.1.0h/crypto/asn1/a_object.c
new file mode 100644
index 0000000..1ec7a7e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_object.c
@@ -0,0 +1,370 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <limits.h>
+#include "internal/cryptlib.h"
+#include <openssl/buffer.h>
+#include <openssl/asn1.h>
+#include <openssl/objects.h>
+#include <openssl/bn.h>
+#include "internal/asn1_int.h"
+#include "asn1_locl.h"
+
+int i2d_ASN1_OBJECT(const ASN1_OBJECT *a, unsigned char **pp)
+{
+ unsigned char *p;
+ int objsize;
+
+ if ((a == NULL) || (a->data == NULL))
+ return (0);
+
+ objsize = ASN1_object_size(0, a->length, V_ASN1_OBJECT);
+ if (pp == NULL || objsize == -1)
+ return objsize;
+
+ p = *pp;
+ ASN1_put_object(&p, 0, a->length, V_ASN1_OBJECT, V_ASN1_UNIVERSAL);
+ memcpy(p, a->data, a->length);
+ p += a->length;
+
+ *pp = p;
+ return (objsize);
+}
+
+int a2d_ASN1_OBJECT(unsigned char *out, int olen, const char *buf, int num)
+{
+ int i, first, len = 0, c, use_bn;
+ char ftmp[24], *tmp = ftmp;
+ int tmpsize = sizeof(ftmp);
+ const char *p;
+ unsigned long l;
+ BIGNUM *bl = NULL;
+
+ if (num == 0)
+ return (0);
+ else if (num == -1)
+ num = strlen(buf);
+
+ p = buf;
+ c = *(p++);
+ num--;
+ if ((c >= '0') && (c <= '2')) {
+ first = c - '0';
+ } else {
+ ASN1err(ASN1_F_A2D_ASN1_OBJECT, ASN1_R_FIRST_NUM_TOO_LARGE);
+ goto err;
+ }
+
+ if (num <= 0) {
+ ASN1err(ASN1_F_A2D_ASN1_OBJECT, ASN1_R_MISSING_SECOND_NUMBER);
+ goto err;
+ }
+ c = *(p++);
+ num--;
+ for (;;) {
+ if (num <= 0)
+ break;
+ if ((c != '.') && (c != ' ')) {
+ ASN1err(ASN1_F_A2D_ASN1_OBJECT, ASN1_R_INVALID_SEPARATOR);
+ goto err;
+ }
+ l = 0;
+ use_bn = 0;
+ for (;;) {
+ if (num <= 0)
+ break;
+ num--;
+ c = *(p++);
+ if ((c == ' ') || (c == '.'))
+ break;
+ if ((c < '0') || (c > '9')) {
+ ASN1err(ASN1_F_A2D_ASN1_OBJECT, ASN1_R_INVALID_DIGIT);
+ goto err;
+ }
+ if (!use_bn && l >= ((ULONG_MAX - 80) / 10L)) {
+ use_bn = 1;
+ if (bl == NULL)
+ bl = BN_new();
+ if (bl == NULL || !BN_set_word(bl, l))
+ goto err;
+ }
+ if (use_bn) {
+ if (!BN_mul_word(bl, 10L)
+ || !BN_add_word(bl, c - '0'))
+ goto err;
+ } else
+ l = l * 10L + (long)(c - '0');
+ }
+ if (len == 0) {
+ if ((first < 2) && (l >= 40)) {
+ ASN1err(ASN1_F_A2D_ASN1_OBJECT,
+ ASN1_R_SECOND_NUMBER_TOO_LARGE);
+ goto err;
+ }
+ if (use_bn) {
+ if (!BN_add_word(bl, first * 40))
+ goto err;
+ } else
+ l += (long)first *40;
+ }
+ i = 0;
+ if (use_bn) {
+ int blsize;
+ blsize = BN_num_bits(bl);
+ blsize = (blsize + 6) / 7;
+ if (blsize > tmpsize) {
+ if (tmp != ftmp)
+ OPENSSL_free(tmp);
+ tmpsize = blsize + 32;
+ tmp = OPENSSL_malloc(tmpsize);
+ if (tmp == NULL)
+ goto err;
+ }
+ while (blsize--) {
+ BN_ULONG t = BN_div_word(bl, 0x80L);
+ if (t == (BN_ULONG)-1)
+ goto err;
+ tmp[i++] = (unsigned char)t;
+ }
+ } else {
+
+ for (;;) {
+ tmp[i++] = (unsigned char)l & 0x7f;
+ l >>= 7L;
+ if (l == 0L)
+ break;
+ }
+
+ }
+ if (out != NULL) {
+ if (len + i > olen) {
+ ASN1err(ASN1_F_A2D_ASN1_OBJECT, ASN1_R_BUFFER_TOO_SMALL);
+ goto err;
+ }
+ while (--i > 0)
+ out[len++] = tmp[i] | 0x80;
+ out[len++] = tmp[0];
+ } else
+ len += i;
+ }
+ if (tmp != ftmp)
+ OPENSSL_free(tmp);
+ BN_free(bl);
+ return (len);
+ err:
+ if (tmp != ftmp)
+ OPENSSL_free(tmp);
+ BN_free(bl);
+ return (0);
+}
+
+int i2t_ASN1_OBJECT(char *buf, int buf_len, const ASN1_OBJECT *a)
+{
+ return OBJ_obj2txt(buf, buf_len, a, 0);
+}
+
+int i2a_ASN1_OBJECT(BIO *bp, const ASN1_OBJECT *a)
+{
+ char buf[80], *p = buf;
+ int i;
+
+ if ((a == NULL) || (a->data == NULL))
+ return (BIO_write(bp, "NULL", 4));
+ i = i2t_ASN1_OBJECT(buf, sizeof(buf), a);
+ if (i > (int)(sizeof(buf) - 1)) {
+ p = OPENSSL_malloc(i + 1);
+ if (p == NULL)
+ return -1;
+ i2t_ASN1_OBJECT(p, i + 1, a);
+ }
+ if (i <= 0) {
+ i = BIO_write(bp, "<INVALID>", 9);
+ i += BIO_dump(bp, (const char *)a->data, a->length);
+ return i;
+ }
+ BIO_write(bp, p, i);
+ if (p != buf)
+ OPENSSL_free(p);
+ return (i);
+}
+
+ASN1_OBJECT *d2i_ASN1_OBJECT(ASN1_OBJECT **a, const unsigned char **pp,
+ long length)
+{
+ const unsigned char *p;
+ long len;
+ int tag, xclass;
+ int inf, i;
+ ASN1_OBJECT *ret = NULL;
+ p = *pp;
+ inf = ASN1_get_object(&p, &len, &tag, &xclass, length);
+ if (inf & 0x80) {
+ i = ASN1_R_BAD_OBJECT_HEADER;
+ goto err;
+ }
+
+ if (tag != V_ASN1_OBJECT) {
+ i = ASN1_R_EXPECTING_AN_OBJECT;
+ goto err;
+ }
+ ret = c2i_ASN1_OBJECT(a, &p, len);
+ if (ret)
+ *pp = p;
+ return ret;
+ err:
+ ASN1err(ASN1_F_D2I_ASN1_OBJECT, i);
+ return (NULL);
+}
+
+ASN1_OBJECT *c2i_ASN1_OBJECT(ASN1_OBJECT **a, const unsigned char **pp,
+ long len)
+{
+ ASN1_OBJECT *ret = NULL, tobj;
+ const unsigned char *p;
+ unsigned char *data;
+ int i, length;
+
+ /*
+ * Sanity check OID encoding. Need at least one content octet. MSB must
+ * be clear in the last octet. can't have leading 0x80 in subidentifiers,
+ * see: X.690 8.19.2
+ */
+ if (len <= 0 || len > INT_MAX || pp == NULL || (p = *pp) == NULL ||
+ p[len - 1] & 0x80) {
+ ASN1err(ASN1_F_C2I_ASN1_OBJECT, ASN1_R_INVALID_OBJECT_ENCODING);
+ return NULL;
+ }
+ /* Now 0 < len <= INT_MAX, so the cast is safe. */
+ length = (int)len;
+ /*
+ * Try to lookup OID in table: these are all valid encodings so if we get
+ * a match we know the OID is valid.
+ */
+ tobj.nid = NID_undef;
+ tobj.data = p;
+ tobj.length = length;
+ tobj.flags = 0;
+ i = OBJ_obj2nid(&tobj);
+ if (i != NID_undef) {
+ /*
+ * Return shared registered OID object: this improves efficiency
+ * because we don't have to return a dynamically allocated OID
+ * and NID lookups can use the cached value.
+ */
+ ret = OBJ_nid2obj(i);
+ if (a) {
+ ASN1_OBJECT_free(*a);
+ *a = ret;
+ }
+ *pp += len;
+ return ret;
+ }
+ for (i = 0; i < length; i++, p++) {
+ if (*p == 0x80 && (!i || !(p[-1] & 0x80))) {
+ ASN1err(ASN1_F_C2I_ASN1_OBJECT, ASN1_R_INVALID_OBJECT_ENCODING);
+ return NULL;
+ }
+ }
+
+ /*
+ * only the ASN1_OBJECTs from the 'table' will have values for ->sn or
+ * ->ln
+ */
+ if ((a == NULL) || ((*a) == NULL) ||
+ !((*a)->flags & ASN1_OBJECT_FLAG_DYNAMIC)) {
+ if ((ret = ASN1_OBJECT_new()) == NULL)
+ return (NULL);
+ } else
+ ret = (*a);
+
+ p = *pp;
+ /* detach data from object */
+ data = (unsigned char *)ret->data;
+ ret->data = NULL;
+ /* once detached we can change it */
+ if ((data == NULL) || (ret->length < length)) {
+ ret->length = 0;
+ OPENSSL_free(data);
+ data = OPENSSL_malloc(length);
+ if (data == NULL) {
+ i = ERR_R_MALLOC_FAILURE;
+ goto err;
+ }
+ ret->flags |= ASN1_OBJECT_FLAG_DYNAMIC_DATA;
+ }
+ memcpy(data, p, length);
+ /* reattach data to object, after which it remains const */
+ ret->data = data;
+ ret->length = length;
+ ret->sn = NULL;
+ ret->ln = NULL;
+ /* ret->flags=ASN1_OBJECT_FLAG_DYNAMIC; we know it is dynamic */
+ p += length;
+
+ if (a != NULL)
+ (*a) = ret;
+ *pp = p;
+ return (ret);
+ err:
+ ASN1err(ASN1_F_C2I_ASN1_OBJECT, i);
+ if ((a == NULL) || (*a != ret))
+ ASN1_OBJECT_free(ret);
+ return (NULL);
+}
+
+ASN1_OBJECT *ASN1_OBJECT_new(void)
+{
+ ASN1_OBJECT *ret;
+
+ ret = OPENSSL_zalloc(sizeof(*ret));
+ if (ret == NULL) {
+ ASN1err(ASN1_F_ASN1_OBJECT_NEW, ERR_R_MALLOC_FAILURE);
+ return (NULL);
+ }
+ ret->flags = ASN1_OBJECT_FLAG_DYNAMIC;
+ return (ret);
+}
+
+void ASN1_OBJECT_free(ASN1_OBJECT *a)
+{
+ if (a == NULL)
+ return;
+ if (a->flags & ASN1_OBJECT_FLAG_DYNAMIC_STRINGS) {
+#ifndef CONST_STRICT /* disable purely for compile-time strict
+ * const checking. Doing this on a "real"
+ * compile will cause memory leaks */
+ OPENSSL_free((void*)a->sn);
+ OPENSSL_free((void*)a->ln);
+#endif
+ a->sn = a->ln = NULL;
+ }
+ if (a->flags & ASN1_OBJECT_FLAG_DYNAMIC_DATA) {
+ OPENSSL_free((void*)a->data);
+ a->data = NULL;
+ a->length = 0;
+ }
+ if (a->flags & ASN1_OBJECT_FLAG_DYNAMIC)
+ OPENSSL_free(a);
+}
+
+ASN1_OBJECT *ASN1_OBJECT_create(int nid, unsigned char *data, int len,
+ const char *sn, const char *ln)
+{
+ ASN1_OBJECT o;
+
+ o.sn = sn;
+ o.ln = ln;
+ o.data = data;
+ o.nid = nid;
+ o.length = len;
+ o.flags = ASN1_OBJECT_FLAG_DYNAMIC | ASN1_OBJECT_FLAG_DYNAMIC_STRINGS |
+ ASN1_OBJECT_FLAG_DYNAMIC_DATA;
+ return (OBJ_dup(&o));
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_octet.c b/openssl-1.1.0h/crypto/asn1/a_octet.c
new file mode 100644
index 0000000..2e1205c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_octet.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+
+ASN1_OCTET_STRING *ASN1_OCTET_STRING_dup(const ASN1_OCTET_STRING *x)
+{
+ return ASN1_STRING_dup(x);
+}
+
+int ASN1_OCTET_STRING_cmp(const ASN1_OCTET_STRING *a,
+ const ASN1_OCTET_STRING *b)
+{
+ return ASN1_STRING_cmp(a, b);
+}
+
+int ASN1_OCTET_STRING_set(ASN1_OCTET_STRING *x, const unsigned char *d,
+ int len)
+{
+ return ASN1_STRING_set(x, d, len);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_print.c b/openssl-1.1.0h/crypto/asn1/a_print.c
new file mode 100644
index 0000000..1aafe7c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_print.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+
+int ASN1_PRINTABLE_type(const unsigned char *s, int len)
+{
+ int c;
+ int ia5 = 0;
+ int t61 = 0;
+
+ if (len <= 0)
+ len = -1;
+ if (s == NULL)
+ return (V_ASN1_PRINTABLESTRING);
+
+ while ((*s) && (len-- != 0)) {
+ c = *(s++);
+#ifndef CHARSET_EBCDIC
+ if (!(((c >= 'a') && (c <= 'z')) ||
+ ((c >= 'A') && (c <= 'Z')) ||
+ ((c >= '0') && (c <= '9')) ||
+ (c == ' ') || (c == '\'') ||
+ (c == '(') || (c == ')') ||
+ (c == '+') || (c == ',') ||
+ (c == '-') || (c == '.') ||
+ (c == '/') || (c == ':') || (c == '=') || (c == '?')))
+ ia5 = 1;
+ if (c & 0x80)
+ t61 = 1;
+#else
+ if (!isalnum(c) && (c != ' ') && strchr("'()+,-./:=?", c) == NULL)
+ ia5 = 1;
+ if (os_toascii[c] & 0x80)
+ t61 = 1;
+#endif
+ }
+ if (t61)
+ return (V_ASN1_T61STRING);
+ if (ia5)
+ return (V_ASN1_IA5STRING);
+ return (V_ASN1_PRINTABLESTRING);
+}
+
+int ASN1_UNIVERSALSTRING_to_string(ASN1_UNIVERSALSTRING *s)
+{
+ int i;
+ unsigned char *p;
+
+ if (s->type != V_ASN1_UNIVERSALSTRING)
+ return (0);
+ if ((s->length % 4) != 0)
+ return (0);
+ p = s->data;
+ for (i = 0; i < s->length; i += 4) {
+ if ((p[0] != '\0') || (p[1] != '\0') || (p[2] != '\0'))
+ break;
+ else
+ p += 4;
+ }
+ if (i < s->length)
+ return (0);
+ p = s->data;
+ for (i = 3; i < s->length; i += 4) {
+ *(p++) = s->data[i];
+ }
+ *(p) = '\0';
+ s->length /= 4;
+ s->type = ASN1_PRINTABLE_type(s->data, s->length);
+ return (1);
+}
+
+int ASN1_STRING_print(BIO *bp, const ASN1_STRING *v)
+{
+ int i, n;
+ char buf[80];
+ const char *p;
+
+ if (v == NULL)
+ return (0);
+ n = 0;
+ p = (const char *)v->data;
+ for (i = 0; i < v->length; i++) {
+ if ((p[i] > '~') || ((p[i] < ' ') &&
+ (p[i] != '\n') && (p[i] != '\r')))
+ buf[n] = '.';
+ else
+ buf[n] = p[i];
+ n++;
+ if (n >= 80) {
+ if (BIO_write(bp, buf, n) <= 0)
+ return (0);
+ n = 0;
+ }
+ }
+ if (n > 0)
+ if (BIO_write(bp, buf, n) <= 0)
+ return (0);
+ return (1);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_sign.c b/openssl-1.1.0h/crypto/asn1/a_sign.c
new file mode 100644
index 0000000..3b261eb
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_sign.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include <sys/types.h>
+
+#include "internal/cryptlib.h"
+
+#include <openssl/bn.h>
+#include <openssl/evp.h>
+#include <openssl/x509.h>
+#include <openssl/objects.h>
+#include <openssl/buffer.h>
+#include "internal/asn1_int.h"
+#include "internal/evp_int.h"
+
+#ifndef NO_ASN1_OLD
+
+int ASN1_sign(i2d_of_void *i2d, X509_ALGOR *algor1, X509_ALGOR *algor2,
+ ASN1_BIT_STRING *signature, char *data, EVP_PKEY *pkey,
+ const EVP_MD *type)
+{
+ EVP_MD_CTX *ctx = EVP_MD_CTX_new();
+ unsigned char *p, *buf_in = NULL, *buf_out = NULL;
+ int i, inl = 0, outl = 0, outll = 0;
+ X509_ALGOR *a;
+
+ if (ctx == NULL) {
+ ASN1err(ASN1_F_ASN1_SIGN, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ for (i = 0; i < 2; i++) {
+ if (i == 0)
+ a = algor1;
+ else
+ a = algor2;
+ if (a == NULL)
+ continue;
+ if (type->pkey_type == NID_dsaWithSHA1) {
+ /*
+ * special case: RFC 2459 tells us to omit 'parameters' with
+ * id-dsa-with-sha1
+ */
+ ASN1_TYPE_free(a->parameter);
+ a->parameter = NULL;
+ } else if ((a->parameter == NULL) ||
+ (a->parameter->type != V_ASN1_NULL)) {
+ ASN1_TYPE_free(a->parameter);
+ if ((a->parameter = ASN1_TYPE_new()) == NULL)
+ goto err;
+ a->parameter->type = V_ASN1_NULL;
+ }
+ ASN1_OBJECT_free(a->algorithm);
+ a->algorithm = OBJ_nid2obj(type->pkey_type);
+ if (a->algorithm == NULL) {
+ ASN1err(ASN1_F_ASN1_SIGN, ASN1_R_UNKNOWN_OBJECT_TYPE);
+ goto err;
+ }
+ if (a->algorithm->length == 0) {
+ ASN1err(ASN1_F_ASN1_SIGN,
+ ASN1_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD);
+ goto err;
+ }
+ }
+ inl = i2d(data, NULL);
+ buf_in = OPENSSL_malloc((unsigned int)inl);
+ outll = outl = EVP_PKEY_size(pkey);
+ buf_out = OPENSSL_malloc((unsigned int)outl);
+ if ((buf_in == NULL) || (buf_out == NULL)) {
+ outl = 0;
+ ASN1err(ASN1_F_ASN1_SIGN, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ p = buf_in;
+
+ i2d(data, &p);
+ if (!EVP_SignInit_ex(ctx, type, NULL)
+ || !EVP_SignUpdate(ctx, (unsigned char *)buf_in, inl)
+ || !EVP_SignFinal(ctx, (unsigned char *)buf_out,
+ (unsigned int *)&outl, pkey)) {
+ outl = 0;
+ ASN1err(ASN1_F_ASN1_SIGN, ERR_R_EVP_LIB);
+ goto err;
+ }
+ OPENSSL_free(signature->data);
+ signature->data = buf_out;
+ buf_out = NULL;
+ signature->length = outl;
+ /*
+ * In the interests of compatibility, I'll make sure that the bit string
+ * has a 'not-used bits' value of 0
+ */
+ signature->flags &= ~(ASN1_STRING_FLAG_BITS_LEFT | 0x07);
+ signature->flags |= ASN1_STRING_FLAG_BITS_LEFT;
+ err:
+ EVP_MD_CTX_free(ctx);
+ OPENSSL_clear_free((char *)buf_in, (unsigned int)inl);
+ OPENSSL_clear_free((char *)buf_out, outll);
+ return (outl);
+}
+
+#endif
+
+int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1,
+ X509_ALGOR *algor2, ASN1_BIT_STRING *signature, void *asn,
+ EVP_PKEY *pkey, const EVP_MD *type)
+{
+ int rv;
+ EVP_MD_CTX *ctx = EVP_MD_CTX_new();
+
+ if (ctx == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ if (!EVP_DigestSignInit(ctx, NULL, type, NULL, pkey)) {
+ EVP_MD_CTX_free(ctx);
+ return 0;
+ }
+
+ rv = ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, ctx);
+
+ EVP_MD_CTX_free(ctx);
+ return rv;
+}
+
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+ X509_ALGOR *algor1, X509_ALGOR *algor2,
+ ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx)
+{
+ const EVP_MD *type;
+ EVP_PKEY *pkey;
+ unsigned char *buf_in = NULL, *buf_out = NULL;
+ size_t inl = 0, outl = 0, outll = 0;
+ int signid, paramtype;
+ int rv;
+
+ type = EVP_MD_CTX_md(ctx);
+ pkey = EVP_PKEY_CTX_get0_pkey(EVP_MD_CTX_pkey_ctx(ctx));
+
+ if (type == NULL || pkey == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ASN1_R_CONTEXT_NOT_INITIALISED);
+ goto err;
+ }
+
+ if (pkey->ameth == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
+ goto err;
+ }
+
+ if (pkey->ameth->item_sign) {
+ rv = pkey->ameth->item_sign(ctx, it, asn, algor1, algor2, signature);
+ if (rv == 1)
+ outl = signature->length;
+ /*-
+ * Return value meanings:
+ * <=0: error.
+ * 1: method does everything.
+ * 2: carry on as normal.
+ * 3: ASN1 method sets algorithm identifiers: just sign.
+ */
+ if (rv <= 0)
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ERR_R_EVP_LIB);
+ if (rv <= 1)
+ goto err;
+ } else
+ rv = 2;
+
+ if (rv == 2) {
+ if (!OBJ_find_sigid_by_algs(&signid,
+ EVP_MD_nid(type),
+ pkey->ameth->pkey_id)) {
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,
+ ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
+ goto err;
+ }
+
+ if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
+ paramtype = V_ASN1_NULL;
+ else
+ paramtype = V_ASN1_UNDEF;
+
+ if (algor1)
+ X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
+ if (algor2)
+ X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+
+ }
+
+ inl = ASN1_item_i2d(asn, &buf_in, it);
+ outll = outl = EVP_PKEY_size(pkey);
+ buf_out = OPENSSL_malloc((unsigned int)outl);
+ if ((buf_in == NULL) || (buf_out == NULL)) {
+ outl = 0;
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (!EVP_DigestSignUpdate(ctx, buf_in, inl)
+ || !EVP_DigestSignFinal(ctx, buf_out, &outl)) {
+ outl = 0;
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ERR_R_EVP_LIB);
+ goto err;
+ }
+ OPENSSL_free(signature->data);
+ signature->data = buf_out;
+ buf_out = NULL;
+ signature->length = outl;
+ /*
+ * In the interests of compatibility, I'll make sure that the bit string
+ * has a 'not-used bits' value of 0
+ */
+ signature->flags &= ~(ASN1_STRING_FLAG_BITS_LEFT | 0x07);
+ signature->flags |= ASN1_STRING_FLAG_BITS_LEFT;
+ err:
+ OPENSSL_clear_free((char *)buf_in, (unsigned int)inl);
+ OPENSSL_clear_free((char *)buf_out, outll);
+ return (outl);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_strex.c b/openssl-1.1.0h/crypto/asn1/a_strex.c
new file mode 100644
index 0000000..b91266b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_strex.c
@@ -0,0 +1,644 @@
+/*
+ * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "internal/cryptlib.h"
+#include "internal/asn1_int.h"
+#include <openssl/crypto.h>
+#include <openssl/x509.h>
+#include <openssl/asn1.h>
+
+#include "charmap.h"
+
+/*
+ * ASN1_STRING_print_ex() and X509_NAME_print_ex(). Enhanced string and name
+ * printing routines handling multibyte characters, RFC2253 and a host of
+ * other options.
+ */
+
+#define CHARTYPE_BS_ESC (ASN1_STRFLGS_ESC_2253 | CHARTYPE_FIRST_ESC_2253 | CHARTYPE_LAST_ESC_2253)
+
+#define ESC_FLAGS (ASN1_STRFLGS_ESC_2253 | \
+ ASN1_STRFLGS_ESC_2254 | \
+ ASN1_STRFLGS_ESC_QUOTE | \
+ ASN1_STRFLGS_ESC_CTRL | \
+ ASN1_STRFLGS_ESC_MSB)
+
+/*
+ * Three IO functions for sending data to memory, a BIO and and a FILE
+ * pointer.
+ */
+static int send_bio_chars(void *arg, const void *buf, int len)
+{
+ if (!arg)
+ return 1;
+ if (BIO_write(arg, buf, len) != len)
+ return 0;
+ return 1;
+}
+
+#ifndef OPENSSL_NO_STDIO
+static int send_fp_chars(void *arg, const void *buf, int len)
+{
+ if (!arg)
+ return 1;
+ if (fwrite(buf, 1, len, arg) != (unsigned int)len)
+ return 0;
+ return 1;
+}
+#endif
+
+typedef int char_io (void *arg, const void *buf, int len);
+
+/*
+ * This function handles display of strings, one character at a time. It is
+ * passed an unsigned long for each character because it could come from 2 or
+ * even 4 byte forms.
+ */
+
+static int do_esc_char(unsigned long c, unsigned short flags, char *do_quotes,
+ char_io *io_ch, void *arg)
+{
+ unsigned short chflgs;
+ unsigned char chtmp;
+ char tmphex[HEX_SIZE(long) + 3];
+
+ if (c > 0xffffffffL)
+ return -1;
+ if (c > 0xffff) {
+ BIO_snprintf(tmphex, sizeof(tmphex), "\\W%08lX", c);
+ if (!io_ch(arg, tmphex, 10))
+ return -1;
+ return 10;
+ }
+ if (c > 0xff) {
+ BIO_snprintf(tmphex, sizeof(tmphex), "\\U%04lX", c);
+ if (!io_ch(arg, tmphex, 6))
+ return -1;
+ return 6;
+ }
+ chtmp = (unsigned char)c;
+ if (chtmp > 0x7f)
+ chflgs = flags & ASN1_STRFLGS_ESC_MSB;
+ else
+ chflgs = char_type[chtmp] & flags;
+ if (chflgs & CHARTYPE_BS_ESC) {
+ /* If we don't escape with quotes, signal we need quotes */
+ if (chflgs & ASN1_STRFLGS_ESC_QUOTE) {
+ if (do_quotes)
+ *do_quotes = 1;
+ if (!io_ch(arg, &chtmp, 1))
+ return -1;
+ return 1;
+ }
+ if (!io_ch(arg, "\\", 1))
+ return -1;
+ if (!io_ch(arg, &chtmp, 1))
+ return -1;
+ return 2;
+ }
+ if (chflgs & (ASN1_STRFLGS_ESC_CTRL
+ | ASN1_STRFLGS_ESC_MSB
+ | ASN1_STRFLGS_ESC_2254)) {
+ BIO_snprintf(tmphex, 11, "\\%02X", chtmp);
+ if (!io_ch(arg, tmphex, 3))
+ return -1;
+ return 3;
+ }
+ /*
+ * If we get this far and do any escaping at all must escape the escape
+ * character itself: backslash.
+ */
+ if (chtmp == '\\' && (flags & ESC_FLAGS)) {
+ if (!io_ch(arg, "\\\\", 2))
+ return -1;
+ return 2;
+ }
+ if (!io_ch(arg, &chtmp, 1))
+ return -1;
+ return 1;
+}
+
+#define BUF_TYPE_WIDTH_MASK 0x7
+#define BUF_TYPE_CONVUTF8 0x8
+
+/*
+ * This function sends each character in a buffer to do_esc_char(). It
+ * interprets the content formats and converts to or from UTF8 as
+ * appropriate.
+ */
+
+static int do_buf(unsigned char *buf, int buflen,
+ int type, unsigned short flags, char *quotes, char_io *io_ch,
+ void *arg)
+{
+ int i, outlen, len;
+ unsigned short orflags;
+ unsigned char *p, *q;
+ unsigned long c;
+
+ p = buf;
+ q = buf + buflen;
+ outlen = 0;
+ while (p != q) {
+ if (p == buf && flags & ASN1_STRFLGS_ESC_2253)
+ orflags = CHARTYPE_FIRST_ESC_2253;
+ else
+ orflags = 0;
+ switch (type & BUF_TYPE_WIDTH_MASK) {
+ case 4:
+ c = ((unsigned long)*p++) << 24;
+ c |= ((unsigned long)*p++) << 16;
+ c |= ((unsigned long)*p++) << 8;
+ c |= *p++;
+ break;
+
+ case 2:
+ c = ((unsigned long)*p++) << 8;
+ c |= *p++;
+ break;
+
+ case 1:
+ c = *p++;
+ break;
+
+ case 0:
+ i = UTF8_getc(p, buflen, &c);
+ if (i < 0)
+ return -1; /* Invalid UTF8String */
+ p += i;
+ break;
+ default:
+ return -1; /* invalid width */
+ }
+ if (p == q && flags & ASN1_STRFLGS_ESC_2253)
+ orflags = CHARTYPE_LAST_ESC_2253;
+ if (type & BUF_TYPE_CONVUTF8) {
+ unsigned char utfbuf[6];
+ int utflen;
+ utflen = UTF8_putc(utfbuf, sizeof(utfbuf), c);
+ for (i = 0; i < utflen; i++) {
+ /*
+ * We don't need to worry about setting orflags correctly
+ * because if utflen==1 its value will be correct anyway
+ * otherwise each character will be > 0x7f and so the
+ * character will never be escaped on first and last.
+ */
+ len = do_esc_char(utfbuf[i], flags | orflags, quotes,
+ io_ch, arg);
+ if (len < 0)
+ return -1;
+ outlen += len;
+ }
+ } else {
+ len = do_esc_char(c, flags | orflags, quotes,
+ io_ch, arg);
+ if (len < 0)
+ return -1;
+ outlen += len;
+ }
+ }
+ return outlen;
+}
+
+/* This function hex dumps a buffer of characters */
+
+static int do_hex_dump(char_io *io_ch, void *arg, unsigned char *buf,
+ int buflen)
+{
+ static const char hexdig[] = "0123456789ABCDEF";
+ unsigned char *p, *q;
+ char hextmp[2];
+ if (arg) {
+ p = buf;
+ q = buf + buflen;
+ while (p != q) {
+ hextmp[0] = hexdig[*p >> 4];
+ hextmp[1] = hexdig[*p & 0xf];
+ if (!io_ch(arg, hextmp, 2))
+ return -1;
+ p++;
+ }
+ }
+ return buflen << 1;
+}
+
+/*
+ * "dump" a string. This is done when the type is unknown, or the flags
+ * request it. We can either dump the content octets or the entire DER
+ * encoding. This uses the RFC2253 #01234 format.
+ */
+
+static int do_dump(unsigned long lflags, char_io *io_ch, void *arg,
+ const ASN1_STRING *str)
+{
+ /*
+ * Placing the ASN1_STRING in a temp ASN1_TYPE allows the DER encoding to
+ * readily obtained
+ */
+ ASN1_TYPE t;
+ unsigned char *der_buf, *p;
+ int outlen, der_len;
+
+ if (!io_ch(arg, "#", 1))
+ return -1;
+ /* If we don't dump DER encoding just dump content octets */
+ if (!(lflags & ASN1_STRFLGS_DUMP_DER)) {
+ outlen = do_hex_dump(io_ch, arg, str->data, str->length);
+ if (outlen < 0)
+ return -1;
+ return outlen + 1;
+ }
+ t.type = str->type;
+ t.value.ptr = (char *)str;
+ der_len = i2d_ASN1_TYPE(&t, NULL);
+ der_buf = OPENSSL_malloc(der_len);
+ if (der_buf == NULL)
+ return -1;
+ p = der_buf;
+ i2d_ASN1_TYPE(&t, &p);
+ outlen = do_hex_dump(io_ch, arg, der_buf, der_len);
+ OPENSSL_free(der_buf);
+ if (outlen < 0)
+ return -1;
+ return outlen + 1;
+}
+
+/*
+ * Lookup table to convert tags to character widths, 0 = UTF8 encoded, -1 is
+ * used for non string types otherwise it is the number of bytes per
+ * character
+ */
+
+static const signed char tag2nbyte[] = {
+ -1, -1, -1, -1, -1, /* 0-4 */
+ -1, -1, -1, -1, -1, /* 5-9 */
+ -1, -1, 0, -1, /* 10-13 */
+ -1, -1, -1, -1, /* 15-17 */
+ 1, 1, 1, /* 18-20 */
+ -1, 1, 1, 1, /* 21-24 */
+ -1, 1, -1, /* 25-27 */
+ 4, -1, 2 /* 28-30 */
+};
+
+/*
+ * This is the main function, print out an ASN1_STRING taking note of various
+ * escape and display options. Returns number of characters written or -1 if
+ * an error occurred.
+ */
+
+static int do_print_ex(char_io *io_ch, void *arg, unsigned long lflags,
+ const ASN1_STRING *str)
+{
+ int outlen, len;
+ int type;
+ char quotes;
+ unsigned short flags;
+ quotes = 0;
+ /* Keep a copy of escape flags */
+ flags = (unsigned short)(lflags & ESC_FLAGS);
+
+ type = str->type;
+
+ outlen = 0;
+
+ if (lflags & ASN1_STRFLGS_SHOW_TYPE) {
+ const char *tagname;
+ tagname = ASN1_tag2str(type);
+ outlen += strlen(tagname);
+ if (!io_ch(arg, tagname, outlen) || !io_ch(arg, ":", 1))
+ return -1;
+ outlen++;
+ }
+
+ /* Decide what to do with type, either dump content or display it */
+
+ /* Dump everything */
+ if (lflags & ASN1_STRFLGS_DUMP_ALL)
+ type = -1;
+ /* Ignore the string type */
+ else if (lflags & ASN1_STRFLGS_IGNORE_TYPE)
+ type = 1;
+ else {
+ /* Else determine width based on type */
+ if ((type > 0) && (type < 31))
+ type = tag2nbyte[type];
+ else
+ type = -1;
+ if ((type == -1) && !(lflags & ASN1_STRFLGS_DUMP_UNKNOWN))
+ type = 1;
+ }
+
+ if (type == -1) {
+ len = do_dump(lflags, io_ch, arg, str);
+ if (len < 0)
+ return -1;
+ outlen += len;
+ return outlen;
+ }
+
+ if (lflags & ASN1_STRFLGS_UTF8_CONVERT) {
+ /*
+ * Note: if string is UTF8 and we want to convert to UTF8 then we
+ * just interpret it as 1 byte per character to avoid converting
+ * twice.
+ */
+ if (!type)
+ type = 1;
+ else
+ type |= BUF_TYPE_CONVUTF8;
+ }
+
+ len = do_buf(str->data, str->length, type, flags, &quotes, io_ch, NULL);
+ if (len < 0)
+ return -1;
+ outlen += len;
+ if (quotes)
+ outlen += 2;
+ if (!arg)
+ return outlen;
+ if (quotes && !io_ch(arg, "\"", 1))
+ return -1;
+ if (do_buf(str->data, str->length, type, flags, NULL, io_ch, arg) < 0)
+ return -1;
+ if (quotes && !io_ch(arg, "\"", 1))
+ return -1;
+ return outlen;
+}
+
+/* Used for line indenting: print 'indent' spaces */
+
+static int do_indent(char_io *io_ch, void *arg, int indent)
+{
+ int i;
+ for (i = 0; i < indent; i++)
+ if (!io_ch(arg, " ", 1))
+ return 0;
+ return 1;
+}
+
+#define FN_WIDTH_LN 25
+#define FN_WIDTH_SN 10
+
+static int do_name_ex(char_io *io_ch, void *arg, const X509_NAME *n,
+ int indent, unsigned long flags)
+{
+ int i, prev = -1, orflags, cnt;
+ int fn_opt, fn_nid;
+ ASN1_OBJECT *fn;
+ const ASN1_STRING *val;
+ const X509_NAME_ENTRY *ent;
+ char objtmp[80];
+ const char *objbuf;
+ int outlen, len;
+ char *sep_dn, *sep_mv, *sep_eq;
+ int sep_dn_len, sep_mv_len, sep_eq_len;
+ if (indent < 0)
+ indent = 0;
+ outlen = indent;
+ if (!do_indent(io_ch, arg, indent))
+ return -1;
+ switch (flags & XN_FLAG_SEP_MASK) {
+ case XN_FLAG_SEP_MULTILINE:
+ sep_dn = "\n";
+ sep_dn_len = 1;
+ sep_mv = " + ";
+ sep_mv_len = 3;
+ break;
+
+ case XN_FLAG_SEP_COMMA_PLUS:
+ sep_dn = ",";
+ sep_dn_len = 1;
+ sep_mv = "+";
+ sep_mv_len = 1;
+ indent = 0;
+ break;
+
+ case XN_FLAG_SEP_CPLUS_SPC:
+ sep_dn = ", ";
+ sep_dn_len = 2;
+ sep_mv = " + ";
+ sep_mv_len = 3;
+ indent = 0;
+ break;
+
+ case XN_FLAG_SEP_SPLUS_SPC:
+ sep_dn = "; ";
+ sep_dn_len = 2;
+ sep_mv = " + ";
+ sep_mv_len = 3;
+ indent = 0;
+ break;
+
+ default:
+ return -1;
+ }
+
+ if (flags & XN_FLAG_SPC_EQ) {
+ sep_eq = " = ";
+ sep_eq_len = 3;
+ } else {
+ sep_eq = "=";
+ sep_eq_len = 1;
+ }
+
+ fn_opt = flags & XN_FLAG_FN_MASK;
+
+ cnt = X509_NAME_entry_count(n);
+ for (i = 0; i < cnt; i++) {
+ if (flags & XN_FLAG_DN_REV)
+ ent = X509_NAME_get_entry(n, cnt - i - 1);
+ else
+ ent = X509_NAME_get_entry(n, i);
+ if (prev != -1) {
+ if (prev == X509_NAME_ENTRY_set(ent)) {
+ if (!io_ch(arg, sep_mv, sep_mv_len))
+ return -1;
+ outlen += sep_mv_len;
+ } else {
+ if (!io_ch(arg, sep_dn, sep_dn_len))
+ return -1;
+ outlen += sep_dn_len;
+ if (!do_indent(io_ch, arg, indent))
+ return -1;
+ outlen += indent;
+ }
+ }
+ prev = X509_NAME_ENTRY_set(ent);
+ fn = X509_NAME_ENTRY_get_object(ent);
+ val = X509_NAME_ENTRY_get_data(ent);
+ fn_nid = OBJ_obj2nid(fn);
+ if (fn_opt != XN_FLAG_FN_NONE) {
+ int objlen, fld_len;
+ if ((fn_opt == XN_FLAG_FN_OID) || (fn_nid == NID_undef)) {
+ OBJ_obj2txt(objtmp, sizeof(objtmp), fn, 1);
+ fld_len = 0; /* XXX: what should this be? */
+ objbuf = objtmp;
+ } else {
+ if (fn_opt == XN_FLAG_FN_SN) {
+ fld_len = FN_WIDTH_SN;
+ objbuf = OBJ_nid2sn(fn_nid);
+ } else if (fn_opt == XN_FLAG_FN_LN) {
+ fld_len = FN_WIDTH_LN;
+ objbuf = OBJ_nid2ln(fn_nid);
+ } else {
+ fld_len = 0; /* XXX: what should this be? */
+ objbuf = "";
+ }
+ }
+ objlen = strlen(objbuf);
+ if (!io_ch(arg, objbuf, objlen))
+ return -1;
+ if ((objlen < fld_len) && (flags & XN_FLAG_FN_ALIGN)) {
+ if (!do_indent(io_ch, arg, fld_len - objlen))
+ return -1;
+ outlen += fld_len - objlen;
+ }
+ if (!io_ch(arg, sep_eq, sep_eq_len))
+ return -1;
+ outlen += objlen + sep_eq_len;
+ }
+ /*
+ * If the field name is unknown then fix up the DER dump flag. We
+ * might want to limit this further so it will DER dump on anything
+ * other than a few 'standard' fields.
+ */
+ if ((fn_nid == NID_undef) && (flags & XN_FLAG_DUMP_UNKNOWN_FIELDS))
+ orflags = ASN1_STRFLGS_DUMP_ALL;
+ else
+ orflags = 0;
+
+ len = do_print_ex(io_ch, arg, flags | orflags, val);
+ if (len < 0)
+ return -1;
+ outlen += len;
+ }
+ return outlen;
+}
+
+/* Wrappers round the main functions */
+
+int X509_NAME_print_ex(BIO *out, const X509_NAME *nm, int indent,
+ unsigned long flags)
+{
+ if (flags == XN_FLAG_COMPAT)
+ return X509_NAME_print(out, nm, indent);
+ return do_name_ex(send_bio_chars, out, nm, indent, flags);
+}
+
+#ifndef OPENSSL_NO_STDIO
+int X509_NAME_print_ex_fp(FILE *fp, const X509_NAME *nm, int indent,
+ unsigned long flags)
+{
+ if (flags == XN_FLAG_COMPAT) {
+ BIO *btmp;
+ int ret;
+ btmp = BIO_new_fp(fp, BIO_NOCLOSE);
+ if (!btmp)
+ return -1;
+ ret = X509_NAME_print(btmp, nm, indent);
+ BIO_free(btmp);
+ return ret;
+ }
+ return do_name_ex(send_fp_chars, fp, nm, indent, flags);
+}
+#endif
+
+int ASN1_STRING_print_ex(BIO *out, const ASN1_STRING *str, unsigned long flags)
+{
+ return do_print_ex(send_bio_chars, out, flags, str);
+}
+
+#ifndef OPENSSL_NO_STDIO
+int ASN1_STRING_print_ex_fp(FILE *fp, const ASN1_STRING *str, unsigned long flags)
+{
+ return do_print_ex(send_fp_chars, fp, flags, str);
+}
+#endif
+
+/*
+ * Utility function: convert any string type to UTF8, returns number of bytes
+ * in output string or a negative error code
+ */
+
+int ASN1_STRING_to_UTF8(unsigned char **out, const ASN1_STRING *in)
+{
+ ASN1_STRING stmp, *str = &stmp;
+ int mbflag, type, ret;
+ if (!in)
+ return -1;
+ type = in->type;
+ if ((type < 0) || (type > 30))
+ return -1;
+ mbflag = tag2nbyte[type];
+ if (mbflag == -1)
+ return -1;
+ mbflag |= MBSTRING_FLAG;
+ stmp.data = NULL;
+ stmp.length = 0;
+ stmp.flags = 0;
+ ret =
+ ASN1_mbstring_copy(&str, in->data, in->length, mbflag,
+ B_ASN1_UTF8STRING);
+ if (ret < 0)
+ return ret;
+ *out = stmp.data;
+ return stmp.length;
+}
+
+/* Return 1 if host is a valid hostname and 0 otherwise */
+int asn1_valid_host(const ASN1_STRING *host)
+{
+ int hostlen = host->length;
+ const unsigned char *hostptr = host->data;
+ int type = host->type;
+ int i;
+ signed char width = -1;
+ unsigned short chflags = 0, prevchflags;
+
+ if (type > 0 && type < 31)
+ width = tag2nbyte[type];
+ if (width == -1 || hostlen == 0)
+ return 0;
+ /* Treat UTF8String as width 1 as any MSB set is invalid */
+ if (width == 0)
+ width = 1;
+ for (i = 0 ; i < hostlen; i+= width) {
+ prevchflags = chflags;
+ /* Value must be <= 0x7F: check upper bytes are all zeroes */
+ if (width == 4) {
+ if (*hostptr++ != 0 || *hostptr++ != 0 || *hostptr++ != 0)
+ return 0;
+ } else if (width == 2) {
+ if (*hostptr++ != 0)
+ return 0;
+ }
+ if (*hostptr > 0x7f)
+ return 0;
+ chflags = char_type[*hostptr++];
+ if (!(chflags & (CHARTYPE_HOST_ANY | CHARTYPE_HOST_WILD))) {
+ /* Nothing else allowed at start or end of string */
+ if (i == 0 || i == hostlen - 1)
+ return 0;
+ /* Otherwise invalid if not dot or hyphen */
+ if (!(chflags & (CHARTYPE_HOST_DOT | CHARTYPE_HOST_HYPHEN)))
+ return 0;
+ /*
+ * If previous is dot or hyphen then illegal unless both
+ * are hyphens: as .- -. .. are all illegal
+ */
+ if (prevchflags & (CHARTYPE_HOST_DOT | CHARTYPE_HOST_HYPHEN)
+ && ((prevchflags & CHARTYPE_HOST_DOT)
+ || (chflags & CHARTYPE_HOST_DOT)))
+ return 0;
+ }
+ }
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_strnid.c b/openssl-1.1.0h/crypto/asn1/a_strnid.c
new file mode 100644
index 0000000..ecf178e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_strnid.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright 1999-2017 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+#include <openssl/objects.h>
+
+static STACK_OF(ASN1_STRING_TABLE) *stable = NULL;
+static void st_free(ASN1_STRING_TABLE *tbl);
+static int sk_table_cmp(const ASN1_STRING_TABLE *const *a,
+ const ASN1_STRING_TABLE *const *b);
+
+/*
+ * This is the global mask for the mbstring functions: this is use to mask
+ * out certain types (such as BMPString and UTF8String) because certain
+ * software (e.g. Netscape) has problems with them.
+ */
+
+static unsigned long global_mask = B_ASN1_UTF8STRING;
+
+void ASN1_STRING_set_default_mask(unsigned long mask)
+{
+ global_mask = mask;
+}
+
+unsigned long ASN1_STRING_get_default_mask(void)
+{
+ return global_mask;
+}
+
+/*-
+ * This function sets the default to various "flavours" of configuration.
+ * based on an ASCII string. Currently this is:
+ * MASK:XXXX : a numerical mask value.
+ * nobmp : Don't use BMPStrings (just Printable, T61).
+ * pkix : PKIX recommendation in RFC2459.
+ * utf8only : only use UTF8Strings (RFC2459 recommendation for 2004).
+ * default: the default value, Printable, T61, BMP.
+ */
+
+int ASN1_STRING_set_default_mask_asc(const char *p)
+{
+ unsigned long mask;
+ char *end;
+ if (strncmp(p, "MASK:", 5) == 0) {
+ if (!p[5])
+ return 0;
+ mask = strtoul(p + 5, &end, 0);
+ if (*end)
+ return 0;
+ } else if (strcmp(p, "nombstr") == 0)
+ mask = ~((unsigned long)(B_ASN1_BMPSTRING | B_ASN1_UTF8STRING));
+ else if (strcmp(p, "pkix") == 0)
+ mask = ~((unsigned long)B_ASN1_T61STRING);
+ else if (strcmp(p, "utf8only") == 0)
+ mask = B_ASN1_UTF8STRING;
+ else if (strcmp(p, "default") == 0)
+ mask = 0xFFFFFFFFL;
+ else
+ return 0;
+ ASN1_STRING_set_default_mask(mask);
+ return 1;
+}
+
+/*
+ * The following function generates an ASN1_STRING based on limits in a
+ * table. Frequently the types and length of an ASN1_STRING are restricted by
+ * a corresponding OID. For example certificates and certificate requests.
+ */
+
+ASN1_STRING *ASN1_STRING_set_by_NID(ASN1_STRING **out,
+ const unsigned char *in, int inlen,
+ int inform, int nid)
+{
+ ASN1_STRING_TABLE *tbl;
+ ASN1_STRING *str = NULL;
+ unsigned long mask;
+ int ret;
+ if (!out)
+ out = &str;
+ tbl = ASN1_STRING_TABLE_get(nid);
+ if (tbl) {
+ mask = tbl->mask;
+ if (!(tbl->flags & STABLE_NO_MASK))
+ mask &= global_mask;
+ ret = ASN1_mbstring_ncopy(out, in, inlen, inform, mask,
+ tbl->minsize, tbl->maxsize);
+ } else
+ ret =
+ ASN1_mbstring_copy(out, in, inlen, inform,
+ DIRSTRING_TYPE & global_mask);
+ if (ret <= 0)
+ return NULL;
+ return *out;
+}
+
+/*
+ * Now the tables and helper functions for the string table:
+ */
+
+/* size limits: this stuff is taken straight from RFC3280 */
+
+#define ub_name 32768
+#define ub_common_name 64
+#define ub_locality_name 128
+#define ub_state_name 128
+#define ub_organization_name 64
+#define ub_organization_unit_name 64
+#define ub_title 64
+#define ub_email_address 128
+#define ub_serial_number 64
+
+/* From RFC4524 */
+
+#define ub_rfc822_mailbox 256
+
+/* This table must be kept in NID order */
+
+static const ASN1_STRING_TABLE tbl_standard[] = {
+ {NID_commonName, 1, ub_common_name, DIRSTRING_TYPE, 0},
+ {NID_countryName, 2, 2, B_ASN1_PRINTABLESTRING, STABLE_NO_MASK},
+ {NID_localityName, 1, ub_locality_name, DIRSTRING_TYPE, 0},
+ {NID_stateOrProvinceName, 1, ub_state_name, DIRSTRING_TYPE, 0},
+ {NID_organizationName, 1, ub_organization_name, DIRSTRING_TYPE, 0},
+ {NID_organizationalUnitName, 1, ub_organization_unit_name, DIRSTRING_TYPE,
+ 0},
+ {NID_pkcs9_emailAddress, 1, ub_email_address, B_ASN1_IA5STRING,
+ STABLE_NO_MASK},
+ {NID_pkcs9_unstructuredName, 1, -1, PKCS9STRING_TYPE, 0},
+ {NID_pkcs9_challengePassword, 1, -1, PKCS9STRING_TYPE, 0},
+ {NID_pkcs9_unstructuredAddress, 1, -1, DIRSTRING_TYPE, 0},
+ {NID_givenName, 1, ub_name, DIRSTRING_TYPE, 0},
+ {NID_surname, 1, ub_name, DIRSTRING_TYPE, 0},
+ {NID_initials, 1, ub_name, DIRSTRING_TYPE, 0},
+ {NID_serialNumber, 1, ub_serial_number, B_ASN1_PRINTABLESTRING,
+ STABLE_NO_MASK},
+ {NID_friendlyName, -1, -1, B_ASN1_BMPSTRING, STABLE_NO_MASK},
+ {NID_name, 1, ub_name, DIRSTRING_TYPE, 0},
+ {NID_dnQualifier, -1, -1, B_ASN1_PRINTABLESTRING, STABLE_NO_MASK},
+ {NID_domainComponent, 1, -1, B_ASN1_IA5STRING, STABLE_NO_MASK},
+ {NID_ms_csp_name, -1, -1, B_ASN1_BMPSTRING, STABLE_NO_MASK},
+ {NID_rfc822Mailbox, 1, ub_rfc822_mailbox, B_ASN1_IA5STRING,
+ STABLE_NO_MASK},
+ {NID_jurisdictionCountryName, 2, 2, B_ASN1_PRINTABLESTRING, STABLE_NO_MASK},
+ {NID_INN, 1, 12, B_ASN1_NUMERICSTRING, STABLE_NO_MASK},
+ {NID_OGRN, 1, 13, B_ASN1_NUMERICSTRING, STABLE_NO_MASK},
+ {NID_SNILS, 1, 11, B_ASN1_NUMERICSTRING, STABLE_NO_MASK}
+};
+
+static int sk_table_cmp(const ASN1_STRING_TABLE *const *a,
+ const ASN1_STRING_TABLE *const *b)
+{
+ return (*a)->nid - (*b)->nid;
+}
+
+DECLARE_OBJ_BSEARCH_CMP_FN(ASN1_STRING_TABLE, ASN1_STRING_TABLE, table);
+
+static int table_cmp(const ASN1_STRING_TABLE *a, const ASN1_STRING_TABLE *b)
+{
+ return a->nid - b->nid;
+}
+
+IMPLEMENT_OBJ_BSEARCH_CMP_FN(ASN1_STRING_TABLE, ASN1_STRING_TABLE, table);
+
+ASN1_STRING_TABLE *ASN1_STRING_TABLE_get(int nid)
+{
+ int idx;
+ ASN1_STRING_TABLE fnd;
+ fnd.nid = nid;
+ if (stable) {
+ idx = sk_ASN1_STRING_TABLE_find(stable, &fnd);
+ if (idx >= 0)
+ return sk_ASN1_STRING_TABLE_value(stable, idx);
+ }
+ return OBJ_bsearch_table(&fnd, tbl_standard, OSSL_NELEM(tbl_standard));
+}
+
+/*
+ * Return a string table pointer which can be modified: either directly from
+ * table or a copy of an internal value added to the table.
+ */
+
+static ASN1_STRING_TABLE *stable_get(int nid)
+{
+ ASN1_STRING_TABLE *tmp, *rv;
+ /* Always need a string table so allocate one if NULL */
+ if (stable == NULL) {
+ stable = sk_ASN1_STRING_TABLE_new(sk_table_cmp);
+ if (stable == NULL)
+ return NULL;
+ }
+ tmp = ASN1_STRING_TABLE_get(nid);
+ if (tmp && tmp->flags & STABLE_FLAGS_MALLOC)
+ return tmp;
+ rv = OPENSSL_zalloc(sizeof(*rv));
+ if (rv == NULL)
+ return NULL;
+ if (!sk_ASN1_STRING_TABLE_push(stable, rv)) {
+ OPENSSL_free(rv);
+ return NULL;
+ }
+ if (tmp) {
+ rv->nid = tmp->nid;
+ rv->minsize = tmp->minsize;
+ rv->maxsize = tmp->maxsize;
+ rv->mask = tmp->mask;
+ rv->flags = tmp->flags | STABLE_FLAGS_MALLOC;
+ } else {
+ rv->nid = nid;
+ rv->minsize = -1;
+ rv->maxsize = -1;
+ rv->flags = STABLE_FLAGS_MALLOC;
+ }
+ return rv;
+}
+
+int ASN1_STRING_TABLE_add(int nid,
+ long minsize, long maxsize, unsigned long mask,
+ unsigned long flags)
+{
+ ASN1_STRING_TABLE *tmp;
+ tmp = stable_get(nid);
+ if (!tmp) {
+ ASN1err(ASN1_F_ASN1_STRING_TABLE_ADD, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ if (minsize >= 0)
+ tmp->minsize = minsize;
+ if (maxsize >= 0)
+ tmp->maxsize = maxsize;
+ if (mask)
+ tmp->mask = mask;
+ if (flags)
+ tmp->flags = STABLE_FLAGS_MALLOC | flags;
+ return 1;
+}
+
+void ASN1_STRING_TABLE_cleanup(void)
+{
+ STACK_OF(ASN1_STRING_TABLE) *tmp;
+ tmp = stable;
+ if (!tmp)
+ return;
+ stable = NULL;
+ sk_ASN1_STRING_TABLE_pop_free(tmp, st_free);
+}
+
+static void st_free(ASN1_STRING_TABLE *tbl)
+{
+ if (tbl->flags & STABLE_FLAGS_MALLOC)
+ OPENSSL_free(tbl);
+}
+
+
+#ifdef STRING_TABLE_TEST
+
+main()
+{
+ ASN1_STRING_TABLE *tmp;
+ int i, last_nid = -1;
+
+ for (tmp = tbl_standard, i = 0; i < OSSL_NELEM(tbl_standard); i++, tmp++) {
+ if (tmp->nid < last_nid) {
+ last_nid = 0;
+ break;
+ }
+ last_nid = tmp->nid;
+ }
+
+ if (last_nid != 0) {
+ printf("Table order OK\n");
+ exit(0);
+ }
+
+ for (tmp = tbl_standard, i = 0; i < OSSL_NELEM(tbl_standard); i++, tmp++)
+ printf("Index %d, NID %d, Name=%s\n", i, tmp->nid,
+ OBJ_nid2ln(tmp->nid));
+
+}
+
+#endif
diff --git a/openssl-1.1.0h/crypto/asn1/a_time.c b/openssl-1.1.0h/crypto/asn1/a_time.c
new file mode 100644
index 0000000..46f539c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_time.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*-
+ * This is an implementation of the ASN1 Time structure which is:
+ * Time ::= CHOICE {
+ * utcTime UTCTime,
+ * generalTime GeneralizedTime }
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1t.h>
+#include "asn1_locl.h"
+
+IMPLEMENT_ASN1_MSTRING(ASN1_TIME, B_ASN1_TIME)
+
+IMPLEMENT_ASN1_FUNCTIONS(ASN1_TIME)
+
+ASN1_TIME *ASN1_TIME_set(ASN1_TIME *s, time_t t)
+{
+ return ASN1_TIME_adj(s, t, 0, 0);
+}
+
+ASN1_TIME *ASN1_TIME_adj(ASN1_TIME *s, time_t t,
+ int offset_day, long offset_sec)
+{
+ struct tm *ts;
+ struct tm data;
+
+ ts = OPENSSL_gmtime(&t, &data);
+ if (ts == NULL) {
+ ASN1err(ASN1_F_ASN1_TIME_ADJ, ASN1_R_ERROR_GETTING_TIME);
+ return NULL;
+ }
+ if (offset_day || offset_sec) {
+ if (!OPENSSL_gmtime_adj(ts, offset_day, offset_sec))
+ return NULL;
+ }
+ if ((ts->tm_year >= 50) && (ts->tm_year < 150))
+ return ASN1_UTCTIME_adj(s, t, offset_day, offset_sec);
+ return ASN1_GENERALIZEDTIME_adj(s, t, offset_day, offset_sec);
+}
+
+int ASN1_TIME_check(const ASN1_TIME *t)
+{
+ if (t->type == V_ASN1_GENERALIZEDTIME)
+ return ASN1_GENERALIZEDTIME_check(t);
+ else if (t->type == V_ASN1_UTCTIME)
+ return ASN1_UTCTIME_check(t);
+ return 0;
+}
+
+/* Convert an ASN1_TIME structure to GeneralizedTime */
+ASN1_GENERALIZEDTIME *ASN1_TIME_to_generalizedtime(const ASN1_TIME *t,
+ ASN1_GENERALIZEDTIME **out)
+{
+ ASN1_GENERALIZEDTIME *ret = NULL;
+ char *str;
+ int newlen;
+
+ if (!ASN1_TIME_check(t))
+ return NULL;
+
+ if (out == NULL || *out == NULL) {
+ if ((ret = ASN1_GENERALIZEDTIME_new()) == NULL)
+ goto err;
+ } else
+ ret = *out;
+
+ /* If already GeneralizedTime just copy across */
+ if (t->type == V_ASN1_GENERALIZEDTIME) {
+ if (!ASN1_STRING_set(ret, t->data, t->length))
+ goto err;
+ goto done;
+ }
+
+ /* grow the string */
+ if (!ASN1_STRING_set(ret, NULL, t->length + 2))
+ goto err;
+ /* ASN1_STRING_set() allocated 'len + 1' bytes. */
+ newlen = t->length + 2 + 1;
+ str = (char *)ret->data;
+ /* Work out the century and prepend */
+ if (t->data[0] >= '5')
+ OPENSSL_strlcpy(str, "19", newlen);
+ else
+ OPENSSL_strlcpy(str, "20", newlen);
+
+ OPENSSL_strlcat(str, (const char *)t->data, newlen);
+
+ done:
+ if (out != NULL && *out == NULL)
+ *out = ret;
+ return ret;
+
+ err:
+ if (out == NULL || *out != ret)
+ ASN1_GENERALIZEDTIME_free(ret);
+ return NULL;
+}
+
+
+int ASN1_TIME_set_string(ASN1_TIME *s, const char *str)
+{
+ ASN1_TIME t;
+
+ t.length = strlen(str);
+ t.data = (unsigned char *)str;
+ t.flags = 0;
+
+ t.type = V_ASN1_UTCTIME;
+
+ if (!ASN1_TIME_check(&t)) {
+ t.type = V_ASN1_GENERALIZEDTIME;
+ if (!ASN1_TIME_check(&t))
+ return 0;
+ }
+
+ if (s && !ASN1_STRING_copy((ASN1_STRING *)s, (ASN1_STRING *)&t))
+ return 0;
+
+ return 1;
+}
+
+static int asn1_time_to_tm(struct tm *tm, const ASN1_TIME *t)
+{
+ if (t == NULL) {
+ time_t now_t;
+ time(&now_t);
+ if (OPENSSL_gmtime(&now_t, tm))
+ return 1;
+ return 0;
+ }
+
+ if (t->type == V_ASN1_UTCTIME)
+ return asn1_utctime_to_tm(tm, t);
+ else if (t->type == V_ASN1_GENERALIZEDTIME)
+ return asn1_generalizedtime_to_tm(tm, t);
+
+ return 0;
+}
+
+int ASN1_TIME_diff(int *pday, int *psec,
+ const ASN1_TIME *from, const ASN1_TIME *to)
+{
+ struct tm tm_from, tm_to;
+ if (!asn1_time_to_tm(&tm_from, from))
+ return 0;
+ if (!asn1_time_to_tm(&tm_to, to))
+ return 0;
+ return OPENSSL_gmtime_diff(pday, psec, &tm_from, &tm_to);
+}
+
+int ASN1_TIME_print(BIO *bp, const ASN1_TIME *tm)
+{
+ if (tm->type == V_ASN1_UTCTIME)
+ return ASN1_UTCTIME_print(bp, tm);
+ if (tm->type == V_ASN1_GENERALIZEDTIME)
+ return ASN1_GENERALIZEDTIME_print(bp, tm);
+ BIO_write(bp, "Bad time value", 14);
+ return (0);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_type.c b/openssl-1.1.0h/crypto/asn1/a_type.c
new file mode 100644
index 0000000..df42360
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_type.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/objects.h>
+#include "asn1_locl.h"
+
+int ASN1_TYPE_get(const ASN1_TYPE *a)
+{
+ if ((a->value.ptr != NULL) || (a->type == V_ASN1_NULL))
+ return (a->type);
+ else
+ return (0);
+}
+
+void ASN1_TYPE_set(ASN1_TYPE *a, int type, void *value)
+{
+ if (a->value.ptr != NULL) {
+ ASN1_TYPE **tmp_a = &a;
+ asn1_primitive_free((ASN1_VALUE **)tmp_a, NULL, 0);
+ }
+ a->type = type;
+ if (type == V_ASN1_BOOLEAN)
+ a->value.boolean = value ? 0xff : 0;
+ else
+ a->value.ptr = value;
+}
+
+int ASN1_TYPE_set1(ASN1_TYPE *a, int type, const void *value)
+{
+ if (!value || (type == V_ASN1_BOOLEAN)) {
+ void *p = (void *)value;
+ ASN1_TYPE_set(a, type, p);
+ } else if (type == V_ASN1_OBJECT) {
+ ASN1_OBJECT *odup;
+ odup = OBJ_dup(value);
+ if (!odup)
+ return 0;
+ ASN1_TYPE_set(a, type, odup);
+ } else {
+ ASN1_STRING *sdup;
+ sdup = ASN1_STRING_dup(value);
+ if (!sdup)
+ return 0;
+ ASN1_TYPE_set(a, type, sdup);
+ }
+ return 1;
+}
+
+/* Returns 0 if they are equal, != 0 otherwise. */
+int ASN1_TYPE_cmp(const ASN1_TYPE *a, const ASN1_TYPE *b)
+{
+ int result = -1;
+
+ if (!a || !b || a->type != b->type)
+ return -1;
+
+ switch (a->type) {
+ case V_ASN1_OBJECT:
+ result = OBJ_cmp(a->value.object, b->value.object);
+ break;
+ case V_ASN1_BOOLEAN:
+ result = a->value.boolean - b->value.boolean;
+ break;
+ case V_ASN1_NULL:
+ result = 0; /* They do not have content. */
+ break;
+ case V_ASN1_INTEGER:
+ case V_ASN1_ENUMERATED:
+ case V_ASN1_BIT_STRING:
+ case V_ASN1_OCTET_STRING:
+ case V_ASN1_SEQUENCE:
+ case V_ASN1_SET:
+ case V_ASN1_NUMERICSTRING:
+ case V_ASN1_PRINTABLESTRING:
+ case V_ASN1_T61STRING:
+ case V_ASN1_VIDEOTEXSTRING:
+ case V_ASN1_IA5STRING:
+ case V_ASN1_UTCTIME:
+ case V_ASN1_GENERALIZEDTIME:
+ case V_ASN1_GRAPHICSTRING:
+ case V_ASN1_VISIBLESTRING:
+ case V_ASN1_GENERALSTRING:
+ case V_ASN1_UNIVERSALSTRING:
+ case V_ASN1_BMPSTRING:
+ case V_ASN1_UTF8STRING:
+ case V_ASN1_OTHER:
+ default:
+ result = ASN1_STRING_cmp((ASN1_STRING *)a->value.ptr,
+ (ASN1_STRING *)b->value.ptr);
+ break;
+ }
+
+ return result;
+}
+
+ASN1_TYPE *ASN1_TYPE_pack_sequence(const ASN1_ITEM *it, void *s, ASN1_TYPE **t)
+{
+ ASN1_OCTET_STRING *oct;
+ ASN1_TYPE *rt;
+
+ oct = ASN1_item_pack(s, it, NULL);
+ if (oct == NULL)
+ return NULL;
+
+ if (t && *t) {
+ rt = *t;
+ } else {
+ rt = ASN1_TYPE_new();
+ if (rt == NULL) {
+ ASN1_OCTET_STRING_free(oct);
+ return NULL;
+ }
+ if (t)
+ *t = rt;
+ }
+ ASN1_TYPE_set(rt, V_ASN1_SEQUENCE, oct);
+ return rt;
+}
+
+void *ASN1_TYPE_unpack_sequence(const ASN1_ITEM *it, const ASN1_TYPE *t)
+{
+ if (t == NULL || t->type != V_ASN1_SEQUENCE || t->value.sequence == NULL)
+ return NULL;
+ return ASN1_item_unpack(t->value.sequence, it);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_utctm.c b/openssl-1.1.0h/crypto/asn1/a_utctm.c
new file mode 100644
index 0000000..9797aa8
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_utctm.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+#include "asn1_locl.h"
+
+int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d)
+{
+ static const int min[8] = { 0, 1, 1, 0, 0, 0, 0, 0 };
+ static const int max[8] = { 99, 12, 31, 23, 59, 59, 12, 59 };
+ char *a;
+ int n, i, l, o;
+
+ if (d->type != V_ASN1_UTCTIME)
+ return (0);
+ l = d->length;
+ a = (char *)d->data;
+ o = 0;
+
+ if (l < 11)
+ goto err;
+ for (i = 0; i < 6; i++) {
+ if ((i == 5) && ((a[o] == 'Z') || (a[o] == '+') || (a[o] == '-'))) {
+ i++;
+ if (tm)
+ tm->tm_sec = 0;
+ break;
+ }
+ if ((a[o] < '0') || (a[o] > '9'))
+ goto err;
+ n = a[o] - '0';
+ if (++o > l)
+ goto err;
+
+ if ((a[o] < '0') || (a[o] > '9'))
+ goto err;
+ n = (n * 10) + a[o] - '0';
+ if (++o > l)
+ goto err;
+
+ if ((n < min[i]) || (n > max[i]))
+ goto err;
+ if (tm) {
+ switch (i) {
+ case 0:
+ tm->tm_year = n < 50 ? n + 100 : n;
+ break;
+ case 1:
+ tm->tm_mon = n - 1;
+ break;
+ case 2:
+ tm->tm_mday = n;
+ break;
+ case 3:
+ tm->tm_hour = n;
+ break;
+ case 4:
+ tm->tm_min = n;
+ break;
+ case 5:
+ tm->tm_sec = n;
+ break;
+ }
+ }
+ }
+ if (a[o] == 'Z')
+ o++;
+ else if ((a[o] == '+') || (a[o] == '-')) {
+ int offsign = a[o] == '-' ? 1 : -1, offset = 0;
+ o++;
+ if (o + 4 > l)
+ goto err;
+ for (i = 6; i < 8; i++) {
+ if ((a[o] < '0') || (a[o] > '9'))
+ goto err;
+ n = a[o] - '0';
+ o++;
+ if ((a[o] < '0') || (a[o] > '9'))
+ goto err;
+ n = (n * 10) + a[o] - '0';
+ if ((n < min[i]) || (n > max[i]))
+ goto err;
+ if (tm) {
+ if (i == 6)
+ offset = n * 3600;
+ else if (i == 7)
+ offset += n * 60;
+ }
+ o++;
+ }
+ if (offset && !OPENSSL_gmtime_adj(tm, 0, offset * offsign))
+ return 0;
+ }
+ return o == l;
+ err:
+ return 0;
+}
+
+int ASN1_UTCTIME_check(const ASN1_UTCTIME *d)
+{
+ return asn1_utctime_to_tm(NULL, d);
+}
+
+int ASN1_UTCTIME_set_string(ASN1_UTCTIME *s, const char *str)
+{
+ ASN1_UTCTIME t;
+
+ t.type = V_ASN1_UTCTIME;
+ t.length = strlen(str);
+ t.data = (unsigned char *)str;
+ if (ASN1_UTCTIME_check(&t)) {
+ if (s != NULL) {
+ if (!ASN1_STRING_set((ASN1_STRING *)s, str, t.length))
+ return 0;
+ s->type = V_ASN1_UTCTIME;
+ }
+ return (1);
+ } else
+ return (0);
+}
+
+ASN1_UTCTIME *ASN1_UTCTIME_set(ASN1_UTCTIME *s, time_t t)
+{
+ return ASN1_UTCTIME_adj(s, t, 0, 0);
+}
+
+ASN1_UTCTIME *ASN1_UTCTIME_adj(ASN1_UTCTIME *s, time_t t,
+ int offset_day, long offset_sec)
+{
+ char *p;
+ struct tm *ts;
+ struct tm data;
+ size_t len = 20;
+ int free_s = 0;
+
+ if (s == NULL) {
+ s = ASN1_UTCTIME_new();
+ if (s == NULL)
+ goto err;
+ free_s = 1;
+ }
+
+ ts = OPENSSL_gmtime(&t, &data);
+ if (ts == NULL)
+ goto err;
+
+ if (offset_day || offset_sec) {
+ if (!OPENSSL_gmtime_adj(ts, offset_day, offset_sec))
+ goto err;
+ }
+
+ if ((ts->tm_year < 50) || (ts->tm_year >= 150))
+ goto err;
+
+ p = (char *)s->data;
+ if ((p == NULL) || ((size_t)s->length < len)) {
+ p = OPENSSL_malloc(len);
+ if (p == NULL) {
+ ASN1err(ASN1_F_ASN1_UTCTIME_ADJ, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ OPENSSL_free(s->data);
+ s->data = (unsigned char *)p;
+ }
+
+ BIO_snprintf(p, len, "%02d%02d%02d%02d%02d%02dZ", ts->tm_year % 100,
+ ts->tm_mon + 1, ts->tm_mday, ts->tm_hour, ts->tm_min,
+ ts->tm_sec);
+ s->length = strlen(p);
+ s->type = V_ASN1_UTCTIME;
+#ifdef CHARSET_EBCDIC_not
+ ebcdic2ascii(s->data, s->data, s->length);
+#endif
+ return (s);
+ err:
+ if (free_s)
+ ASN1_UTCTIME_free(s);
+ return NULL;
+}
+
+int ASN1_UTCTIME_cmp_time_t(const ASN1_UTCTIME *s, time_t t)
+{
+ struct tm stm, ttm;
+ int day, sec;
+
+ if (!asn1_utctime_to_tm(&stm, s))
+ return -2;
+
+ if (!OPENSSL_gmtime(&t, &ttm))
+ return -2;
+
+ if (!OPENSSL_gmtime_diff(&day, &sec, &ttm, &stm))
+ return -2;
+
+ if (day > 0)
+ return 1;
+ if (day < 0)
+ return -1;
+ if (sec > 0)
+ return 1;
+ if (sec < 0)
+ return -1;
+ return 0;
+}
+
+int ASN1_UTCTIME_print(BIO *bp, const ASN1_UTCTIME *tm)
+{
+ const char *v;
+ int gmt = 0;
+ int i;
+ int y = 0, M = 0, d = 0, h = 0, m = 0, s = 0;
+
+ i = tm->length;
+ v = (const char *)tm->data;
+
+ if (i < 10)
+ goto err;
+ if (v[i - 1] == 'Z')
+ gmt = 1;
+ for (i = 0; i < 10; i++)
+ if ((v[i] > '9') || (v[i] < '0'))
+ goto err;
+ y = (v[0] - '0') * 10 + (v[1] - '0');
+ if (y < 50)
+ y += 100;
+ M = (v[2] - '0') * 10 + (v[3] - '0');
+ if ((M > 12) || (M < 1))
+ goto err;
+ d = (v[4] - '0') * 10 + (v[5] - '0');
+ h = (v[6] - '0') * 10 + (v[7] - '0');
+ m = (v[8] - '0') * 10 + (v[9] - '0');
+ if (tm->length >= 12 &&
+ (v[10] >= '0') && (v[10] <= '9') && (v[11] >= '0') && (v[11] <= '9'))
+ s = (v[10] - '0') * 10 + (v[11] - '0');
+
+ if (BIO_printf(bp, "%s %2d %02d:%02d:%02d %d%s",
+ _asn1_mon[M - 1], d, h, m, s, y + 1900,
+ (gmt) ? " GMT" : "") <= 0)
+ return (0);
+ else
+ return (1);
+ err:
+ BIO_write(bp, "Bad time value", 14);
+ return (0);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_utf8.c b/openssl-1.1.0h/crypto/asn1/a_utf8.c
new file mode 100644
index 0000000..e2dc09f
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_utf8.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+
+/* UTF8 utilities */
+
+/*-
+ * This parses a UTF8 string one character at a time. It is passed a pointer
+ * to the string and the length of the string. It sets 'value' to the value of
+ * the current character. It returns the number of characters read or a
+ * negative error code:
+ * -1 = string too short
+ * -2 = illegal character
+ * -3 = subsequent characters not of the form 10xxxxxx
+ * -4 = character encoded incorrectly (not minimal length).
+ */
+
+int UTF8_getc(const unsigned char *str, int len, unsigned long *val)
+{
+ const unsigned char *p;
+ unsigned long value;
+ int ret;
+ if (len <= 0)
+ return 0;
+ p = str;
+
+ /* Check syntax and work out the encoded value (if correct) */
+ if ((*p & 0x80) == 0) {
+ value = *p++ & 0x7f;
+ ret = 1;
+ } else if ((*p & 0xe0) == 0xc0) {
+ if (len < 2)
+ return -1;
+ if ((p[1] & 0xc0) != 0x80)
+ return -3;
+ value = (*p++ & 0x1f) << 6;
+ value |= *p++ & 0x3f;
+ if (value < 0x80)
+ return -4;
+ ret = 2;
+ } else if ((*p & 0xf0) == 0xe0) {
+ if (len < 3)
+ return -1;
+ if (((p[1] & 0xc0) != 0x80)
+ || ((p[2] & 0xc0) != 0x80))
+ return -3;
+ value = (*p++ & 0xf) << 12;
+ value |= (*p++ & 0x3f) << 6;
+ value |= *p++ & 0x3f;
+ if (value < 0x800)
+ return -4;
+ ret = 3;
+ } else if ((*p & 0xf8) == 0xf0) {
+ if (len < 4)
+ return -1;
+ if (((p[1] & 0xc0) != 0x80)
+ || ((p[2] & 0xc0) != 0x80)
+ || ((p[3] & 0xc0) != 0x80))
+ return -3;
+ value = ((unsigned long)(*p++ & 0x7)) << 18;
+ value |= (*p++ & 0x3f) << 12;
+ value |= (*p++ & 0x3f) << 6;
+ value |= *p++ & 0x3f;
+ if (value < 0x10000)
+ return -4;
+ ret = 4;
+ } else if ((*p & 0xfc) == 0xf8) {
+ if (len < 5)
+ return -1;
+ if (((p[1] & 0xc0) != 0x80)
+ || ((p[2] & 0xc0) != 0x80)
+ || ((p[3] & 0xc0) != 0x80)
+ || ((p[4] & 0xc0) != 0x80))
+ return -3;
+ value = ((unsigned long)(*p++ & 0x3)) << 24;
+ value |= ((unsigned long)(*p++ & 0x3f)) << 18;
+ value |= ((unsigned long)(*p++ & 0x3f)) << 12;
+ value |= (*p++ & 0x3f) << 6;
+ value |= *p++ & 0x3f;
+ if (value < 0x200000)
+ return -4;
+ ret = 5;
+ } else if ((*p & 0xfe) == 0xfc) {
+ if (len < 6)
+ return -1;
+ if (((p[1] & 0xc0) != 0x80)
+ || ((p[2] & 0xc0) != 0x80)
+ || ((p[3] & 0xc0) != 0x80)
+ || ((p[4] & 0xc0) != 0x80)
+ || ((p[5] & 0xc0) != 0x80))
+ return -3;
+ value = ((unsigned long)(*p++ & 0x1)) << 30;
+ value |= ((unsigned long)(*p++ & 0x3f)) << 24;
+ value |= ((unsigned long)(*p++ & 0x3f)) << 18;
+ value |= ((unsigned long)(*p++ & 0x3f)) << 12;
+ value |= (*p++ & 0x3f) << 6;
+ value |= *p++ & 0x3f;
+ if (value < 0x4000000)
+ return -4;
+ ret = 6;
+ } else
+ return -2;
+ *val = value;
+ return ret;
+}
+
+/*
+ * This takes a character 'value' and writes the UTF8 encoded value in 'str'
+ * where 'str' is a buffer containing 'len' characters. Returns the number of
+ * characters written or -1 if 'len' is too small. 'str' can be set to NULL
+ * in which case it just returns the number of characters. It will need at
+ * most 6 characters.
+ */
+
+int UTF8_putc(unsigned char *str, int len, unsigned long value)
+{
+ if (!str)
+ len = 6; /* Maximum we will need */
+ else if (len <= 0)
+ return -1;
+ if (value < 0x80) {
+ if (str)
+ *str = (unsigned char)value;
+ return 1;
+ }
+ if (value < 0x800) {
+ if (len < 2)
+ return -1;
+ if (str) {
+ *str++ = (unsigned char)(((value >> 6) & 0x1f) | 0xc0);
+ *str = (unsigned char)((value & 0x3f) | 0x80);
+ }
+ return 2;
+ }
+ if (value < 0x10000) {
+ if (len < 3)
+ return -1;
+ if (str) {
+ *str++ = (unsigned char)(((value >> 12) & 0xf) | 0xe0);
+ *str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80);
+ *str = (unsigned char)((value & 0x3f) | 0x80);
+ }
+ return 3;
+ }
+ if (value < 0x200000) {
+ if (len < 4)
+ return -1;
+ if (str) {
+ *str++ = (unsigned char)(((value >> 18) & 0x7) | 0xf0);
+ *str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80);
+ *str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80);
+ *str = (unsigned char)((value & 0x3f) | 0x80);
+ }
+ return 4;
+ }
+ if (value < 0x4000000) {
+ if (len < 5)
+ return -1;
+ if (str) {
+ *str++ = (unsigned char)(((value >> 24) & 0x3) | 0xf8);
+ *str++ = (unsigned char)(((value >> 18) & 0x3f) | 0x80);
+ *str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80);
+ *str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80);
+ *str = (unsigned char)((value & 0x3f) | 0x80);
+ }
+ return 5;
+ }
+ if (len < 6)
+ return -1;
+ if (str) {
+ *str++ = (unsigned char)(((value >> 30) & 0x1) | 0xfc);
+ *str++ = (unsigned char)(((value >> 24) & 0x3f) | 0x80);
+ *str++ = (unsigned char)(((value >> 18) & 0x3f) | 0x80);
+ *str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80);
+ *str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80);
+ *str = (unsigned char)((value & 0x3f) | 0x80);
+ }
+ return 6;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/a_verify.c b/openssl-1.1.0h/crypto/asn1/a_verify.c
new file mode 100644
index 0000000..fb3607c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/a_verify.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include <sys/types.h>
+
+#include "internal/cryptlib.h"
+
+#include <openssl/bn.h>
+#include <openssl/x509.h>
+#include <openssl/objects.h>
+#include <openssl/buffer.h>
+#include <openssl/evp.h>
+#include "internal/asn1_int.h"
+#include "internal/evp_int.h"
+
+#ifndef NO_ASN1_OLD
+
+int ASN1_verify(i2d_of_void *i2d, X509_ALGOR *a, ASN1_BIT_STRING *signature,
+ char *data, EVP_PKEY *pkey)
+{
+ EVP_MD_CTX *ctx = EVP_MD_CTX_new();
+ const EVP_MD *type;
+ unsigned char *p, *buf_in = NULL;
+ int ret = -1, i, inl;
+
+ if (ctx == NULL) {
+ ASN1err(ASN1_F_ASN1_VERIFY, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ i = OBJ_obj2nid(a->algorithm);
+ type = EVP_get_digestbyname(OBJ_nid2sn(i));
+ if (type == NULL) {
+ ASN1err(ASN1_F_ASN1_VERIFY, ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
+ goto err;
+ }
+
+ if (signature->type == V_ASN1_BIT_STRING && signature->flags & 0x7) {
+ ASN1err(ASN1_F_ASN1_VERIFY, ASN1_R_INVALID_BIT_STRING_BITS_LEFT);
+ goto err;
+ }
+
+ inl = i2d(data, NULL);
+ buf_in = OPENSSL_malloc((unsigned int)inl);
+ if (buf_in == NULL) {
+ ASN1err(ASN1_F_ASN1_VERIFY, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ p = buf_in;
+
+ i2d(data, &p);
+ ret = EVP_VerifyInit_ex(ctx, type, NULL)
+ && EVP_VerifyUpdate(ctx, (unsigned char *)buf_in, inl);
+
+ OPENSSL_clear_free(buf_in, (unsigned int)inl);
+
+ if (!ret) {
+ ASN1err(ASN1_F_ASN1_VERIFY, ERR_R_EVP_LIB);
+ goto err;
+ }
+ ret = -1;
+
+ if (EVP_VerifyFinal(ctx, (unsigned char *)signature->data,
+ (unsigned int)signature->length, pkey) <= 0) {
+ ASN1err(ASN1_F_ASN1_VERIFY, ERR_R_EVP_LIB);
+ ret = 0;
+ goto err;
+ }
+ ret = 1;
+ err:
+ EVP_MD_CTX_free(ctx);
+ return (ret);
+}
+
+#endif
+
+int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a,
+ ASN1_BIT_STRING *signature, void *asn, EVP_PKEY *pkey)
+{
+ EVP_MD_CTX *ctx = NULL;
+ unsigned char *buf_in = NULL;
+ int ret = -1, inl;
+
+ int mdnid, pknid;
+
+ if (!pkey) {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ERR_R_PASSED_NULL_PARAMETER);
+ return -1;
+ }
+
+ if (signature->type == V_ASN1_BIT_STRING && signature->flags & 0x7) {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ASN1_R_INVALID_BIT_STRING_BITS_LEFT);
+ return -1;
+ }
+
+ ctx = EVP_MD_CTX_new();
+ if (ctx == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ /* Convert signature OID into digest and public key OIDs */
+ if (!OBJ_find_sigid_algs(OBJ_obj2nid(a->algorithm), &mdnid, &pknid)) {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
+ goto err;
+ }
+ if (mdnid == NID_undef) {
+ if (!pkey->ameth || !pkey->ameth->item_verify) {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,
+ ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
+ goto err;
+ }
+ ret = pkey->ameth->item_verify(ctx, it, asn, a, signature, pkey);
+ /*
+ * Return value of 2 means carry on, anything else means we exit
+ * straight away: either a fatal error of the underlying verification
+ * routine handles all verification.
+ */
+ if (ret != 2)
+ goto err;
+ ret = -1;
+ } else {
+ const EVP_MD *type;
+ type = EVP_get_digestbynid(mdnid);
+ if (type == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,
+ ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
+ goto err;
+ }
+
+ /* Check public key OID matches public key type */
+ if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id) {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ASN1_R_WRONG_PUBLIC_KEY_TYPE);
+ goto err;
+ }
+
+ if (!EVP_DigestVerifyInit(ctx, NULL, type, NULL, pkey)) {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ERR_R_EVP_LIB);
+ ret = 0;
+ goto err;
+ }
+
+ }
+
+ inl = ASN1_item_i2d(asn, &buf_in, it);
+
+ if (buf_in == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ ret = EVP_DigestVerifyUpdate(ctx, buf_in, inl);
+
+ OPENSSL_clear_free(buf_in, (unsigned int)inl);
+
+ if (!ret) {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ERR_R_EVP_LIB);
+ goto err;
+ }
+ ret = -1;
+
+ if (EVP_DigestVerifyFinal(ctx, signature->data,
+ (size_t)signature->length) <= 0) {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ERR_R_EVP_LIB);
+ ret = 0;
+ goto err;
+ }
+ ret = 1;
+ err:
+ EVP_MD_CTX_free(ctx);
+ return (ret);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/ameth_lib.c b/openssl-1.1.0h/crypto/asn1/ameth_lib.c
new file mode 100644
index 0000000..b8ba067
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/ameth_lib.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright 2006-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include <openssl/engine.h>
+#include "internal/asn1_int.h"
+#include "internal/evp_int.h"
+
+/* Keep this sorted in type order !! */
+static const EVP_PKEY_ASN1_METHOD *standard_methods[] = {
+#ifndef OPENSSL_NO_RSA
+ &rsa_asn1_meths[0],
+ &rsa_asn1_meths[1],
+#endif
+#ifndef OPENSSL_NO_DH
+ &dh_asn1_meth,
+#endif
+#ifndef OPENSSL_NO_DSA
+ &dsa_asn1_meths[0],
+ &dsa_asn1_meths[1],
+ &dsa_asn1_meths[2],
+ &dsa_asn1_meths[3],
+ &dsa_asn1_meths[4],
+#endif
+#ifndef OPENSSL_NO_EC
+ &eckey_asn1_meth,
+#endif
+ &hmac_asn1_meth,
+#ifndef OPENSSL_NO_CMAC
+ &cmac_asn1_meth,
+#endif
+#ifndef OPENSSL_NO_DH
+ &dhx_asn1_meth,
+#endif
+#ifndef OPENSSL_NO_EC
+ &ecx25519_asn1_meth
+#endif
+};
+
+typedef int sk_cmp_fn_type(const char *const *a, const char *const *b);
+static STACK_OF(EVP_PKEY_ASN1_METHOD) *app_methods = NULL;
+
+#ifdef TEST
+void main()
+{
+ int i;
+ for (i = 0; i < OSSL_NELEM(standard_methods); i++)
+ fprintf(stderr, "Number %d id=%d (%s)\n", i,
+ standard_methods[i]->pkey_id,
+ OBJ_nid2sn(standard_methods[i]->pkey_id));
+}
+#endif
+
+DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_ASN1_METHOD *,
+ const EVP_PKEY_ASN1_METHOD *, ameth);
+
+static int ameth_cmp(const EVP_PKEY_ASN1_METHOD *const *a,
+ const EVP_PKEY_ASN1_METHOD *const *b)
+{
+ return ((*a)->pkey_id - (*b)->pkey_id);
+}
+
+IMPLEMENT_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_ASN1_METHOD *,
+ const EVP_PKEY_ASN1_METHOD *, ameth);
+
+int EVP_PKEY_asn1_get_count(void)
+{
+ int num = OSSL_NELEM(standard_methods);
+ if (app_methods)
+ num += sk_EVP_PKEY_ASN1_METHOD_num(app_methods);
+ return num;
+}
+
+const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_get0(int idx)
+{
+ int num = OSSL_NELEM(standard_methods);
+ if (idx < 0)
+ return NULL;
+ if (idx < num)
+ return standard_methods[idx];
+ idx -= num;
+ return sk_EVP_PKEY_ASN1_METHOD_value(app_methods, idx);
+}
+
+static const EVP_PKEY_ASN1_METHOD *pkey_asn1_find(int type)
+{
+ EVP_PKEY_ASN1_METHOD tmp;
+ const EVP_PKEY_ASN1_METHOD *t = &tmp, **ret;
+ tmp.pkey_id = type;
+ if (app_methods) {
+ int idx;
+ idx = sk_EVP_PKEY_ASN1_METHOD_find(app_methods, &tmp);
+ if (idx >= 0)
+ return sk_EVP_PKEY_ASN1_METHOD_value(app_methods, idx);
+ }
+ ret = OBJ_bsearch_ameth(&t, standard_methods, OSSL_NELEM(standard_methods));
+ if (!ret || !*ret)
+ return NULL;
+ return *ret;
+}
+
+/*
+ * Find an implementation of an ASN1 algorithm. If 'pe' is not NULL also
+ * search through engines and set *pe to a functional reference to the engine
+ * implementing 'type' or NULL if no engine implements it.
+ */
+
+const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find(ENGINE **pe, int type)
+{
+ const EVP_PKEY_ASN1_METHOD *t;
+
+ for (;;) {
+ t = pkey_asn1_find(type);
+ if (!t || !(t->pkey_flags & ASN1_PKEY_ALIAS))
+ break;
+ type = t->pkey_base_id;
+ }
+ if (pe) {
+#ifndef OPENSSL_NO_ENGINE
+ ENGINE *e;
+ /* type will contain the final unaliased type */
+ e = ENGINE_get_pkey_asn1_meth_engine(type);
+ if (e) {
+ *pe = e;
+ return ENGINE_get_pkey_asn1_meth(e, type);
+ }
+#endif
+ *pe = NULL;
+ }
+ return t;
+}
+
+const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find_str(ENGINE **pe,
+ const char *str, int len)
+{
+ int i;
+ const EVP_PKEY_ASN1_METHOD *ameth = NULL;
+
+ if (len == -1)
+ len = strlen(str);
+ if (pe) {
+#ifndef OPENSSL_NO_ENGINE
+ ENGINE *e;
+ ameth = ENGINE_pkey_asn1_find_str(&e, str, len);
+ if (ameth) {
+ /*
+ * Convert structural into functional reference
+ */
+ if (!ENGINE_init(e))
+ ameth = NULL;
+ ENGINE_free(e);
+ *pe = e;
+ return ameth;
+ }
+#endif
+ *pe = NULL;
+ }
+ for (i = EVP_PKEY_asn1_get_count(); i-- > 0; ) {
+ ameth = EVP_PKEY_asn1_get0(i);
+ if (ameth->pkey_flags & ASN1_PKEY_ALIAS)
+ continue;
+ if ((int)strlen(ameth->pem_str) == len
+ && strncasecmp(ameth->pem_str, str, len) == 0)
+ return ameth;
+ }
+ return NULL;
+}
+
+int EVP_PKEY_asn1_add0(const EVP_PKEY_ASN1_METHOD *ameth)
+{
+ EVP_PKEY_ASN1_METHOD tmp = { 0, };
+
+ if (app_methods == NULL) {
+ app_methods = sk_EVP_PKEY_ASN1_METHOD_new(ameth_cmp);
+ if (app_methods == NULL)
+ return 0;
+ }
+
+ tmp.pkey_id = ameth->pkey_id;
+ if (sk_EVP_PKEY_ASN1_METHOD_find(app_methods, &tmp) >= 0) {
+ EVPerr(EVP_F_EVP_PKEY_ASN1_ADD0,
+ EVP_R_PKEY_APPLICATION_ASN1_METHOD_ALREADY_REGISTERED);
+ return 0;
+ }
+
+ if (!sk_EVP_PKEY_ASN1_METHOD_push(app_methods, ameth))
+ return 0;
+ sk_EVP_PKEY_ASN1_METHOD_sort(app_methods);
+ return 1;
+}
+
+int EVP_PKEY_asn1_add_alias(int to, int from)
+{
+ EVP_PKEY_ASN1_METHOD *ameth;
+ ameth = EVP_PKEY_asn1_new(from, ASN1_PKEY_ALIAS, NULL, NULL);
+ if (ameth == NULL)
+ return 0;
+ ameth->pkey_base_id = to;
+ if (!EVP_PKEY_asn1_add0(ameth)) {
+ EVP_PKEY_asn1_free(ameth);
+ return 0;
+ }
+ return 1;
+}
+
+int EVP_PKEY_asn1_get0_info(int *ppkey_id, int *ppkey_base_id,
+ int *ppkey_flags, const char **pinfo,
+ const char **ppem_str,
+ const EVP_PKEY_ASN1_METHOD *ameth)
+{
+ if (!ameth)
+ return 0;
+ if (ppkey_id)
+ *ppkey_id = ameth->pkey_id;
+ if (ppkey_base_id)
+ *ppkey_base_id = ameth->pkey_base_id;
+ if (ppkey_flags)
+ *ppkey_flags = ameth->pkey_flags;
+ if (pinfo)
+ *pinfo = ameth->info;
+ if (ppem_str)
+ *ppem_str = ameth->pem_str;
+ return 1;
+}
+
+const EVP_PKEY_ASN1_METHOD *EVP_PKEY_get0_asn1(const EVP_PKEY *pkey)
+{
+ return pkey->ameth;
+}
+
+EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_new(int id, int flags,
+ const char *pem_str, const char *info)
+{
+ EVP_PKEY_ASN1_METHOD *ameth = OPENSSL_zalloc(sizeof(*ameth));
+
+ if (ameth == NULL)
+ return NULL;
+
+ ameth->pkey_id = id;
+ ameth->pkey_base_id = id;
+ ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC;
+
+ if (info) {
+ ameth->info = OPENSSL_strdup(info);
+ if (!ameth->info)
+ goto err;
+ }
+
+ if (pem_str) {
+ ameth->pem_str = OPENSSL_strdup(pem_str);
+ if (!ameth->pem_str)
+ goto err;
+ }
+
+ return ameth;
+
+ err:
+ EVP_PKEY_asn1_free(ameth);
+ return NULL;
+
+}
+
+void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst,
+ const EVP_PKEY_ASN1_METHOD *src)
+{
+
+ dst->pub_decode = src->pub_decode;
+ dst->pub_encode = src->pub_encode;
+ dst->pub_cmp = src->pub_cmp;
+ dst->pub_print = src->pub_print;
+
+ dst->priv_decode = src->priv_decode;
+ dst->priv_encode = src->priv_encode;
+ dst->priv_print = src->priv_print;
+
+ dst->old_priv_encode = src->old_priv_encode;
+ dst->old_priv_decode = src->old_priv_decode;
+
+ dst->pkey_size = src->pkey_size;
+ dst->pkey_bits = src->pkey_bits;
+
+ dst->param_decode = src->param_decode;
+ dst->param_encode = src->param_encode;
+ dst->param_missing = src->param_missing;
+ dst->param_copy = src->param_copy;
+ dst->param_cmp = src->param_cmp;
+ dst->param_print = src->param_print;
+
+ dst->pkey_free = src->pkey_free;
+ dst->pkey_ctrl = src->pkey_ctrl;
+
+ dst->item_sign = src->item_sign;
+ dst->item_verify = src->item_verify;
+
+}
+
+void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth)
+{
+ if (ameth && (ameth->pkey_flags & ASN1_PKEY_DYNAMIC)) {
+ OPENSSL_free(ameth->pem_str);
+ OPENSSL_free(ameth->info);
+ OPENSSL_free(ameth);
+ }
+}
+
+void EVP_PKEY_asn1_set_public(EVP_PKEY_ASN1_METHOD *ameth,
+ int (*pub_decode) (EVP_PKEY *pk,
+ X509_PUBKEY *pub),
+ int (*pub_encode) (X509_PUBKEY *pub,
+ const EVP_PKEY *pk),
+ int (*pub_cmp) (const EVP_PKEY *a,
+ const EVP_PKEY *b),
+ int (*pub_print) (BIO *out,
+ const EVP_PKEY *pkey,
+ int indent, ASN1_PCTX *pctx),
+ int (*pkey_size) (const EVP_PKEY *pk),
+ int (*pkey_bits) (const EVP_PKEY *pk))
+{
+ ameth->pub_decode = pub_decode;
+ ameth->pub_encode = pub_encode;
+ ameth->pub_cmp = pub_cmp;
+ ameth->pub_print = pub_print;
+ ameth->pkey_size = pkey_size;
+ ameth->pkey_bits = pkey_bits;
+}
+
+void EVP_PKEY_asn1_set_private(EVP_PKEY_ASN1_METHOD *ameth,
+ int (*priv_decode) (EVP_PKEY *pk,
+ const PKCS8_PRIV_KEY_INFO
+ *p8inf),
+ int (*priv_encode) (PKCS8_PRIV_KEY_INFO *p8,
+ const EVP_PKEY *pk),
+ int (*priv_print) (BIO *out,
+ const EVP_PKEY *pkey,
+ int indent,
+ ASN1_PCTX *pctx))
+{
+ ameth->priv_decode = priv_decode;
+ ameth->priv_encode = priv_encode;
+ ameth->priv_print = priv_print;
+}
+
+void EVP_PKEY_asn1_set_param(EVP_PKEY_ASN1_METHOD *ameth,
+ int (*param_decode) (EVP_PKEY *pkey,
+ const unsigned char **pder,
+ int derlen),
+ int (*param_encode) (const EVP_PKEY *pkey,
+ unsigned char **pder),
+ int (*param_missing) (const EVP_PKEY *pk),
+ int (*param_copy) (EVP_PKEY *to,
+ const EVP_PKEY *from),
+ int (*param_cmp) (const EVP_PKEY *a,
+ const EVP_PKEY *b),
+ int (*param_print) (BIO *out,
+ const EVP_PKEY *pkey,
+ int indent, ASN1_PCTX *pctx))
+{
+ ameth->param_decode = param_decode;
+ ameth->param_encode = param_encode;
+ ameth->param_missing = param_missing;
+ ameth->param_copy = param_copy;
+ ameth->param_cmp = param_cmp;
+ ameth->param_print = param_print;
+}
+
+void EVP_PKEY_asn1_set_free(EVP_PKEY_ASN1_METHOD *ameth,
+ void (*pkey_free) (EVP_PKEY *pkey))
+{
+ ameth->pkey_free = pkey_free;
+}
+
+void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
+ int (*pkey_ctrl) (EVP_PKEY *pkey, int op,
+ long arg1, void *arg2))
+{
+ ameth->pkey_ctrl = pkey_ctrl;
+}
+
+void EVP_PKEY_asn1_set_security_bits(EVP_PKEY_ASN1_METHOD *ameth,
+ int (*pkey_security_bits) (const EVP_PKEY
+ *pk))
+{
+ ameth->pkey_security_bits = pkey_security_bits;
+}
+
+void EVP_PKEY_asn1_set_item(EVP_PKEY_ASN1_METHOD *ameth,
+ int (*item_verify) (EVP_MD_CTX *ctx,
+ const ASN1_ITEM *it,
+ void *asn,
+ X509_ALGOR *a,
+ ASN1_BIT_STRING *sig,
+ EVP_PKEY *pkey),
+ int (*item_sign) (EVP_MD_CTX *ctx,
+ const ASN1_ITEM *it,
+ void *asn,
+ X509_ALGOR *alg1,
+ X509_ALGOR *alg2,
+ ASN1_BIT_STRING *sig))
+{
+ ameth->item_sign = item_sign;
+ ameth->item_verify = item_verify;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/asn1_err.c b/openssl-1.1.0h/crypto/asn1/asn1_err.c
new file mode 100644
index 0000000..8602c40
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/asn1_err.c
@@ -0,0 +1,271 @@
+/*
+ * Generated by util/mkerr.pl DO NOT EDIT
+ * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <openssl/err.h>
+#include <openssl/asn1.h>
+
+/* BEGIN ERROR CODES */
+#ifndef OPENSSL_NO_ERR
+
+# define ERR_FUNC(func) ERR_PACK(ERR_LIB_ASN1,func,0)
+# define ERR_REASON(reason) ERR_PACK(ERR_LIB_ASN1,0,reason)
+
+static ERR_STRING_DATA ASN1_str_functs[] = {
+ {ERR_FUNC(ASN1_F_A2D_ASN1_OBJECT), "a2d_ASN1_OBJECT"},
+ {ERR_FUNC(ASN1_F_A2I_ASN1_INTEGER), "a2i_ASN1_INTEGER"},
+ {ERR_FUNC(ASN1_F_A2I_ASN1_STRING), "a2i_ASN1_STRING"},
+ {ERR_FUNC(ASN1_F_APPEND_EXP), "append_exp"},
+ {ERR_FUNC(ASN1_F_ASN1_BIT_STRING_SET_BIT), "ASN1_BIT_STRING_set_bit"},
+ {ERR_FUNC(ASN1_F_ASN1_CB), "asn1_cb"},
+ {ERR_FUNC(ASN1_F_ASN1_CHECK_TLEN), "asn1_check_tlen"},
+ {ERR_FUNC(ASN1_F_ASN1_COLLECT), "asn1_collect"},
+ {ERR_FUNC(ASN1_F_ASN1_D2I_EX_PRIMITIVE), "asn1_d2i_ex_primitive"},
+ {ERR_FUNC(ASN1_F_ASN1_D2I_FP), "ASN1_d2i_fp"},
+ {ERR_FUNC(ASN1_F_ASN1_D2I_READ_BIO), "asn1_d2i_read_bio"},
+ {ERR_FUNC(ASN1_F_ASN1_DIGEST), "ASN1_digest"},
+ {ERR_FUNC(ASN1_F_ASN1_DO_ADB), "asn1_do_adb"},
+ {ERR_FUNC(ASN1_F_ASN1_DO_LOCK), "asn1_do_lock"},
+ {ERR_FUNC(ASN1_F_ASN1_DUP), "ASN1_dup"},
+ {ERR_FUNC(ASN1_F_ASN1_EX_C2I), "asn1_ex_c2i"},
+ {ERR_FUNC(ASN1_F_ASN1_FIND_END), "asn1_find_end"},
+ {ERR_FUNC(ASN1_F_ASN1_GENERALIZEDTIME_ADJ), "ASN1_GENERALIZEDTIME_adj"},
+ {ERR_FUNC(ASN1_F_ASN1_GENERATE_V3), "ASN1_generate_v3"},
+ {ERR_FUNC(ASN1_F_ASN1_GET_INT64), "asn1_get_int64"},
+ {ERR_FUNC(ASN1_F_ASN1_GET_OBJECT), "ASN1_get_object"},
+ {ERR_FUNC(ASN1_F_ASN1_GET_UINT64), "asn1_get_uint64"},
+ {ERR_FUNC(ASN1_F_ASN1_I2D_BIO), "ASN1_i2d_bio"},
+ {ERR_FUNC(ASN1_F_ASN1_I2D_FP), "ASN1_i2d_fp"},
+ {ERR_FUNC(ASN1_F_ASN1_ITEM_D2I_FP), "ASN1_item_d2i_fp"},
+ {ERR_FUNC(ASN1_F_ASN1_ITEM_DUP), "ASN1_item_dup"},
+ {ERR_FUNC(ASN1_F_ASN1_ITEM_EMBED_D2I), "asn1_item_embed_d2i"},
+ {ERR_FUNC(ASN1_F_ASN1_ITEM_EMBED_NEW), "asn1_item_embed_new"},
+ {ERR_FUNC(ASN1_F_ASN1_ITEM_I2D_BIO), "ASN1_item_i2d_bio"},
+ {ERR_FUNC(ASN1_F_ASN1_ITEM_I2D_FP), "ASN1_item_i2d_fp"},
+ {ERR_FUNC(ASN1_F_ASN1_ITEM_PACK), "ASN1_item_pack"},
+ {ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN), "ASN1_item_sign"},
+ {ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN_CTX), "ASN1_item_sign_ctx"},
+ {ERR_FUNC(ASN1_F_ASN1_ITEM_UNPACK), "ASN1_item_unpack"},
+ {ERR_FUNC(ASN1_F_ASN1_ITEM_VERIFY), "ASN1_item_verify"},
+ {ERR_FUNC(ASN1_F_ASN1_MBSTRING_NCOPY), "ASN1_mbstring_ncopy"},
+ {ERR_FUNC(ASN1_F_ASN1_OBJECT_NEW), "ASN1_OBJECT_new"},
+ {ERR_FUNC(ASN1_F_ASN1_OUTPUT_DATA), "asn1_output_data"},
+ {ERR_FUNC(ASN1_F_ASN1_PCTX_NEW), "ASN1_PCTX_new"},
+ {ERR_FUNC(ASN1_F_ASN1_SCTX_NEW), "ASN1_SCTX_new"},
+ {ERR_FUNC(ASN1_F_ASN1_SIGN), "ASN1_sign"},
+ {ERR_FUNC(ASN1_F_ASN1_STR2TYPE), "asn1_str2type"},
+ {ERR_FUNC(ASN1_F_ASN1_STRING_GET_INT64), "asn1_string_get_int64"},
+ {ERR_FUNC(ASN1_F_ASN1_STRING_GET_UINT64), "asn1_string_get_uint64"},
+ {ERR_FUNC(ASN1_F_ASN1_STRING_SET), "ASN1_STRING_set"},
+ {ERR_FUNC(ASN1_F_ASN1_STRING_TABLE_ADD), "ASN1_STRING_TABLE_add"},
+ {ERR_FUNC(ASN1_F_ASN1_STRING_TO_BN), "asn1_string_to_bn"},
+ {ERR_FUNC(ASN1_F_ASN1_STRING_TYPE_NEW), "ASN1_STRING_type_new"},
+ {ERR_FUNC(ASN1_F_ASN1_TEMPLATE_EX_D2I), "asn1_template_ex_d2i"},
+ {ERR_FUNC(ASN1_F_ASN1_TEMPLATE_NEW), "asn1_template_new"},
+ {ERR_FUNC(ASN1_F_ASN1_TEMPLATE_NOEXP_D2I), "asn1_template_noexp_d2i"},
+ {ERR_FUNC(ASN1_F_ASN1_TIME_ADJ), "ASN1_TIME_adj"},
+ {ERR_FUNC(ASN1_F_ASN1_TYPE_GET_INT_OCTETSTRING),
+ "ASN1_TYPE_get_int_octetstring"},
+ {ERR_FUNC(ASN1_F_ASN1_TYPE_GET_OCTETSTRING), "ASN1_TYPE_get_octetstring"},
+ {ERR_FUNC(ASN1_F_ASN1_UTCTIME_ADJ), "ASN1_UTCTIME_adj"},
+ {ERR_FUNC(ASN1_F_ASN1_VERIFY), "ASN1_verify"},
+ {ERR_FUNC(ASN1_F_B64_READ_ASN1), "b64_read_asn1"},
+ {ERR_FUNC(ASN1_F_B64_WRITE_ASN1), "B64_write_ASN1"},
+ {ERR_FUNC(ASN1_F_BIO_NEW_NDEF), "BIO_new_NDEF"},
+ {ERR_FUNC(ASN1_F_BITSTR_CB), "bitstr_cb"},
+ {ERR_FUNC(ASN1_F_BN_TO_ASN1_STRING), "bn_to_asn1_string"},
+ {ERR_FUNC(ASN1_F_C2I_ASN1_BIT_STRING), "c2i_ASN1_BIT_STRING"},
+ {ERR_FUNC(ASN1_F_C2I_ASN1_INTEGER), "c2i_ASN1_INTEGER"},
+ {ERR_FUNC(ASN1_F_C2I_ASN1_OBJECT), "c2i_ASN1_OBJECT"},
+ {ERR_FUNC(ASN1_F_C2I_IBUF), "c2i_ibuf"},
+ {ERR_FUNC(ASN1_F_C2I_UINT64_INT), "c2i_uint64_int"},
+ {ERR_FUNC(ASN1_F_COLLECT_DATA), "collect_data"},
+ {ERR_FUNC(ASN1_F_D2I_ASN1_OBJECT), "d2i_ASN1_OBJECT"},
+ {ERR_FUNC(ASN1_F_D2I_ASN1_UINTEGER), "d2i_ASN1_UINTEGER"},
+ {ERR_FUNC(ASN1_F_D2I_AUTOPRIVATEKEY), "d2i_AutoPrivateKey"},
+ {ERR_FUNC(ASN1_F_D2I_PRIVATEKEY), "d2i_PrivateKey"},
+ {ERR_FUNC(ASN1_F_D2I_PUBLICKEY), "d2i_PublicKey"},
+ {ERR_FUNC(ASN1_F_DO_TCREATE), "do_tcreate"},
+ {ERR_FUNC(ASN1_F_I2D_ASN1_BIO_STREAM), "i2d_ASN1_bio_stream"},
+ {ERR_FUNC(ASN1_F_I2D_DSA_PUBKEY), "i2d_DSA_PUBKEY"},
+ {ERR_FUNC(ASN1_F_I2D_EC_PUBKEY), "i2d_EC_PUBKEY"},
+ {ERR_FUNC(ASN1_F_I2D_PRIVATEKEY), "i2d_PrivateKey"},
+ {ERR_FUNC(ASN1_F_I2D_PUBLICKEY), "i2d_PublicKey"},
+ {ERR_FUNC(ASN1_F_I2D_RSA_PUBKEY), "i2d_RSA_PUBKEY"},
+ {ERR_FUNC(ASN1_F_LONG_C2I), "long_c2i"},
+ {ERR_FUNC(ASN1_F_OID_MODULE_INIT), "oid_module_init"},
+ {ERR_FUNC(ASN1_F_PARSE_TAGGING), "parse_tagging"},
+ {ERR_FUNC(ASN1_F_PKCS5_PBE2_SET_IV), "PKCS5_pbe2_set_iv"},
+ {ERR_FUNC(ASN1_F_PKCS5_PBE2_SET_SCRYPT), "PKCS5_pbe2_set_scrypt"},
+ {ERR_FUNC(ASN1_F_PKCS5_PBE_SET), "PKCS5_pbe_set"},
+ {ERR_FUNC(ASN1_F_PKCS5_PBE_SET0_ALGOR), "PKCS5_pbe_set0_algor"},
+ {ERR_FUNC(ASN1_F_PKCS5_PBKDF2_SET), "PKCS5_pbkdf2_set"},
+ {ERR_FUNC(ASN1_F_PKCS5_SCRYPT_SET), "pkcs5_scrypt_set"},
+ {ERR_FUNC(ASN1_F_SMIME_READ_ASN1), "SMIME_read_ASN1"},
+ {ERR_FUNC(ASN1_F_SMIME_TEXT), "SMIME_text"},
+ {ERR_FUNC(ASN1_F_STBL_MODULE_INIT), "stbl_module_init"},
+ {ERR_FUNC(ASN1_F_UINT32_C2I), "uint32_c2i"},
+ {ERR_FUNC(ASN1_F_UINT64_C2I), "uint64_c2i"},
+ {ERR_FUNC(ASN1_F_X509_CRL_ADD0_REVOKED), "X509_CRL_add0_revoked"},
+ {ERR_FUNC(ASN1_F_X509_INFO_NEW), "X509_INFO_new"},
+ {ERR_FUNC(ASN1_F_X509_NAME_ENCODE), "x509_name_encode"},
+ {ERR_FUNC(ASN1_F_X509_NAME_EX_D2I), "x509_name_ex_d2i"},
+ {ERR_FUNC(ASN1_F_X509_NAME_EX_NEW), "x509_name_ex_new"},
+ {ERR_FUNC(ASN1_F_X509_PKEY_NEW), "X509_PKEY_new"},
+ {0, NULL}
+};
+
+static ERR_STRING_DATA ASN1_str_reasons[] = {
+ {ERR_REASON(ASN1_R_ADDING_OBJECT), "adding object"},
+ {ERR_REASON(ASN1_R_ASN1_PARSE_ERROR), "asn1 parse error"},
+ {ERR_REASON(ASN1_R_ASN1_SIG_PARSE_ERROR), "asn1 sig parse error"},
+ {ERR_REASON(ASN1_R_AUX_ERROR), "aux error"},
+ {ERR_REASON(ASN1_R_BAD_OBJECT_HEADER), "bad object header"},
+ {ERR_REASON(ASN1_R_BMPSTRING_IS_WRONG_LENGTH),
+ "bmpstring is wrong length"},
+ {ERR_REASON(ASN1_R_BN_LIB), "bn lib"},
+ {ERR_REASON(ASN1_R_BOOLEAN_IS_WRONG_LENGTH), "boolean is wrong length"},
+ {ERR_REASON(ASN1_R_BUFFER_TOO_SMALL), "buffer too small"},
+ {ERR_REASON(ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER),
+ "cipher has no object identifier"},
+ {ERR_REASON(ASN1_R_CONTEXT_NOT_INITIALISED), "context not initialised"},
+ {ERR_REASON(ASN1_R_DATA_IS_WRONG), "data is wrong"},
+ {ERR_REASON(ASN1_R_DECODE_ERROR), "decode error"},
+ {ERR_REASON(ASN1_R_DEPTH_EXCEEDED), "depth exceeded"},
+ {ERR_REASON(ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED),
+ "digest and key type not supported"},
+ {ERR_REASON(ASN1_R_ENCODE_ERROR), "encode error"},
+ {ERR_REASON(ASN1_R_ERROR_GETTING_TIME), "error getting time"},
+ {ERR_REASON(ASN1_R_ERROR_LOADING_SECTION), "error loading section"},
+ {ERR_REASON(ASN1_R_ERROR_SETTING_CIPHER_PARAMS),
+ "error setting cipher params"},
+ {ERR_REASON(ASN1_R_EXPECTING_AN_INTEGER), "expecting an integer"},
+ {ERR_REASON(ASN1_R_EXPECTING_AN_OBJECT), "expecting an object"},
+ {ERR_REASON(ASN1_R_EXPLICIT_LENGTH_MISMATCH), "explicit length mismatch"},
+ {ERR_REASON(ASN1_R_EXPLICIT_TAG_NOT_CONSTRUCTED),
+ "explicit tag not constructed"},
+ {ERR_REASON(ASN1_R_FIELD_MISSING), "field missing"},
+ {ERR_REASON(ASN1_R_FIRST_NUM_TOO_LARGE), "first num too large"},
+ {ERR_REASON(ASN1_R_HEADER_TOO_LONG), "header too long"},
+ {ERR_REASON(ASN1_R_ILLEGAL_BITSTRING_FORMAT), "illegal bitstring format"},
+ {ERR_REASON(ASN1_R_ILLEGAL_BOOLEAN), "illegal boolean"},
+ {ERR_REASON(ASN1_R_ILLEGAL_CHARACTERS), "illegal characters"},
+ {ERR_REASON(ASN1_R_ILLEGAL_FORMAT), "illegal format"},
+ {ERR_REASON(ASN1_R_ILLEGAL_HEX), "illegal hex"},
+ {ERR_REASON(ASN1_R_ILLEGAL_IMPLICIT_TAG), "illegal implicit tag"},
+ {ERR_REASON(ASN1_R_ILLEGAL_INTEGER), "illegal integer"},
+ {ERR_REASON(ASN1_R_ILLEGAL_NEGATIVE_VALUE), "illegal negative value"},
+ {ERR_REASON(ASN1_R_ILLEGAL_NESTED_TAGGING), "illegal nested tagging"},
+ {ERR_REASON(ASN1_R_ILLEGAL_NULL), "illegal null"},
+ {ERR_REASON(ASN1_R_ILLEGAL_NULL_VALUE), "illegal null value"},
+ {ERR_REASON(ASN1_R_ILLEGAL_OBJECT), "illegal object"},
+ {ERR_REASON(ASN1_R_ILLEGAL_OPTIONAL_ANY), "illegal optional any"},
+ {ERR_REASON(ASN1_R_ILLEGAL_OPTIONS_ON_ITEM_TEMPLATE),
+ "illegal options on item template"},
+ {ERR_REASON(ASN1_R_ILLEGAL_PADDING), "illegal padding"},
+ {ERR_REASON(ASN1_R_ILLEGAL_TAGGED_ANY), "illegal tagged any"},
+ {ERR_REASON(ASN1_R_ILLEGAL_TIME_VALUE), "illegal time value"},
+ {ERR_REASON(ASN1_R_ILLEGAL_ZERO_CONTENT), "illegal zero content"},
+ {ERR_REASON(ASN1_R_INTEGER_NOT_ASCII_FORMAT), "integer not ascii format"},
+ {ERR_REASON(ASN1_R_INTEGER_TOO_LARGE_FOR_LONG),
+ "integer too large for long"},
+ {ERR_REASON(ASN1_R_INVALID_BIT_STRING_BITS_LEFT),
+ "invalid bit string bits left"},
+ {ERR_REASON(ASN1_R_INVALID_BMPSTRING_LENGTH), "invalid bmpstring length"},
+ {ERR_REASON(ASN1_R_INVALID_DIGIT), "invalid digit"},
+ {ERR_REASON(ASN1_R_INVALID_MIME_TYPE), "invalid mime type"},
+ {ERR_REASON(ASN1_R_INVALID_MODIFIER), "invalid modifier"},
+ {ERR_REASON(ASN1_R_INVALID_NUMBER), "invalid number"},
+ {ERR_REASON(ASN1_R_INVALID_OBJECT_ENCODING), "invalid object encoding"},
+ {ERR_REASON(ASN1_R_INVALID_SCRYPT_PARAMETERS),
+ "invalid scrypt parameters"},
+ {ERR_REASON(ASN1_R_INVALID_SEPARATOR), "invalid separator"},
+ {ERR_REASON(ASN1_R_INVALID_STRING_TABLE_VALUE),
+ "invalid string table value"},
+ {ERR_REASON(ASN1_R_INVALID_UNIVERSALSTRING_LENGTH),
+ "invalid universalstring length"},
+ {ERR_REASON(ASN1_R_INVALID_UTF8STRING), "invalid utf8string"},
+ {ERR_REASON(ASN1_R_INVALID_VALUE), "invalid value"},
+ {ERR_REASON(ASN1_R_LIST_ERROR), "list error"},
+ {ERR_REASON(ASN1_R_MIME_NO_CONTENT_TYPE), "mime no content type"},
+ {ERR_REASON(ASN1_R_MIME_PARSE_ERROR), "mime parse error"},
+ {ERR_REASON(ASN1_R_MIME_SIG_PARSE_ERROR), "mime sig parse error"},
+ {ERR_REASON(ASN1_R_MISSING_EOC), "missing eoc"},
+ {ERR_REASON(ASN1_R_MISSING_SECOND_NUMBER), "missing second number"},
+ {ERR_REASON(ASN1_R_MISSING_VALUE), "missing value"},
+ {ERR_REASON(ASN1_R_MSTRING_NOT_UNIVERSAL), "mstring not universal"},
+ {ERR_REASON(ASN1_R_MSTRING_WRONG_TAG), "mstring wrong tag"},
+ {ERR_REASON(ASN1_R_NESTED_ASN1_STRING), "nested asn1 string"},
+ {ERR_REASON(ASN1_R_NESTED_TOO_DEEP), "nested too deep"},
+ {ERR_REASON(ASN1_R_NON_HEX_CHARACTERS), "non hex characters"},
+ {ERR_REASON(ASN1_R_NOT_ASCII_FORMAT), "not ascii format"},
+ {ERR_REASON(ASN1_R_NOT_ENOUGH_DATA), "not enough data"},
+ {ERR_REASON(ASN1_R_NO_CONTENT_TYPE), "no content type"},
+ {ERR_REASON(ASN1_R_NO_MATCHING_CHOICE_TYPE), "no matching choice type"},
+ {ERR_REASON(ASN1_R_NO_MULTIPART_BODY_FAILURE),
+ "no multipart body failure"},
+ {ERR_REASON(ASN1_R_NO_MULTIPART_BOUNDARY), "no multipart boundary"},
+ {ERR_REASON(ASN1_R_NO_SIG_CONTENT_TYPE), "no sig content type"},
+ {ERR_REASON(ASN1_R_NULL_IS_WRONG_LENGTH), "null is wrong length"},
+ {ERR_REASON(ASN1_R_OBJECT_NOT_ASCII_FORMAT), "object not ascii format"},
+ {ERR_REASON(ASN1_R_ODD_NUMBER_OF_CHARS), "odd number of chars"},
+ {ERR_REASON(ASN1_R_SECOND_NUMBER_TOO_LARGE), "second number too large"},
+ {ERR_REASON(ASN1_R_SEQUENCE_LENGTH_MISMATCH), "sequence length mismatch"},
+ {ERR_REASON(ASN1_R_SEQUENCE_NOT_CONSTRUCTED), "sequence not constructed"},
+ {ERR_REASON(ASN1_R_SEQUENCE_OR_SET_NEEDS_CONFIG),
+ "sequence or set needs config"},
+ {ERR_REASON(ASN1_R_SHORT_LINE), "short line"},
+ {ERR_REASON(ASN1_R_SIG_INVALID_MIME_TYPE), "sig invalid mime type"},
+ {ERR_REASON(ASN1_R_STREAMING_NOT_SUPPORTED), "streaming not supported"},
+ {ERR_REASON(ASN1_R_STRING_TOO_LONG), "string too long"},
+ {ERR_REASON(ASN1_R_STRING_TOO_SHORT), "string too short"},
+ {ERR_REASON(ASN1_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD),
+ "the asn1 object identifier is not known for this md"},
+ {ERR_REASON(ASN1_R_TIME_NOT_ASCII_FORMAT), "time not ascii format"},
+ {ERR_REASON(ASN1_R_TOO_LARGE), "too large"},
+ {ERR_REASON(ASN1_R_TOO_LONG), "too long"},
+ {ERR_REASON(ASN1_R_TOO_SMALL), "too small"},
+ {ERR_REASON(ASN1_R_TYPE_NOT_CONSTRUCTED), "type not constructed"},
+ {ERR_REASON(ASN1_R_TYPE_NOT_PRIMITIVE), "type not primitive"},
+ {ERR_REASON(ASN1_R_UNEXPECTED_EOC), "unexpected eoc"},
+ {ERR_REASON(ASN1_R_UNIVERSALSTRING_IS_WRONG_LENGTH),
+ "universalstring is wrong length"},
+ {ERR_REASON(ASN1_R_UNKNOWN_FORMAT), "unknown format"},
+ {ERR_REASON(ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM),
+ "unknown message digest algorithm"},
+ {ERR_REASON(ASN1_R_UNKNOWN_OBJECT_TYPE), "unknown object type"},
+ {ERR_REASON(ASN1_R_UNKNOWN_PUBLIC_KEY_TYPE), "unknown public key type"},
+ {ERR_REASON(ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM),
+ "unknown signature algorithm"},
+ {ERR_REASON(ASN1_R_UNKNOWN_TAG), "unknown tag"},
+ {ERR_REASON(ASN1_R_UNSUPPORTED_ANY_DEFINED_BY_TYPE),
+ "unsupported any defined by type"},
+ {ERR_REASON(ASN1_R_UNSUPPORTED_PUBLIC_KEY_TYPE),
+ "unsupported public key type"},
+ {ERR_REASON(ASN1_R_UNSUPPORTED_TYPE), "unsupported type"},
+ {ERR_REASON(ASN1_R_WRONG_INTEGER_TYPE), "wrong integer type"},
+ {ERR_REASON(ASN1_R_WRONG_PUBLIC_KEY_TYPE), "wrong public key type"},
+ {ERR_REASON(ASN1_R_WRONG_TAG), "wrong tag"},
+ {0, NULL}
+};
+
+#endif
+
+int ERR_load_ASN1_strings(void)
+{
+#ifndef OPENSSL_NO_ERR
+
+ if (ERR_func_error_string(ASN1_str_functs[0].error) == NULL) {
+ ERR_load_strings(0, ASN1_str_functs);
+ ERR_load_strings(0, ASN1_str_reasons);
+ }
+#endif
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/asn1_gen.c b/openssl-1.1.0h/crypto/asn1/asn1_gen.c
new file mode 100644
index 0000000..493a693
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/asn1_gen.c
@@ -0,0 +1,789 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+#include <openssl/x509v3.h>
+
+#define ASN1_GEN_FLAG 0x10000
+#define ASN1_GEN_FLAG_IMP (ASN1_GEN_FLAG|1)
+#define ASN1_GEN_FLAG_EXP (ASN1_GEN_FLAG|2)
+#define ASN1_GEN_FLAG_TAG (ASN1_GEN_FLAG|3)
+#define ASN1_GEN_FLAG_BITWRAP (ASN1_GEN_FLAG|4)
+#define ASN1_GEN_FLAG_OCTWRAP (ASN1_GEN_FLAG|5)
+#define ASN1_GEN_FLAG_SEQWRAP (ASN1_GEN_FLAG|6)
+#define ASN1_GEN_FLAG_SETWRAP (ASN1_GEN_FLAG|7)
+#define ASN1_GEN_FLAG_FORMAT (ASN1_GEN_FLAG|8)
+
+#define ASN1_GEN_STR(str,val) {str, sizeof(str) - 1, val}
+
+#define ASN1_FLAG_EXP_MAX 20
+/* Maximum number of nested sequences */
+#define ASN1_GEN_SEQ_MAX_DEPTH 50
+
+/* Input formats */
+
+/* ASCII: default */
+#define ASN1_GEN_FORMAT_ASCII 1
+/* UTF8 */
+#define ASN1_GEN_FORMAT_UTF8 2
+/* Hex */
+#define ASN1_GEN_FORMAT_HEX 3
+/* List of bits */
+#define ASN1_GEN_FORMAT_BITLIST 4
+
+struct tag_name_st {
+ const char *strnam;
+ int len;
+ int tag;
+};
+
+typedef struct {
+ int exp_tag;
+ int exp_class;
+ int exp_constructed;
+ int exp_pad;
+ long exp_len;
+} tag_exp_type;
+
+typedef struct {
+ int imp_tag;
+ int imp_class;
+ int utype;
+ int format;
+ const char *str;
+ tag_exp_type exp_list[ASN1_FLAG_EXP_MAX];
+ int exp_count;
+} tag_exp_arg;
+
+static ASN1_TYPE *generate_v3(const char *str, X509V3_CTX *cnf, int depth,
+ int *perr);
+static int bitstr_cb(const char *elem, int len, void *bitstr);
+static int asn1_cb(const char *elem, int len, void *bitstr);
+static int append_exp(tag_exp_arg *arg, int exp_tag, int exp_class,
+ int exp_constructed, int exp_pad, int imp_ok);
+static int parse_tagging(const char *vstart, int vlen, int *ptag,
+ int *pclass);
+static ASN1_TYPE *asn1_multi(int utype, const char *section, X509V3_CTX *cnf,
+ int depth, int *perr);
+static ASN1_TYPE *asn1_str2type(const char *str, int format, int utype);
+static int asn1_str2tag(const char *tagstr, int len);
+
+ASN1_TYPE *ASN1_generate_nconf(const char *str, CONF *nconf)
+{
+ X509V3_CTX cnf;
+
+ if (!nconf)
+ return ASN1_generate_v3(str, NULL);
+
+ X509V3_set_nconf(&cnf, nconf);
+ return ASN1_generate_v3(str, &cnf);
+}
+
+ASN1_TYPE *ASN1_generate_v3(const char *str, X509V3_CTX *cnf)
+{
+ int err = 0;
+ ASN1_TYPE *ret = generate_v3(str, cnf, 0, &err);
+ if (err)
+ ASN1err(ASN1_F_ASN1_GENERATE_V3, err);
+ return ret;
+}
+
+static ASN1_TYPE *generate_v3(const char *str, X509V3_CTX *cnf, int depth,
+ int *perr)
+{
+ ASN1_TYPE *ret;
+ tag_exp_arg asn1_tags;
+ tag_exp_type *etmp;
+
+ int i, len;
+
+ unsigned char *orig_der = NULL, *new_der = NULL;
+ const unsigned char *cpy_start;
+ unsigned char *p;
+ const unsigned char *cp;
+ int cpy_len;
+ long hdr_len = 0;
+ int hdr_constructed = 0, hdr_tag, hdr_class;
+ int r;
+
+ asn1_tags.imp_tag = -1;
+ asn1_tags.imp_class = -1;
+ asn1_tags.format = ASN1_GEN_FORMAT_ASCII;
+ asn1_tags.exp_count = 0;
+ if (CONF_parse_list(str, ',', 1, asn1_cb, &asn1_tags) != 0) {
+ *perr = ASN1_R_UNKNOWN_TAG;
+ return NULL;
+ }
+
+ if ((asn1_tags.utype == V_ASN1_SEQUENCE)
+ || (asn1_tags.utype == V_ASN1_SET)) {
+ if (!cnf) {
+ *perr = ASN1_R_SEQUENCE_OR_SET_NEEDS_CONFIG;
+ return NULL;
+ }
+ if (depth >= ASN1_GEN_SEQ_MAX_DEPTH) {
+ *perr = ASN1_R_ILLEGAL_NESTED_TAGGING;
+ return NULL;
+ }
+ ret = asn1_multi(asn1_tags.utype, asn1_tags.str, cnf, depth, perr);
+ } else
+ ret = asn1_str2type(asn1_tags.str, asn1_tags.format, asn1_tags.utype);
+
+ if (!ret)
+ return NULL;
+
+ /* If no tagging return base type */
+ if ((asn1_tags.imp_tag == -1) && (asn1_tags.exp_count == 0))
+ return ret;
+
+ /* Generate the encoding */
+ cpy_len = i2d_ASN1_TYPE(ret, &orig_der);
+ ASN1_TYPE_free(ret);
+ ret = NULL;
+ /* Set point to start copying for modified encoding */
+ cpy_start = orig_der;
+
+ /* Do we need IMPLICIT tagging? */
+ if (asn1_tags.imp_tag != -1) {
+ /* If IMPLICIT we will replace the underlying tag */
+ /* Skip existing tag+len */
+ r = ASN1_get_object(&cpy_start, &hdr_len, &hdr_tag, &hdr_class,
+ cpy_len);
+ if (r & 0x80)
+ goto err;
+ /* Update copy length */
+ cpy_len -= cpy_start - orig_der;
+ /*
+ * For IMPLICIT tagging the length should match the original length
+ * and constructed flag should be consistent.
+ */
+ if (r & 0x1) {
+ /* Indefinite length constructed */
+ hdr_constructed = 2;
+ hdr_len = 0;
+ } else
+ /* Just retain constructed flag */
+ hdr_constructed = r & V_ASN1_CONSTRUCTED;
+ /*
+ * Work out new length with IMPLICIT tag: ignore constructed because
+ * it will mess up if indefinite length
+ */
+ len = ASN1_object_size(0, hdr_len, asn1_tags.imp_tag);
+ } else
+ len = cpy_len;
+
+ /* Work out length in any EXPLICIT, starting from end */
+
+ for (i = 0, etmp = asn1_tags.exp_list + asn1_tags.exp_count - 1;
+ i < asn1_tags.exp_count; i++, etmp--) {
+ /* Content length: number of content octets + any padding */
+ len += etmp->exp_pad;
+ etmp->exp_len = len;
+ /* Total object length: length including new header */
+ len = ASN1_object_size(0, len, etmp->exp_tag);
+ }
+
+ /* Allocate buffer for new encoding */
+
+ new_der = OPENSSL_malloc(len);
+ if (new_der == NULL)
+ goto err;
+
+ /* Generate tagged encoding */
+
+ p = new_der;
+
+ /* Output explicit tags first */
+
+ for (i = 0, etmp = asn1_tags.exp_list; i < asn1_tags.exp_count;
+ i++, etmp++) {
+ ASN1_put_object(&p, etmp->exp_constructed, etmp->exp_len,
+ etmp->exp_tag, etmp->exp_class);
+ if (etmp->exp_pad)
+ *p++ = 0;
+ }
+
+ /* If IMPLICIT, output tag */
+
+ if (asn1_tags.imp_tag != -1) {
+ if (asn1_tags.imp_class == V_ASN1_UNIVERSAL
+ && (asn1_tags.imp_tag == V_ASN1_SEQUENCE
+ || asn1_tags.imp_tag == V_ASN1_SET))
+ hdr_constructed = V_ASN1_CONSTRUCTED;
+ ASN1_put_object(&p, hdr_constructed, hdr_len,
+ asn1_tags.imp_tag, asn1_tags.imp_class);
+ }
+
+ /* Copy across original encoding */
+ memcpy(p, cpy_start, cpy_len);
+
+ cp = new_der;
+
+ /* Obtain new ASN1_TYPE structure */
+ ret = d2i_ASN1_TYPE(NULL, &cp, len);
+
+ err:
+ OPENSSL_free(orig_der);
+ OPENSSL_free(new_der);
+
+ return ret;
+
+}
+
+static int asn1_cb(const char *elem, int len, void *bitstr)
+{
+ tag_exp_arg *arg = bitstr;
+ int i;
+ int utype;
+ int vlen = 0;
+ const char *p, *vstart = NULL;
+
+ int tmp_tag, tmp_class;
+
+ if (elem == NULL)
+ return -1;
+
+ for (i = 0, p = elem; i < len; p++, i++) {
+ /* Look for the ':' in name value pairs */
+ if (*p == ':') {
+ vstart = p + 1;
+ vlen = len - (vstart - elem);
+ len = p - elem;
+ break;
+ }
+ }
+
+ utype = asn1_str2tag(elem, len);
+
+ if (utype == -1) {
+ ASN1err(ASN1_F_ASN1_CB, ASN1_R_UNKNOWN_TAG);
+ ERR_add_error_data(2, "tag=", elem);
+ return -1;
+ }
+
+ /* If this is not a modifier mark end of string and exit */
+ if (!(utype & ASN1_GEN_FLAG)) {
+ arg->utype = utype;
+ arg->str = vstart;
+ /* If no value and not end of string, error */
+ if (!vstart && elem[len]) {
+ ASN1err(ASN1_F_ASN1_CB, ASN1_R_MISSING_VALUE);
+ return -1;
+ }
+ return 0;
+ }
+
+ switch (utype) {
+
+ case ASN1_GEN_FLAG_IMP:
+ /* Check for illegal multiple IMPLICIT tagging */
+ if (arg->imp_tag != -1) {
+ ASN1err(ASN1_F_ASN1_CB, ASN1_R_ILLEGAL_NESTED_TAGGING);
+ return -1;
+ }
+ if (!parse_tagging(vstart, vlen, &arg->imp_tag, &arg->imp_class))
+ return -1;
+ break;
+
+ case ASN1_GEN_FLAG_EXP:
+
+ if (!parse_tagging(vstart, vlen, &tmp_tag, &tmp_class))
+ return -1;
+ if (!append_exp(arg, tmp_tag, tmp_class, 1, 0, 0))
+ return -1;
+ break;
+
+ case ASN1_GEN_FLAG_SEQWRAP:
+ if (!append_exp(arg, V_ASN1_SEQUENCE, V_ASN1_UNIVERSAL, 1, 0, 1))
+ return -1;
+ break;
+
+ case ASN1_GEN_FLAG_SETWRAP:
+ if (!append_exp(arg, V_ASN1_SET, V_ASN1_UNIVERSAL, 1, 0, 1))
+ return -1;
+ break;
+
+ case ASN1_GEN_FLAG_BITWRAP:
+ if (!append_exp(arg, V_ASN1_BIT_STRING, V_ASN1_UNIVERSAL, 0, 1, 1))
+ return -1;
+ break;
+
+ case ASN1_GEN_FLAG_OCTWRAP:
+ if (!append_exp(arg, V_ASN1_OCTET_STRING, V_ASN1_UNIVERSAL, 0, 0, 1))
+ return -1;
+ break;
+
+ case ASN1_GEN_FLAG_FORMAT:
+ if (!vstart) {
+ ASN1err(ASN1_F_ASN1_CB, ASN1_R_UNKNOWN_FORMAT);
+ return -1;
+ }
+ if (strncmp(vstart, "ASCII", 5) == 0)
+ arg->format = ASN1_GEN_FORMAT_ASCII;
+ else if (strncmp(vstart, "UTF8", 4) == 0)
+ arg->format = ASN1_GEN_FORMAT_UTF8;
+ else if (strncmp(vstart, "HEX", 3) == 0)
+ arg->format = ASN1_GEN_FORMAT_HEX;
+ else if (strncmp(vstart, "BITLIST", 7) == 0)
+ arg->format = ASN1_GEN_FORMAT_BITLIST;
+ else {
+ ASN1err(ASN1_F_ASN1_CB, ASN1_R_UNKNOWN_FORMAT);
+ return -1;
+ }
+ break;
+
+ }
+
+ return 1;
+
+}
+
+static int parse_tagging(const char *vstart, int vlen, int *ptag, int *pclass)
+{
+ char erch[2];
+ long tag_num;
+ char *eptr;
+ if (!vstart)
+ return 0;
+ tag_num = strtoul(vstart, &eptr, 10);
+ /* Check we haven't gone past max length: should be impossible */
+ if (eptr && *eptr && (eptr > vstart + vlen))
+ return 0;
+ if (tag_num < 0) {
+ ASN1err(ASN1_F_PARSE_TAGGING, ASN1_R_INVALID_NUMBER);
+ return 0;
+ }
+ *ptag = tag_num;
+ /* If we have non numeric characters, parse them */
+ if (eptr)
+ vlen -= eptr - vstart;
+ else
+ vlen = 0;
+ if (vlen) {
+ switch (*eptr) {
+
+ case 'U':
+ *pclass = V_ASN1_UNIVERSAL;
+ break;
+
+ case 'A':
+ *pclass = V_ASN1_APPLICATION;
+ break;
+
+ case 'P':
+ *pclass = V_ASN1_PRIVATE;
+ break;
+
+ case 'C':
+ *pclass = V_ASN1_CONTEXT_SPECIFIC;
+ break;
+
+ default:
+ erch[0] = *eptr;
+ erch[1] = 0;
+ ASN1err(ASN1_F_PARSE_TAGGING, ASN1_R_INVALID_MODIFIER);
+ ERR_add_error_data(2, "Char=", erch);
+ return 0;
+
+ }
+ } else
+ *pclass = V_ASN1_CONTEXT_SPECIFIC;
+
+ return 1;
+
+}
+
+/* Handle multiple types: SET and SEQUENCE */
+
+static ASN1_TYPE *asn1_multi(int utype, const char *section, X509V3_CTX *cnf,
+ int depth, int *perr)
+{
+ ASN1_TYPE *ret = NULL;
+ STACK_OF(ASN1_TYPE) *sk = NULL;
+ STACK_OF(CONF_VALUE) *sect = NULL;
+ unsigned char *der = NULL;
+ int derlen;
+ int i;
+ sk = sk_ASN1_TYPE_new_null();
+ if (!sk)
+ goto bad;
+ if (section) {
+ if (!cnf)
+ goto bad;
+ sect = X509V3_get_section(cnf, (char *)section);
+ if (!sect)
+ goto bad;
+ for (i = 0; i < sk_CONF_VALUE_num(sect); i++) {
+ ASN1_TYPE *typ =
+ generate_v3(sk_CONF_VALUE_value(sect, i)->value, cnf,
+ depth + 1, perr);
+ if (!typ)
+ goto bad;
+ if (!sk_ASN1_TYPE_push(sk, typ))
+ goto bad;
+ }
+ }
+
+ /*
+ * Now we has a STACK of the components, convert to the correct form
+ */
+
+ if (utype == V_ASN1_SET)
+ derlen = i2d_ASN1_SET_ANY(sk, &der);
+ else
+ derlen = i2d_ASN1_SEQUENCE_ANY(sk, &der);
+
+ if (derlen < 0)
+ goto bad;
+ if ((ret = ASN1_TYPE_new()) == NULL)
+ goto bad;
+ if ((ret->value.asn1_string = ASN1_STRING_type_new(utype)) == NULL)
+ goto bad;
+
+ ret->type = utype;
+ ret->value.asn1_string->data = der;
+ ret->value.asn1_string->length = derlen;
+
+ der = NULL;
+
+ bad:
+
+ OPENSSL_free(der);
+
+ sk_ASN1_TYPE_pop_free(sk, ASN1_TYPE_free);
+ X509V3_section_free(cnf, sect);
+
+ return ret;
+}
+
+static int append_exp(tag_exp_arg *arg, int exp_tag, int exp_class,
+ int exp_constructed, int exp_pad, int imp_ok)
+{
+ tag_exp_type *exp_tmp;
+ /* Can only have IMPLICIT if permitted */
+ if ((arg->imp_tag != -1) && !imp_ok) {
+ ASN1err(ASN1_F_APPEND_EXP, ASN1_R_ILLEGAL_IMPLICIT_TAG);
+ return 0;
+ }
+
+ if (arg->exp_count == ASN1_FLAG_EXP_MAX) {
+ ASN1err(ASN1_F_APPEND_EXP, ASN1_R_DEPTH_EXCEEDED);
+ return 0;
+ }
+
+ exp_tmp = &arg->exp_list[arg->exp_count++];
+
+ /*
+ * If IMPLICIT set tag to implicit value then reset implicit tag since it
+ * has been used.
+ */
+ if (arg->imp_tag != -1) {
+ exp_tmp->exp_tag = arg->imp_tag;
+ exp_tmp->exp_class = arg->imp_class;
+ arg->imp_tag = -1;
+ arg->imp_class = -1;
+ } else {
+ exp_tmp->exp_tag = exp_tag;
+ exp_tmp->exp_class = exp_class;
+ }
+ exp_tmp->exp_constructed = exp_constructed;
+ exp_tmp->exp_pad = exp_pad;
+
+ return 1;
+}
+
+static int asn1_str2tag(const char *tagstr, int len)
+{
+ unsigned int i;
+ static const struct tag_name_st *tntmp, tnst[] = {
+ ASN1_GEN_STR("BOOL", V_ASN1_BOOLEAN),
+ ASN1_GEN_STR("BOOLEAN", V_ASN1_BOOLEAN),
+ ASN1_GEN_STR("NULL", V_ASN1_NULL),
+ ASN1_GEN_STR("INT", V_ASN1_INTEGER),
+ ASN1_GEN_STR("INTEGER", V_ASN1_INTEGER),
+ ASN1_GEN_STR("ENUM", V_ASN1_ENUMERATED),
+ ASN1_GEN_STR("ENUMERATED", V_ASN1_ENUMERATED),
+ ASN1_GEN_STR("OID", V_ASN1_OBJECT),
+ ASN1_GEN_STR("OBJECT", V_ASN1_OBJECT),
+ ASN1_GEN_STR("UTCTIME", V_ASN1_UTCTIME),
+ ASN1_GEN_STR("UTC", V_ASN1_UTCTIME),
+ ASN1_GEN_STR("GENERALIZEDTIME", V_ASN1_GENERALIZEDTIME),
+ ASN1_GEN_STR("GENTIME", V_ASN1_GENERALIZEDTIME),
+ ASN1_GEN_STR("OCT", V_ASN1_OCTET_STRING),
+ ASN1_GEN_STR("OCTETSTRING", V_ASN1_OCTET_STRING),
+ ASN1_GEN_STR("BITSTR", V_ASN1_BIT_STRING),
+ ASN1_GEN_STR("BITSTRING", V_ASN1_BIT_STRING),
+ ASN1_GEN_STR("UNIVERSALSTRING", V_ASN1_UNIVERSALSTRING),
+ ASN1_GEN_STR("UNIV", V_ASN1_UNIVERSALSTRING),
+ ASN1_GEN_STR("IA5", V_ASN1_IA5STRING),
+ ASN1_GEN_STR("IA5STRING", V_ASN1_IA5STRING),
+ ASN1_GEN_STR("UTF8", V_ASN1_UTF8STRING),
+ ASN1_GEN_STR("UTF8String", V_ASN1_UTF8STRING),
+ ASN1_GEN_STR("BMP", V_ASN1_BMPSTRING),
+ ASN1_GEN_STR("BMPSTRING", V_ASN1_BMPSTRING),
+ ASN1_GEN_STR("VISIBLESTRING", V_ASN1_VISIBLESTRING),
+ ASN1_GEN_STR("VISIBLE", V_ASN1_VISIBLESTRING),
+ ASN1_GEN_STR("PRINTABLESTRING", V_ASN1_PRINTABLESTRING),
+ ASN1_GEN_STR("PRINTABLE", V_ASN1_PRINTABLESTRING),
+ ASN1_GEN_STR("T61", V_ASN1_T61STRING),
+ ASN1_GEN_STR("T61STRING", V_ASN1_T61STRING),
+ ASN1_GEN_STR("TELETEXSTRING", V_ASN1_T61STRING),
+ ASN1_GEN_STR("GeneralString", V_ASN1_GENERALSTRING),
+ ASN1_GEN_STR("GENSTR", V_ASN1_GENERALSTRING),
+ ASN1_GEN_STR("NUMERIC", V_ASN1_NUMERICSTRING),
+ ASN1_GEN_STR("NUMERICSTRING", V_ASN1_NUMERICSTRING),
+
+ /* Special cases */
+ ASN1_GEN_STR("SEQUENCE", V_ASN1_SEQUENCE),
+ ASN1_GEN_STR("SEQ", V_ASN1_SEQUENCE),
+ ASN1_GEN_STR("SET", V_ASN1_SET),
+ /* type modifiers */
+ /* Explicit tag */
+ ASN1_GEN_STR("EXP", ASN1_GEN_FLAG_EXP),
+ ASN1_GEN_STR("EXPLICIT", ASN1_GEN_FLAG_EXP),
+ /* Implicit tag */
+ ASN1_GEN_STR("IMP", ASN1_GEN_FLAG_IMP),
+ ASN1_GEN_STR("IMPLICIT", ASN1_GEN_FLAG_IMP),
+ /* OCTET STRING wrapper */
+ ASN1_GEN_STR("OCTWRAP", ASN1_GEN_FLAG_OCTWRAP),
+ /* SEQUENCE wrapper */
+ ASN1_GEN_STR("SEQWRAP", ASN1_GEN_FLAG_SEQWRAP),
+ /* SET wrapper */
+ ASN1_GEN_STR("SETWRAP", ASN1_GEN_FLAG_SETWRAP),
+ /* BIT STRING wrapper */
+ ASN1_GEN_STR("BITWRAP", ASN1_GEN_FLAG_BITWRAP),
+ ASN1_GEN_STR("FORM", ASN1_GEN_FLAG_FORMAT),
+ ASN1_GEN_STR("FORMAT", ASN1_GEN_FLAG_FORMAT),
+ };
+
+ if (len == -1)
+ len = strlen(tagstr);
+
+ tntmp = tnst;
+ for (i = 0; i < OSSL_NELEM(tnst); i++, tntmp++) {
+ if ((len == tntmp->len) && (strncmp(tntmp->strnam, tagstr, len) == 0))
+ return tntmp->tag;
+ }
+
+ return -1;
+}
+
+static ASN1_TYPE *asn1_str2type(const char *str, int format, int utype)
+{
+ ASN1_TYPE *atmp = NULL;
+ CONF_VALUE vtmp;
+ unsigned char *rdata;
+ long rdlen;
+ int no_unused = 1;
+
+ if ((atmp = ASN1_TYPE_new()) == NULL) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ if (!str)
+ str = "";
+
+ switch (utype) {
+
+ case V_ASN1_NULL:
+ if (str && *str) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_ILLEGAL_NULL_VALUE);
+ goto bad_form;
+ }
+ break;
+
+ case V_ASN1_BOOLEAN:
+ if (format != ASN1_GEN_FORMAT_ASCII) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_NOT_ASCII_FORMAT);
+ goto bad_form;
+ }
+ vtmp.name = NULL;
+ vtmp.section = NULL;
+ vtmp.value = (char *)str;
+ if (!X509V3_get_value_bool(&vtmp, &atmp->value.boolean)) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_ILLEGAL_BOOLEAN);
+ goto bad_str;
+ }
+ break;
+
+ case V_ASN1_INTEGER:
+ case V_ASN1_ENUMERATED:
+ if (format != ASN1_GEN_FORMAT_ASCII) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_INTEGER_NOT_ASCII_FORMAT);
+ goto bad_form;
+ }
+ if ((atmp->value.integer
+ = s2i_ASN1_INTEGER(NULL, str)) == NULL) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_ILLEGAL_INTEGER);
+ goto bad_str;
+ }
+ break;
+
+ case V_ASN1_OBJECT:
+ if (format != ASN1_GEN_FORMAT_ASCII) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_OBJECT_NOT_ASCII_FORMAT);
+ goto bad_form;
+ }
+ if ((atmp->value.object = OBJ_txt2obj(str, 0)) == NULL) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_ILLEGAL_OBJECT);
+ goto bad_str;
+ }
+ break;
+
+ case V_ASN1_UTCTIME:
+ case V_ASN1_GENERALIZEDTIME:
+ if (format != ASN1_GEN_FORMAT_ASCII) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_TIME_NOT_ASCII_FORMAT);
+ goto bad_form;
+ }
+ if ((atmp->value.asn1_string = ASN1_STRING_new()) == NULL) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ERR_R_MALLOC_FAILURE);
+ goto bad_str;
+ }
+ if (!ASN1_STRING_set(atmp->value.asn1_string, str, -1)) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ERR_R_MALLOC_FAILURE);
+ goto bad_str;
+ }
+ atmp->value.asn1_string->type = utype;
+ if (!ASN1_TIME_check(atmp->value.asn1_string)) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_ILLEGAL_TIME_VALUE);
+ goto bad_str;
+ }
+
+ break;
+
+ case V_ASN1_BMPSTRING:
+ case V_ASN1_PRINTABLESTRING:
+ case V_ASN1_IA5STRING:
+ case V_ASN1_T61STRING:
+ case V_ASN1_UTF8STRING:
+ case V_ASN1_VISIBLESTRING:
+ case V_ASN1_UNIVERSALSTRING:
+ case V_ASN1_GENERALSTRING:
+ case V_ASN1_NUMERICSTRING:
+ if (format == ASN1_GEN_FORMAT_ASCII)
+ format = MBSTRING_ASC;
+ else if (format == ASN1_GEN_FORMAT_UTF8)
+ format = MBSTRING_UTF8;
+ else {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_ILLEGAL_FORMAT);
+ goto bad_form;
+ }
+
+ if (ASN1_mbstring_copy(&atmp->value.asn1_string, (unsigned char *)str,
+ -1, format, ASN1_tag2bit(utype)) <= 0) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ERR_R_MALLOC_FAILURE);
+ goto bad_str;
+ }
+
+ break;
+
+ case V_ASN1_BIT_STRING:
+ case V_ASN1_OCTET_STRING:
+ if ((atmp->value.asn1_string = ASN1_STRING_new()) == NULL) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ERR_R_MALLOC_FAILURE);
+ goto bad_form;
+ }
+
+ if (format == ASN1_GEN_FORMAT_HEX) {
+ if ((rdata = OPENSSL_hexstr2buf(str, &rdlen)) == NULL) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_ILLEGAL_HEX);
+ goto bad_str;
+ }
+ atmp->value.asn1_string->data = rdata;
+ atmp->value.asn1_string->length = rdlen;
+ atmp->value.asn1_string->type = utype;
+ } else if (format == ASN1_GEN_FORMAT_ASCII)
+ ASN1_STRING_set(atmp->value.asn1_string, str, -1);
+ else if ((format == ASN1_GEN_FORMAT_BITLIST)
+ && (utype == V_ASN1_BIT_STRING)) {
+ if (!CONF_parse_list
+ (str, ',', 1, bitstr_cb, atmp->value.bit_string)) {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_LIST_ERROR);
+ goto bad_str;
+ }
+ no_unused = 0;
+
+ } else {
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_ILLEGAL_BITSTRING_FORMAT);
+ goto bad_form;
+ }
+
+ if ((utype == V_ASN1_BIT_STRING) && no_unused) {
+ atmp->value.asn1_string->flags
+ &= ~(ASN1_STRING_FLAG_BITS_LEFT | 0x07);
+ atmp->value.asn1_string->flags |= ASN1_STRING_FLAG_BITS_LEFT;
+ }
+
+ break;
+
+ default:
+ ASN1err(ASN1_F_ASN1_STR2TYPE, ASN1_R_UNSUPPORTED_TYPE);
+ goto bad_str;
+ }
+
+ atmp->type = utype;
+ return atmp;
+
+ bad_str:
+ ERR_add_error_data(2, "string=", str);
+ bad_form:
+
+ ASN1_TYPE_free(atmp);
+ return NULL;
+
+}
+
+static int bitstr_cb(const char *elem, int len, void *bitstr)
+{
+ long bitnum;
+ char *eptr;
+ if (!elem)
+ return 0;
+ bitnum = strtoul(elem, &eptr, 10);
+ if (eptr && *eptr && (eptr != elem + len))
+ return 0;
+ if (bitnum < 0) {
+ ASN1err(ASN1_F_BITSTR_CB, ASN1_R_INVALID_NUMBER);
+ return 0;
+ }
+ if (!ASN1_BIT_STRING_set_bit(bitstr, bitnum, 1)) {
+ ASN1err(ASN1_F_BITSTR_CB, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ return 1;
+}
+
+static int mask_cb(const char *elem, int len, void *arg)
+{
+ unsigned long *pmask = arg, tmpmask;
+ int tag;
+ if (elem == NULL)
+ return 0;
+ if ((len == 3) && (strncmp(elem, "DIR", 3) == 0)) {
+ *pmask |= B_ASN1_DIRECTORYSTRING;
+ return 1;
+ }
+ tag = asn1_str2tag(elem, len);
+ if (!tag || (tag & ASN1_GEN_FLAG))
+ return 0;
+ tmpmask = ASN1_tag2bit(tag);
+ if (!tmpmask)
+ return 0;
+ *pmask |= tmpmask;
+ return 1;
+}
+
+int ASN1_str2mask(const char *str, unsigned long *pmask)
+{
+ *pmask = 0;
+ return CONF_parse_list(str, '|', 1, mask_cb, pmask);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/asn1_lib.c b/openssl-1.1.0h/crypto/asn1/asn1_lib.c
new file mode 100644
index 0000000..8ca53b4
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/asn1_lib.c
@@ -0,0 +1,384 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <limits.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+#include "asn1_locl.h"
+
+static int asn1_get_length(const unsigned char **pp, int *inf, long *rl,
+ long max);
+static void asn1_put_length(unsigned char **pp, int length);
+
+static int _asn1_check_infinite_end(const unsigned char **p, long len)
+{
+ /*
+ * If there is 0 or 1 byte left, the length check should pick things up
+ */
+ if (len <= 0)
+ return (1);
+ else if ((len >= 2) && ((*p)[0] == 0) && ((*p)[1] == 0)) {
+ (*p) += 2;
+ return (1);
+ }
+ return (0);
+}
+
+int ASN1_check_infinite_end(unsigned char **p, long len)
+{
+ return _asn1_check_infinite_end((const unsigned char **)p, len);
+}
+
+int ASN1_const_check_infinite_end(const unsigned char **p, long len)
+{
+ return _asn1_check_infinite_end(p, len);
+}
+
+int ASN1_get_object(const unsigned char **pp, long *plength, int *ptag,
+ int *pclass, long omax)
+{
+ int i, ret;
+ long l;
+ const unsigned char *p = *pp;
+ int tag, xclass, inf;
+ long max = omax;
+
+ if (!max)
+ goto err;
+ ret = (*p & V_ASN1_CONSTRUCTED);
+ xclass = (*p & V_ASN1_PRIVATE);
+ i = *p & V_ASN1_PRIMITIVE_TAG;
+ if (i == V_ASN1_PRIMITIVE_TAG) { /* high-tag */
+ p++;
+ if (--max == 0)
+ goto err;
+ l = 0;
+ while (*p & 0x80) {
+ l <<= 7L;
+ l |= *(p++) & 0x7f;
+ if (--max == 0)
+ goto err;
+ if (l > (INT_MAX >> 7L))
+ goto err;
+ }
+ l <<= 7L;
+ l |= *(p++) & 0x7f;
+ tag = (int)l;
+ if (--max == 0)
+ goto err;
+ } else {
+ tag = i;
+ p++;
+ if (--max == 0)
+ goto err;
+ }
+ *ptag = tag;
+ *pclass = xclass;
+ if (!asn1_get_length(&p, &inf, plength, max))
+ goto err;
+
+ if (inf && !(ret & V_ASN1_CONSTRUCTED))
+ goto err;
+
+ if (*plength > (omax - (p - *pp))) {
+ ASN1err(ASN1_F_ASN1_GET_OBJECT, ASN1_R_TOO_LONG);
+ /*
+ * Set this so that even if things are not long enough the values are
+ * set correctly
+ */
+ ret |= 0x80;
+ }
+ *pp = p;
+ return (ret | inf);
+ err:
+ ASN1err(ASN1_F_ASN1_GET_OBJECT, ASN1_R_HEADER_TOO_LONG);
+ return (0x80);
+}
+
+static int asn1_get_length(const unsigned char **pp, int *inf, long *rl,
+ long max)
+{
+ const unsigned char *p = *pp;
+ unsigned long ret = 0;
+ unsigned long i;
+
+ if (max-- < 1)
+ return 0;
+ if (*p == 0x80) {
+ *inf = 1;
+ ret = 0;
+ p++;
+ } else {
+ *inf = 0;
+ i = *p & 0x7f;
+ if (*(p++) & 0x80) {
+ if (max < (long)i + 1)
+ return 0;
+ /* Skip leading zeroes */
+ while (i && *p == 0) {
+ p++;
+ i--;
+ }
+ if (i > sizeof(long))
+ return 0;
+ while (i-- > 0) {
+ ret <<= 8L;
+ ret |= *(p++);
+ }
+ } else
+ ret = i;
+ }
+ if (ret > LONG_MAX)
+ return 0;
+ *pp = p;
+ *rl = (long)ret;
+ return 1;
+}
+
+/*
+ * class 0 is constructed constructed == 2 for indefinite length constructed
+ */
+void ASN1_put_object(unsigned char **pp, int constructed, int length, int tag,
+ int xclass)
+{
+ unsigned char *p = *pp;
+ int i, ttag;
+
+ i = (constructed) ? V_ASN1_CONSTRUCTED : 0;
+ i |= (xclass & V_ASN1_PRIVATE);
+ if (tag < 31)
+ *(p++) = i | (tag & V_ASN1_PRIMITIVE_TAG);
+ else {
+ *(p++) = i | V_ASN1_PRIMITIVE_TAG;
+ for (i = 0, ttag = tag; ttag > 0; i++)
+ ttag >>= 7;
+ ttag = i;
+ while (i-- > 0) {
+ p[i] = tag & 0x7f;
+ if (i != (ttag - 1))
+ p[i] |= 0x80;
+ tag >>= 7;
+ }
+ p += ttag;
+ }
+ if (constructed == 2)
+ *(p++) = 0x80;
+ else
+ asn1_put_length(&p, length);
+ *pp = p;
+}
+
+int ASN1_put_eoc(unsigned char **pp)
+{
+ unsigned char *p = *pp;
+ *p++ = 0;
+ *p++ = 0;
+ *pp = p;
+ return 2;
+}
+
+static void asn1_put_length(unsigned char **pp, int length)
+{
+ unsigned char *p = *pp;
+ int i, l;
+ if (length <= 127)
+ *(p++) = (unsigned char)length;
+ else {
+ l = length;
+ for (i = 0; l > 0; i++)
+ l >>= 8;
+ *(p++) = i | 0x80;
+ l = i;
+ while (i-- > 0) {
+ p[i] = length & 0xff;
+ length >>= 8;
+ }
+ p += l;
+ }
+ *pp = p;
+}
+
+int ASN1_object_size(int constructed, int length, int tag)
+{
+ int ret = 1;
+ if (length < 0)
+ return -1;
+ if (tag >= 31) {
+ while (tag > 0) {
+ tag >>= 7;
+ ret++;
+ }
+ }
+ if (constructed == 2) {
+ ret += 3;
+ } else {
+ ret++;
+ if (length > 127) {
+ int tmplen = length;
+ while (tmplen > 0) {
+ tmplen >>= 8;
+ ret++;
+ }
+ }
+ }
+ if (ret >= INT_MAX - length)
+ return -1;
+ return ret + length;
+}
+
+int ASN1_STRING_copy(ASN1_STRING *dst, const ASN1_STRING *str)
+{
+ if (str == NULL)
+ return 0;
+ dst->type = str->type;
+ if (!ASN1_STRING_set(dst, str->data, str->length))
+ return 0;
+ /* Copy flags but preserve embed value */
+ dst->flags &= ASN1_STRING_FLAG_EMBED;
+ dst->flags |= str->flags & ~ASN1_STRING_FLAG_EMBED;
+ return 1;
+}
+
+ASN1_STRING *ASN1_STRING_dup(const ASN1_STRING *str)
+{
+ ASN1_STRING *ret;
+ if (!str)
+ return NULL;
+ ret = ASN1_STRING_new();
+ if (ret == NULL)
+ return NULL;
+ if (!ASN1_STRING_copy(ret, str)) {
+ ASN1_STRING_free(ret);
+ return NULL;
+ }
+ return ret;
+}
+
+int ASN1_STRING_set(ASN1_STRING *str, const void *_data, int len)
+{
+ unsigned char *c;
+ const char *data = _data;
+
+ if (len < 0) {
+ if (data == NULL)
+ return (0);
+ else
+ len = strlen(data);
+ }
+ if ((str->length <= len) || (str->data == NULL)) {
+ c = str->data;
+ str->data = OPENSSL_realloc(c, len + 1);
+ if (str->data == NULL) {
+ ASN1err(ASN1_F_ASN1_STRING_SET, ERR_R_MALLOC_FAILURE);
+ str->data = c;
+ return (0);
+ }
+ }
+ str->length = len;
+ if (data != NULL) {
+ memcpy(str->data, data, len);
+ /* an allowance for strings :-) */
+ str->data[len] = '\0';
+ }
+ return (1);
+}
+
+void ASN1_STRING_set0(ASN1_STRING *str, void *data, int len)
+{
+ OPENSSL_free(str->data);
+ str->data = data;
+ str->length = len;
+}
+
+ASN1_STRING *ASN1_STRING_new(void)
+{
+ return (ASN1_STRING_type_new(V_ASN1_OCTET_STRING));
+}
+
+ASN1_STRING *ASN1_STRING_type_new(int type)
+{
+ ASN1_STRING *ret;
+
+ ret = OPENSSL_zalloc(sizeof(*ret));
+ if (ret == NULL) {
+ ASN1err(ASN1_F_ASN1_STRING_TYPE_NEW, ERR_R_MALLOC_FAILURE);
+ return (NULL);
+ }
+ ret->type = type;
+ return (ret);
+}
+
+void asn1_string_embed_free(ASN1_STRING *a, int embed)
+{
+ if (a == NULL)
+ return;
+ if (!(a->flags & ASN1_STRING_FLAG_NDEF))
+ OPENSSL_free(a->data);
+ if (embed == 0)
+ OPENSSL_free(a);
+}
+
+void ASN1_STRING_free(ASN1_STRING *a)
+{
+ if (a == NULL)
+ return;
+ asn1_string_embed_free(a, a->flags & ASN1_STRING_FLAG_EMBED);
+}
+
+void ASN1_STRING_clear_free(ASN1_STRING *a)
+{
+ if (a == NULL)
+ return;
+ if (a->data && !(a->flags & ASN1_STRING_FLAG_NDEF))
+ OPENSSL_cleanse(a->data, a->length);
+ ASN1_STRING_free(a);
+}
+
+int ASN1_STRING_cmp(const ASN1_STRING *a, const ASN1_STRING *b)
+{
+ int i;
+
+ i = (a->length - b->length);
+ if (i == 0) {
+ i = memcmp(a->data, b->data, a->length);
+ if (i == 0)
+ return (a->type - b->type);
+ else
+ return (i);
+ } else
+ return (i);
+}
+
+int ASN1_STRING_length(const ASN1_STRING *x)
+{
+ return x->length;
+}
+
+void ASN1_STRING_length_set(ASN1_STRING *x, int len)
+{
+ x->length = len;
+}
+
+int ASN1_STRING_type(const ASN1_STRING *x)
+{
+ return x->type;
+}
+
+const unsigned char *ASN1_STRING_get0_data(const ASN1_STRING *x)
+{
+ return x->data;
+}
+
+# if OPENSSL_API_COMPAT < 0x10100000L
+unsigned char *ASN1_STRING_data(ASN1_STRING *x)
+{
+ return x->data;
+}
+#endif
diff --git a/openssl-1.1.0h/crypto/asn1/asn1_locl.h b/openssl-1.1.0h/crypto/asn1/asn1_locl.h
new file mode 100644
index 0000000..9a47b1e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/asn1_locl.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2005-2017 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/* Internal ASN1 structures and functions: not for application use */
+
+int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d);
+int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d);
+
+/* ASN1 scan context structure */
+
+struct asn1_sctx_st {
+ /* The ASN1_ITEM associated with this field */
+ const ASN1_ITEM *it;
+ /* If ASN1_TEMPLATE associated with this field */
+ const ASN1_TEMPLATE *tt;
+ /* Various flags associated with field and context */
+ unsigned long flags;
+ /* If SEQUENCE OF or SET OF, field index */
+ int skidx;
+ /* ASN1 depth of field */
+ int depth;
+ /* Structure and field name */
+ const char *sname, *fname;
+ /* If a primitive type the type of underlying field */
+ int prim_type;
+ /* The field value itself */
+ ASN1_VALUE **field;
+ /* Callback to pass information to */
+ int (*scan_cb) (ASN1_SCTX *ctx);
+ /* Context specific application data */
+ void *app_data;
+} /* ASN1_SCTX */ ;
+
+typedef struct mime_param_st MIME_PARAM;
+DEFINE_STACK_OF(MIME_PARAM)
+typedef struct mime_header_st MIME_HEADER;
+DEFINE_STACK_OF(MIME_HEADER)
+
+/* Month values for printing out times */
+extern const char *_asn1_mon[12];
+
+void asn1_string_embed_free(ASN1_STRING *a, int embed);
+
+int asn1_get_choice_selector(ASN1_VALUE **pval, const ASN1_ITEM *it);
+int asn1_set_choice_selector(ASN1_VALUE **pval, int value,
+ const ASN1_ITEM *it);
+
+ASN1_VALUE **asn1_get_field_ptr(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt);
+
+const ASN1_TEMPLATE *asn1_do_adb(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt,
+ int nullerr);
+
+int asn1_do_lock(ASN1_VALUE **pval, int op, const ASN1_ITEM *it);
+
+void asn1_enc_init(ASN1_VALUE **pval, const ASN1_ITEM *it);
+void asn1_enc_free(ASN1_VALUE **pval, const ASN1_ITEM *it);
+int asn1_enc_restore(int *len, unsigned char **out, ASN1_VALUE **pval,
+ const ASN1_ITEM *it);
+int asn1_enc_save(ASN1_VALUE **pval, const unsigned char *in, int inlen,
+ const ASN1_ITEM *it);
+
+void asn1_item_embed_free(ASN1_VALUE **pval, const ASN1_ITEM *it, int embed);
+void asn1_primitive_free(ASN1_VALUE **pval, const ASN1_ITEM *it, int embed);
+void asn1_template_free(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt);
+
+ASN1_OBJECT *c2i_ASN1_OBJECT(ASN1_OBJECT **a, const unsigned char **pp,
+ long length);
+int i2c_ASN1_BIT_STRING(ASN1_BIT_STRING *a, unsigned char **pp);
+ASN1_BIT_STRING *c2i_ASN1_BIT_STRING(ASN1_BIT_STRING **a,
+ const unsigned char **pp, long length);
+int i2c_ASN1_INTEGER(ASN1_INTEGER *a, unsigned char **pp);
+ASN1_INTEGER *c2i_ASN1_INTEGER(ASN1_INTEGER **a, const unsigned char **pp,
+ long length);
+
+/* Internal functions used by x_int64.c */
+int c2i_uint64_int(uint64_t *ret, int *neg, const unsigned char **pp, long len);
+int i2c_uint64_int(unsigned char *p, uint64_t r, int neg);
diff --git a/openssl-1.1.0h/crypto/asn1/asn1_par.c b/openssl-1.1.0h/crypto/asn1/asn1_par.c
new file mode 100644
index 0000000..fabc8d6
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/asn1_par.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/buffer.h>
+#include <openssl/objects.h>
+#include <openssl/asn1.h>
+
+#ifndef ASN1_PARSE_MAXDEPTH
+#define ASN1_PARSE_MAXDEPTH 128
+#endif
+
+static int asn1_print_info(BIO *bp, int tag, int xclass, int constructed,
+ int indent);
+static int asn1_parse2(BIO *bp, const unsigned char **pp, long length,
+ int offset, int depth, int indent, int dump);
+static int asn1_print_info(BIO *bp, int tag, int xclass, int constructed,
+ int indent)
+{
+ static const char fmt[] = "%-18s";
+ char str[128];
+ const char *p;
+
+ if (constructed & V_ASN1_CONSTRUCTED)
+ p = "cons: ";
+ else
+ p = "prim: ";
+ if (BIO_write(bp, p, 6) < 6)
+ goto err;
+ BIO_indent(bp, indent, 128);
+
+ p = str;
+ if ((xclass & V_ASN1_PRIVATE) == V_ASN1_PRIVATE)
+ BIO_snprintf(str, sizeof(str), "priv [ %d ] ", tag);
+ else if ((xclass & V_ASN1_CONTEXT_SPECIFIC) == V_ASN1_CONTEXT_SPECIFIC)
+ BIO_snprintf(str, sizeof(str), "cont [ %d ]", tag);
+ else if ((xclass & V_ASN1_APPLICATION) == V_ASN1_APPLICATION)
+ BIO_snprintf(str, sizeof(str), "appl [ %d ]", tag);
+ else if (tag > 30)
+ BIO_snprintf(str, sizeof(str), "<ASN1 %d>", tag);
+ else
+ p = ASN1_tag2str(tag);
+
+ if (BIO_printf(bp, fmt, p) <= 0)
+ goto err;
+ return (1);
+ err:
+ return (0);
+}
+
+int ASN1_parse(BIO *bp, const unsigned char *pp, long len, int indent)
+{
+ return (asn1_parse2(bp, &pp, len, 0, 0, indent, 0));
+}
+
+int ASN1_parse_dump(BIO *bp, const unsigned char *pp, long len, int indent,
+ int dump)
+{
+ return (asn1_parse2(bp, &pp, len, 0, 0, indent, dump));
+}
+
+static int asn1_parse2(BIO *bp, const unsigned char **pp, long length,
+ int offset, int depth, int indent, int dump)
+{
+ const unsigned char *p, *ep, *tot, *op, *opp;
+ long len;
+ int tag, xclass, ret = 0;
+ int nl, hl, j, r;
+ ASN1_OBJECT *o = NULL;
+ ASN1_OCTET_STRING *os = NULL;
+ /* ASN1_BMPSTRING *bmp=NULL; */
+ int dump_indent, dump_cont = 0;
+
+ if (depth > ASN1_PARSE_MAXDEPTH) {
+ BIO_puts(bp, "BAD RECURSION DEPTH\n");
+ return 0;
+ }
+
+ dump_indent = 6; /* Because we know BIO_dump_indent() */
+ p = *pp;
+ tot = p + length;
+ while (length > 0) {
+ op = p;
+ j = ASN1_get_object(&p, &len, &tag, &xclass, length);
+ if (j & 0x80) {
+ if (BIO_write(bp, "Error in encoding\n", 18) <= 0)
+ goto end;
+ ret = 0;
+ goto end;
+ }
+ hl = (p - op);
+ length -= hl;
+ /*
+ * if j == 0x21 it is a constructed indefinite length object
+ */
+ if (BIO_printf(bp, "%5ld:", (long)offset + (long)(op - *pp))
+ <= 0)
+ goto end;
+
+ if (j != (V_ASN1_CONSTRUCTED | 1)) {
+ if (BIO_printf(bp, "d=%-2d hl=%ld l=%4ld ",
+ depth, (long)hl, len) <= 0)
+ goto end;
+ } else {
+ if (BIO_printf(bp, "d=%-2d hl=%ld l=inf ", depth, (long)hl) <= 0)
+ goto end;
+ }
+ if (!asn1_print_info(bp, tag, xclass, j, (indent) ? depth : 0))
+ goto end;
+ if (j & V_ASN1_CONSTRUCTED) {
+ const unsigned char *sp = p;
+
+ ep = p + len;
+ if (BIO_write(bp, "\n", 1) <= 0)
+ goto end;
+ if (len > length) {
+ BIO_printf(bp, "length is greater than %ld\n", length);
+ ret = 0;
+ goto end;
+ }
+ if ((j == 0x21) && (len == 0)) {
+ for (;;) {
+ r = asn1_parse2(bp, &p, (long)(tot - p),
+ offset + (p - *pp), depth + 1,
+ indent, dump);
+ if (r == 0) {
+ ret = 0;
+ goto end;
+ }
+ if ((r == 2) || (p >= tot)) {
+ len = p - sp;
+ break;
+ }
+ }
+ } else {
+ long tmp = len;
+
+ while (p < ep) {
+ sp = p;
+ r = asn1_parse2(bp, &p, tmp,
+ offset + (p - *pp), depth + 1,
+ indent, dump);
+ if (r == 0) {
+ ret = 0;
+ goto end;
+ }
+ tmp -= p - sp;
+ }
+ }
+ } else if (xclass != 0) {
+ p += len;
+ if (BIO_write(bp, "\n", 1) <= 0)
+ goto end;
+ } else {
+ nl = 0;
+ if ((tag == V_ASN1_PRINTABLESTRING) ||
+ (tag == V_ASN1_T61STRING) ||
+ (tag == V_ASN1_IA5STRING) ||
+ (tag == V_ASN1_VISIBLESTRING) ||
+ (tag == V_ASN1_NUMERICSTRING) ||
+ (tag == V_ASN1_UTF8STRING) ||
+ (tag == V_ASN1_UTCTIME) || (tag == V_ASN1_GENERALIZEDTIME)) {
+ if (BIO_write(bp, ":", 1) <= 0)
+ goto end;
+ if ((len > 0) && BIO_write(bp, (const char *)p, (int)len)
+ != (int)len)
+ goto end;
+ } else if (tag == V_ASN1_OBJECT) {
+ opp = op;
+ if (d2i_ASN1_OBJECT(&o, &opp, len + hl) != NULL) {
+ if (BIO_write(bp, ":", 1) <= 0)
+ goto end;
+ i2a_ASN1_OBJECT(bp, o);
+ } else {
+ if (BIO_puts(bp, ":BAD OBJECT") <= 0)
+ goto end;
+ dump_cont = 1;
+ }
+ } else if (tag == V_ASN1_BOOLEAN) {
+ if (len != 1) {
+ if (BIO_puts(bp, ":BAD BOOLEAN") <= 0)
+ goto end;
+ dump_cont = 1;
+ }
+ if (len > 0)
+ BIO_printf(bp, ":%u", p[0]);
+ } else if (tag == V_ASN1_BMPSTRING) {
+ /* do the BMP thang */
+ } else if (tag == V_ASN1_OCTET_STRING) {
+ int i, printable = 1;
+
+ opp = op;
+ os = d2i_ASN1_OCTET_STRING(NULL, &opp, len + hl);
+ if (os != NULL && os->length > 0) {
+ opp = os->data;
+ /*
+ * testing whether the octet string is printable
+ */
+ for (i = 0; i < os->length; i++) {
+ if (((opp[i] < ' ') &&
+ (opp[i] != '\n') &&
+ (opp[i] != '\r') &&
+ (opp[i] != '\t')) || (opp[i] > '~')) {
+ printable = 0;
+ break;
+ }
+ }
+ if (printable)
+ /* printable string */
+ {
+ if (BIO_write(bp, ":", 1) <= 0)
+ goto end;
+ if (BIO_write(bp, (const char *)opp, os->length) <= 0)
+ goto end;
+ } else if (!dump)
+ /*
+ * not printable => print octet string as hex dump
+ */
+ {
+ if (BIO_write(bp, "[HEX DUMP]:", 11) <= 0)
+ goto end;
+ for (i = 0; i < os->length; i++) {
+ if (BIO_printf(bp, "%02X", opp[i]) <= 0)
+ goto end;
+ }
+ } else
+ /* print the normal dump */
+ {
+ if (!nl) {
+ if (BIO_write(bp, "\n", 1) <= 0)
+ goto end;
+ }
+ if (BIO_dump_indent(bp,
+ (const char *)opp,
+ ((dump == -1 || dump >
+ os->
+ length) ? os->length : dump),
+ dump_indent) <= 0)
+ goto end;
+ nl = 1;
+ }
+ }
+ ASN1_OCTET_STRING_free(os);
+ os = NULL;
+ } else if (tag == V_ASN1_INTEGER) {
+ ASN1_INTEGER *bs;
+ int i;
+
+ opp = op;
+ bs = d2i_ASN1_INTEGER(NULL, &opp, len + hl);
+ if (bs != NULL) {
+ if (BIO_write(bp, ":", 1) <= 0)
+ goto end;
+ if (bs->type == V_ASN1_NEG_INTEGER)
+ if (BIO_write(bp, "-", 1) <= 0)
+ goto end;
+ for (i = 0; i < bs->length; i++) {
+ if (BIO_printf(bp, "%02X", bs->data[i]) <= 0)
+ goto end;
+ }
+ if (bs->length == 0) {
+ if (BIO_write(bp, "00", 2) <= 0)
+ goto end;
+ }
+ } else {
+ if (BIO_puts(bp, ":BAD INTEGER") <= 0)
+ goto end;
+ dump_cont = 1;
+ }
+ ASN1_INTEGER_free(bs);
+ } else if (tag == V_ASN1_ENUMERATED) {
+ ASN1_ENUMERATED *bs;
+ int i;
+
+ opp = op;
+ bs = d2i_ASN1_ENUMERATED(NULL, &opp, len + hl);
+ if (bs != NULL) {
+ if (BIO_write(bp, ":", 1) <= 0)
+ goto end;
+ if (bs->type == V_ASN1_NEG_ENUMERATED)
+ if (BIO_write(bp, "-", 1) <= 0)
+ goto end;
+ for (i = 0; i < bs->length; i++) {
+ if (BIO_printf(bp, "%02X", bs->data[i]) <= 0)
+ goto end;
+ }
+ if (bs->length == 0) {
+ if (BIO_write(bp, "00", 2) <= 0)
+ goto end;
+ }
+ } else {
+ if (BIO_puts(bp, ":BAD ENUMERATED") <= 0)
+ goto end;
+ dump_cont = 1;
+ }
+ ASN1_ENUMERATED_free(bs);
+ } else if (len > 0 && dump) {
+ if (!nl) {
+ if (BIO_write(bp, "\n", 1) <= 0)
+ goto end;
+ }
+ if (BIO_dump_indent(bp, (const char *)p,
+ ((dump == -1 || dump > len) ? len : dump),
+ dump_indent) <= 0)
+ goto end;
+ nl = 1;
+ }
+ if (dump_cont) {
+ int i;
+ const unsigned char *tmp = op + hl;
+ if (BIO_puts(bp, ":[") <= 0)
+ goto end;
+ for (i = 0; i < len; i++) {
+ if (BIO_printf(bp, "%02X", tmp[i]) <= 0)
+ goto end;
+ }
+ if (BIO_puts(bp, "]") <= 0)
+ goto end;
+ }
+
+ if (!nl) {
+ if (BIO_write(bp, "\n", 1) <= 0)
+ goto end;
+ }
+ p += len;
+ if ((tag == V_ASN1_EOC) && (xclass == 0)) {
+ ret = 2; /* End of sequence */
+ goto end;
+ }
+ }
+ length -= len;
+ }
+ ret = 1;
+ end:
+ ASN1_OBJECT_free(o);
+ ASN1_OCTET_STRING_free(os);
+ *pp = p;
+ return (ret);
+}
+
+const char *ASN1_tag2str(int tag)
+{
+ static const char *const tag2str[] = {
+ /* 0-4 */
+ "EOC", "BOOLEAN", "INTEGER", "BIT STRING", "OCTET STRING",
+ /* 5-9 */
+ "NULL", "OBJECT", "OBJECT DESCRIPTOR", "EXTERNAL", "REAL",
+ /* 10-13 */
+ "ENUMERATED", "<ASN1 11>", "UTF8STRING", "<ASN1 13>",
+ /* 15-17 */
+ "<ASN1 14>", "<ASN1 15>", "SEQUENCE", "SET",
+ /* 18-20 */
+ "NUMERICSTRING", "PRINTABLESTRING", "T61STRING",
+ /* 21-24 */
+ "VIDEOTEXSTRING", "IA5STRING", "UTCTIME", "GENERALIZEDTIME",
+ /* 25-27 */
+ "GRAPHICSTRING", "VISIBLESTRING", "GENERALSTRING",
+ /* 28-30 */
+ "UNIVERSALSTRING", "<ASN1 29>", "BMPSTRING"
+ };
+
+ if ((tag == V_ASN1_NEG_INTEGER) || (tag == V_ASN1_NEG_ENUMERATED))
+ tag &= ~0x100;
+
+ if (tag < 0 || tag > 30)
+ return "(unknown)";
+ return tag2str[tag];
+}
diff --git a/openssl-1.1.0h/crypto/asn1/asn_mime.c b/openssl-1.1.0h/crypto/asn1/asn_mime.c
new file mode 100644
index 0000000..84475e9
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/asn_mime.c
@@ -0,0 +1,981 @@
+/*
+ * Copyright 2008-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include "internal/cryptlib.h"
+#include <openssl/rand.h>
+#include <openssl/x509.h>
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+#include "internal/evp_int.h"
+#include "internal/bio.h"
+#include "asn1_locl.h"
+
+/*
+ * Generalised MIME like utilities for streaming ASN1. Although many have a
+ * PKCS7/CMS like flavour others are more general purpose.
+ */
+
+/*
+ * MIME format structures Note that all are translated to lower case apart
+ * from parameter values. Quotes are stripped off
+ */
+
+struct mime_param_st {
+ char *param_name; /* Param name e.g. "micalg" */
+ char *param_value; /* Param value e.g. "sha1" */
+};
+
+struct mime_header_st {
+ char *name; /* Name of line e.g. "content-type" */
+ char *value; /* Value of line e.g. "text/plain" */
+ STACK_OF(MIME_PARAM) *params; /* Zero or more parameters */
+};
+
+static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
+ const ASN1_ITEM *it);
+static char *strip_ends(char *name);
+static char *strip_start(char *name);
+static char *strip_end(char *name);
+static MIME_HEADER *mime_hdr_new(const char *name, const char *value);
+static int mime_hdr_addparam(MIME_HEADER *mhdr, const char *name, const char *value);
+static STACK_OF(MIME_HEADER) *mime_parse_hdr(BIO *bio);
+static int mime_hdr_cmp(const MIME_HEADER *const *a,
+ const MIME_HEADER *const *b);
+static int mime_param_cmp(const MIME_PARAM *const *a,
+ const MIME_PARAM *const *b);
+static void mime_param_free(MIME_PARAM *param);
+static int mime_bound_check(char *line, int linelen, const char *bound, int blen);
+static int multi_split(BIO *bio, const char *bound, STACK_OF(BIO) **ret);
+static int strip_eol(char *linebuf, int *plen, int flags);
+static MIME_HEADER *mime_hdr_find(STACK_OF(MIME_HEADER) *hdrs, const char *name);
+static MIME_PARAM *mime_param_find(MIME_HEADER *hdr, const char *name);
+static void mime_hdr_free(MIME_HEADER *hdr);
+
+#define MAX_SMLEN 1024
+#define mime_debug(x) /* x */
+
+/* Output an ASN1 structure in BER format streaming if necessary */
+
+int i2d_ASN1_bio_stream(BIO *out, ASN1_VALUE *val, BIO *in, int flags,
+ const ASN1_ITEM *it)
+{
+ /* If streaming create stream BIO and copy all content through it */
+ if (flags & SMIME_STREAM) {
+ BIO *bio, *tbio;
+ bio = BIO_new_NDEF(out, val, it);
+ if (!bio) {
+ ASN1err(ASN1_F_I2D_ASN1_BIO_STREAM, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ SMIME_crlf_copy(in, bio, flags);
+ (void)BIO_flush(bio);
+ /* Free up successive BIOs until we hit the old output BIO */
+ do {
+ tbio = BIO_pop(bio);
+ BIO_free(bio);
+ bio = tbio;
+ } while (bio != out);
+ }
+ /*
+ * else just write out ASN1 structure which will have all content stored
+ * internally
+ */
+ else
+ ASN1_item_i2d_bio(it, out, val);
+ return 1;
+}
+
+/* Base 64 read and write of ASN1 structure */
+
+static int B64_write_ASN1(BIO *out, ASN1_VALUE *val, BIO *in, int flags,
+ const ASN1_ITEM *it)
+{
+ BIO *b64;
+ int r;
+ b64 = BIO_new(BIO_f_base64());
+ if (b64 == NULL) {
+ ASN1err(ASN1_F_B64_WRITE_ASN1, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ /*
+ * prepend the b64 BIO so all data is base64 encoded.
+ */
+ out = BIO_push(b64, out);
+ r = i2d_ASN1_bio_stream(out, val, in, flags, it);
+ (void)BIO_flush(out);
+ BIO_pop(out);
+ BIO_free(b64);
+ return r;
+}
+
+/* Streaming ASN1 PEM write */
+
+int PEM_write_bio_ASN1_stream(BIO *out, ASN1_VALUE *val, BIO *in, int flags,
+ const char *hdr, const ASN1_ITEM *it)
+{
+ int r;
+ BIO_printf(out, "-----BEGIN %s-----\n", hdr);
+ r = B64_write_ASN1(out, val, in, flags, it);
+ BIO_printf(out, "-----END %s-----\n", hdr);
+ return r;
+}
+
+static ASN1_VALUE *b64_read_asn1(BIO *bio, const ASN1_ITEM *it)
+{
+ BIO *b64;
+ ASN1_VALUE *val;
+
+ if ((b64 = BIO_new(BIO_f_base64())) == NULL) {
+ ASN1err(ASN1_F_B64_READ_ASN1, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ bio = BIO_push(b64, bio);
+ val = ASN1_item_d2i_bio(it, bio, NULL);
+ if (!val)
+ ASN1err(ASN1_F_B64_READ_ASN1, ASN1_R_DECODE_ERROR);
+ (void)BIO_flush(bio);
+ BIO_pop(bio);
+ BIO_free(b64);
+ return val;
+}
+
+/* Generate the MIME "micalg" parameter from RFC3851, RFC4490 */
+
+static int asn1_write_micalg(BIO *out, STACK_OF(X509_ALGOR) *mdalgs)
+{
+ const EVP_MD *md;
+ int i, have_unknown = 0, write_comma, ret = 0, md_nid;
+ have_unknown = 0;
+ write_comma = 0;
+ for (i = 0; i < sk_X509_ALGOR_num(mdalgs); i++) {
+ if (write_comma)
+ BIO_write(out, ",", 1);
+ write_comma = 1;
+ md_nid = OBJ_obj2nid(sk_X509_ALGOR_value(mdalgs, i)->algorithm);
+ md = EVP_get_digestbynid(md_nid);
+ if (md && md->md_ctrl) {
+ int rv;
+ char *micstr;
+ rv = md->md_ctrl(NULL, EVP_MD_CTRL_MICALG, 0, &micstr);
+ if (rv > 0) {
+ BIO_puts(out, micstr);
+ OPENSSL_free(micstr);
+ continue;
+ }
+ if (rv != -2)
+ goto err;
+ }
+ switch (md_nid) {
+ case NID_sha1:
+ BIO_puts(out, "sha1");
+ break;
+
+ case NID_md5:
+ BIO_puts(out, "md5");
+ break;
+
+ case NID_sha256:
+ BIO_puts(out, "sha-256");
+ break;
+
+ case NID_sha384:
+ BIO_puts(out, "sha-384");
+ break;
+
+ case NID_sha512:
+ BIO_puts(out, "sha-512");
+ break;
+
+ case NID_id_GostR3411_94:
+ BIO_puts(out, "gostr3411-94");
+ goto err;
+
+ default:
+ if (have_unknown)
+ write_comma = 0;
+ else {
+ BIO_puts(out, "unknown");
+ have_unknown = 1;
+ }
+ break;
+
+ }
+ }
+
+ ret = 1;
+ err:
+
+ return ret;
+
+}
+
+/* SMIME sender */
+
+int SMIME_write_ASN1(BIO *bio, ASN1_VALUE *val, BIO *data, int flags,
+ int ctype_nid, int econt_nid,
+ STACK_OF(X509_ALGOR) *mdalgs, const ASN1_ITEM *it)
+{
+ char bound[33], c;
+ int i;
+ const char *mime_prefix, *mime_eol, *cname = "smime.p7m";
+ const char *msg_type = NULL;
+ if (flags & SMIME_OLDMIME)
+ mime_prefix = "application/x-pkcs7-";
+ else
+ mime_prefix = "application/pkcs7-";
+
+ if (flags & SMIME_CRLFEOL)
+ mime_eol = "\r\n";
+ else
+ mime_eol = "\n";
+ if ((flags & SMIME_DETACHED) && data) {
+ /* We want multipart/signed */
+ /* Generate a random boundary */
+ if (RAND_bytes((unsigned char *)bound, 32) <= 0)
+ return 0;
+ for (i = 0; i < 32; i++) {
+ c = bound[i] & 0xf;
+ if (c < 10)
+ c += '0';
+ else
+ c += 'A' - 10;
+ bound[i] = c;
+ }
+ bound[32] = 0;
+ BIO_printf(bio, "MIME-Version: 1.0%s", mime_eol);
+ BIO_printf(bio, "Content-Type: multipart/signed;");
+ BIO_printf(bio, " protocol=\"%ssignature\";", mime_prefix);
+ BIO_puts(bio, " micalg=\"");
+ asn1_write_micalg(bio, mdalgs);
+ BIO_printf(bio, "\"; boundary=\"----%s\"%s%s",
+ bound, mime_eol, mime_eol);
+ BIO_printf(bio, "This is an S/MIME signed message%s%s",
+ mime_eol, mime_eol);
+ /* Now write out the first part */
+ BIO_printf(bio, "------%s%s", bound, mime_eol);
+ if (!asn1_output_data(bio, data, val, flags, it))
+ return 0;
+ BIO_printf(bio, "%s------%s%s", mime_eol, bound, mime_eol);
+
+ /* Headers for signature */
+
+ BIO_printf(bio, "Content-Type: %ssignature;", mime_prefix);
+ BIO_printf(bio, " name=\"smime.p7s\"%s", mime_eol);
+ BIO_printf(bio, "Content-Transfer-Encoding: base64%s", mime_eol);
+ BIO_printf(bio, "Content-Disposition: attachment;");
+ BIO_printf(bio, " filename=\"smime.p7s\"%s%s", mime_eol, mime_eol);
+ B64_write_ASN1(bio, val, NULL, 0, it);
+ BIO_printf(bio, "%s------%s--%s%s", mime_eol, bound,
+ mime_eol, mime_eol);
+ return 1;
+ }
+
+ /* Determine smime-type header */
+
+ if (ctype_nid == NID_pkcs7_enveloped)
+ msg_type = "enveloped-data";
+ else if (ctype_nid == NID_pkcs7_signed) {
+ if (econt_nid == NID_id_smime_ct_receipt)
+ msg_type = "signed-receipt";
+ else if (sk_X509_ALGOR_num(mdalgs) >= 0)
+ msg_type = "signed-data";
+ else
+ msg_type = "certs-only";
+ } else if (ctype_nid == NID_id_smime_ct_compressedData) {
+ msg_type = "compressed-data";
+ cname = "smime.p7z";
+ }
+ /* MIME headers */
+ BIO_printf(bio, "MIME-Version: 1.0%s", mime_eol);
+ BIO_printf(bio, "Content-Disposition: attachment;");
+ BIO_printf(bio, " filename=\"%s\"%s", cname, mime_eol);
+ BIO_printf(bio, "Content-Type: %smime;", mime_prefix);
+ if (msg_type)
+ BIO_printf(bio, " smime-type=%s;", msg_type);
+ BIO_printf(bio, " name=\"%s\"%s", cname, mime_eol);
+ BIO_printf(bio, "Content-Transfer-Encoding: base64%s%s",
+ mime_eol, mime_eol);
+ if (!B64_write_ASN1(bio, val, data, flags, it))
+ return 0;
+ BIO_printf(bio, "%s", mime_eol);
+ return 1;
+}
+
+/* Handle output of ASN1 data */
+
+static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
+ const ASN1_ITEM *it)
+{
+ BIO *tmpbio;
+ const ASN1_AUX *aux = it->funcs;
+ ASN1_STREAM_ARG sarg;
+ int rv = 1;
+
+ /*
+ * If data is not detached or resigning then the output BIO is already
+ * set up to finalise when it is written through.
+ */
+ if (!(flags & SMIME_DETACHED) || (flags & PKCS7_REUSE_DIGEST)) {
+ SMIME_crlf_copy(data, out, flags);
+ return 1;
+ }
+
+ if (!aux || !aux->asn1_cb) {
+ ASN1err(ASN1_F_ASN1_OUTPUT_DATA, ASN1_R_STREAMING_NOT_SUPPORTED);
+ return 0;
+ }
+
+ sarg.out = out;
+ sarg.ndef_bio = NULL;
+ sarg.boundary = NULL;
+
+ /* Let ASN1 code prepend any needed BIOs */
+
+ if (aux->asn1_cb(ASN1_OP_DETACHED_PRE, &val, it, &sarg) <= 0)
+ return 0;
+
+ /* Copy data across, passing through filter BIOs for processing */
+ SMIME_crlf_copy(data, sarg.ndef_bio, flags);
+
+ /* Finalize structure */
+ if (aux->asn1_cb(ASN1_OP_DETACHED_POST, &val, it, &sarg) <= 0)
+ rv = 0;
+
+ /* Now remove any digests prepended to the BIO */
+
+ while (sarg.ndef_bio != out) {
+ tmpbio = BIO_pop(sarg.ndef_bio);
+ BIO_free(sarg.ndef_bio);
+ sarg.ndef_bio = tmpbio;
+ }
+
+ return rv;
+
+}
+
+/*
+ * SMIME reader: handle multipart/signed and opaque signing. in multipart
+ * case the content is placed in a memory BIO pointed to by "bcont". In
+ * opaque this is set to NULL
+ */
+
+ASN1_VALUE *SMIME_read_ASN1(BIO *bio, BIO **bcont, const ASN1_ITEM *it)
+{
+ BIO *asnin;
+ STACK_OF(MIME_HEADER) *headers = NULL;
+ STACK_OF(BIO) *parts = NULL;
+ MIME_HEADER *hdr;
+ MIME_PARAM *prm;
+ ASN1_VALUE *val;
+ int ret;
+
+ if (bcont)
+ *bcont = NULL;
+
+ if ((headers = mime_parse_hdr(bio)) == NULL) {
+ ASN1err(ASN1_F_SMIME_READ_ASN1, ASN1_R_MIME_PARSE_ERROR);
+ return NULL;
+ }
+
+ if ((hdr = mime_hdr_find(headers, "content-type")) == NULL
+ || hdr->value == NULL) {
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+ ASN1err(ASN1_F_SMIME_READ_ASN1, ASN1_R_NO_CONTENT_TYPE);
+ return NULL;
+ }
+
+ /* Handle multipart/signed */
+
+ if (strcmp(hdr->value, "multipart/signed") == 0) {
+ /* Split into two parts */
+ prm = mime_param_find(hdr, "boundary");
+ if (!prm || !prm->param_value) {
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+ ASN1err(ASN1_F_SMIME_READ_ASN1, ASN1_R_NO_MULTIPART_BOUNDARY);
+ return NULL;
+ }
+ ret = multi_split(bio, prm->param_value, &parts);
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+ if (!ret || (sk_BIO_num(parts) != 2)) {
+ ASN1err(ASN1_F_SMIME_READ_ASN1, ASN1_R_NO_MULTIPART_BODY_FAILURE);
+ sk_BIO_pop_free(parts, BIO_vfree);
+ return NULL;
+ }
+
+ /* Parse the signature piece */
+ asnin = sk_BIO_value(parts, 1);
+
+ if ((headers = mime_parse_hdr(asnin)) == NULL) {
+ ASN1err(ASN1_F_SMIME_READ_ASN1, ASN1_R_MIME_SIG_PARSE_ERROR);
+ sk_BIO_pop_free(parts, BIO_vfree);
+ return NULL;
+ }
+
+ /* Get content type */
+
+ if ((hdr = mime_hdr_find(headers, "content-type")) == NULL
+ || hdr->value == NULL) {
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+ ASN1err(ASN1_F_SMIME_READ_ASN1, ASN1_R_NO_SIG_CONTENT_TYPE);
+ sk_BIO_pop_free(parts, BIO_vfree);
+ return NULL;
+ }
+
+ if (strcmp(hdr->value, "application/x-pkcs7-signature") &&
+ strcmp(hdr->value, "application/pkcs7-signature")) {
+ ASN1err(ASN1_F_SMIME_READ_ASN1, ASN1_R_SIG_INVALID_MIME_TYPE);
+ ERR_add_error_data(2, "type: ", hdr->value);
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+ sk_BIO_pop_free(parts, BIO_vfree);
+ return NULL;
+ }
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+ /* Read in ASN1 */
+ if ((val = b64_read_asn1(asnin, it)) == NULL) {
+ ASN1err(ASN1_F_SMIME_READ_ASN1, ASN1_R_ASN1_SIG_PARSE_ERROR);
+ sk_BIO_pop_free(parts, BIO_vfree);
+ return NULL;
+ }
+
+ if (bcont) {
+ *bcont = sk_BIO_value(parts, 0);
+ BIO_free(asnin);
+ sk_BIO_free(parts);
+ } else
+ sk_BIO_pop_free(parts, BIO_vfree);
+ return val;
+ }
+
+ /* OK, if not multipart/signed try opaque signature */
+
+ if (strcmp(hdr->value, "application/x-pkcs7-mime") &&
+ strcmp(hdr->value, "application/pkcs7-mime")) {
+ ASN1err(ASN1_F_SMIME_READ_ASN1, ASN1_R_INVALID_MIME_TYPE);
+ ERR_add_error_data(2, "type: ", hdr->value);
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+ return NULL;
+ }
+
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+
+ if ((val = b64_read_asn1(bio, it)) == NULL) {
+ ASN1err(ASN1_F_SMIME_READ_ASN1, ASN1_R_ASN1_PARSE_ERROR);
+ return NULL;
+ }
+ return val;
+
+}
+
+/* Copy text from one BIO to another making the output CRLF at EOL */
+int SMIME_crlf_copy(BIO *in, BIO *out, int flags)
+{
+ BIO *bf;
+ char eol;
+ int len;
+ char linebuf[MAX_SMLEN];
+ /*
+ * Buffer output so we don't write one line at a time. This is useful
+ * when streaming as we don't end up with one OCTET STRING per line.
+ */
+ bf = BIO_new(BIO_f_buffer());
+ if (bf == NULL)
+ return 0;
+ out = BIO_push(bf, out);
+ if (flags & SMIME_BINARY) {
+ while ((len = BIO_read(in, linebuf, MAX_SMLEN)) > 0)
+ BIO_write(out, linebuf, len);
+ } else {
+ int eolcnt = 0;
+ if (flags & SMIME_TEXT)
+ BIO_printf(out, "Content-Type: text/plain\r\n\r\n");
+ while ((len = BIO_gets(in, linebuf, MAX_SMLEN)) > 0) {
+ eol = strip_eol(linebuf, &len, flags);
+ if (len) {
+ /* Not EOF: write out all CRLF */
+ if (flags & SMIME_ASCIICRLF) {
+ int i;
+ for (i = 0; i < eolcnt; i++)
+ BIO_write(out, "\r\n", 2);
+ eolcnt = 0;
+ }
+ BIO_write(out, linebuf, len);
+ if (eol)
+ BIO_write(out, "\r\n", 2);
+ } else if (flags & SMIME_ASCIICRLF)
+ eolcnt++;
+ else if (eol)
+ BIO_write(out, "\r\n", 2);
+ }
+ }
+ (void)BIO_flush(out);
+ BIO_pop(out);
+ BIO_free(bf);
+ return 1;
+}
+
+/* Strip off headers if they are text/plain */
+int SMIME_text(BIO *in, BIO *out)
+{
+ char iobuf[4096];
+ int len;
+ STACK_OF(MIME_HEADER) *headers;
+ MIME_HEADER *hdr;
+
+ if ((headers = mime_parse_hdr(in)) == NULL) {
+ ASN1err(ASN1_F_SMIME_TEXT, ASN1_R_MIME_PARSE_ERROR);
+ return 0;
+ }
+ if ((hdr = mime_hdr_find(headers, "content-type")) == NULL
+ || hdr->value == NULL) {
+ ASN1err(ASN1_F_SMIME_TEXT, ASN1_R_MIME_NO_CONTENT_TYPE);
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+ return 0;
+ }
+ if (strcmp(hdr->value, "text/plain")) {
+ ASN1err(ASN1_F_SMIME_TEXT, ASN1_R_INVALID_MIME_TYPE);
+ ERR_add_error_data(2, "type: ", hdr->value);
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+ return 0;
+ }
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+ while ((len = BIO_read(in, iobuf, sizeof(iobuf))) > 0)
+ BIO_write(out, iobuf, len);
+ if (len < 0)
+ return 0;
+ return 1;
+}
+
+/*
+ * Split a multipart/XXX message body into component parts: result is
+ * canonical parts in a STACK of bios
+ */
+
+static int multi_split(BIO *bio, const char *bound, STACK_OF(BIO) **ret)
+{
+ char linebuf[MAX_SMLEN];
+ int len, blen;
+ int eol = 0, next_eol = 0;
+ BIO *bpart = NULL;
+ STACK_OF(BIO) *parts;
+ char state, part, first;
+
+ blen = strlen(bound);
+ part = 0;
+ state = 0;
+ first = 1;
+ parts = sk_BIO_new_null();
+ *ret = parts;
+ if (*ret == NULL)
+ return 0;
+ while ((len = BIO_gets(bio, linebuf, MAX_SMLEN)) > 0) {
+ state = mime_bound_check(linebuf, len, bound, blen);
+ if (state == 1) {
+ first = 1;
+ part++;
+ } else if (state == 2) {
+ if (!sk_BIO_push(parts, bpart)) {
+ BIO_free(bpart);
+ return 0;
+ }
+ return 1;
+ } else if (part) {
+ /* Strip CR+LF from linebuf */
+ next_eol = strip_eol(linebuf, &len, 0);
+ if (first) {
+ first = 0;
+ if (bpart)
+ if (!sk_BIO_push(parts, bpart)) {
+ BIO_free(bpart);
+ return 0;
+ }
+ bpart = BIO_new(BIO_s_mem());
+ if (bpart == NULL)
+ return 0;
+ BIO_set_mem_eof_return(bpart, 0);
+ } else if (eol)
+ BIO_write(bpart, "\r\n", 2);
+ eol = next_eol;
+ if (len)
+ BIO_write(bpart, linebuf, len);
+ }
+ }
+ BIO_free(bpart);
+ return 0;
+}
+
+/* This is the big one: parse MIME header lines up to message body */
+
+#define MIME_INVALID 0
+#define MIME_START 1
+#define MIME_TYPE 2
+#define MIME_NAME 3
+#define MIME_VALUE 4
+#define MIME_QUOTE 5
+#define MIME_COMMENT 6
+
+static STACK_OF(MIME_HEADER) *mime_parse_hdr(BIO *bio)
+{
+ char *p, *q, c;
+ char *ntmp;
+ char linebuf[MAX_SMLEN];
+ MIME_HEADER *mhdr = NULL, *new_hdr = NULL;
+ STACK_OF(MIME_HEADER) *headers;
+ int len, state, save_state = 0;
+
+ headers = sk_MIME_HEADER_new(mime_hdr_cmp);
+ if (headers == NULL)
+ return NULL;
+ while ((len = BIO_gets(bio, linebuf, MAX_SMLEN)) > 0) {
+ /* If whitespace at line start then continuation line */
+ if (mhdr && isspace((unsigned char)linebuf[0]))
+ state = MIME_NAME;
+ else
+ state = MIME_START;
+ ntmp = NULL;
+ /* Go through all characters */
+ for (p = linebuf, q = linebuf; (c = *p) && (c != '\r') && (c != '\n');
+ p++) {
+
+ /*
+ * State machine to handle MIME headers if this looks horrible
+ * that's because it *is*
+ */
+
+ switch (state) {
+ case MIME_START:
+ if (c == ':') {
+ state = MIME_TYPE;
+ *p = 0;
+ ntmp = strip_ends(q);
+ q = p + 1;
+ }
+ break;
+
+ case MIME_TYPE:
+ if (c == ';') {
+ mime_debug("Found End Value\n");
+ *p = 0;
+ new_hdr = mime_hdr_new(ntmp, strip_ends(q));
+ if (new_hdr == NULL)
+ goto err;
+ if (!sk_MIME_HEADER_push(headers, new_hdr))
+ goto err;
+ mhdr = new_hdr;
+ new_hdr = NULL;
+ ntmp = NULL;
+ q = p + 1;
+ state = MIME_NAME;
+ } else if (c == '(') {
+ save_state = state;
+ state = MIME_COMMENT;
+ }
+ break;
+
+ case MIME_COMMENT:
+ if (c == ')') {
+ state = save_state;
+ }
+ break;
+
+ case MIME_NAME:
+ if (c == '=') {
+ state = MIME_VALUE;
+ *p = 0;
+ ntmp = strip_ends(q);
+ q = p + 1;
+ }
+ break;
+
+ case MIME_VALUE:
+ if (c == ';') {
+ state = MIME_NAME;
+ *p = 0;
+ mime_hdr_addparam(mhdr, ntmp, strip_ends(q));
+ ntmp = NULL;
+ q = p + 1;
+ } else if (c == '"') {
+ mime_debug("Found Quote\n");
+ state = MIME_QUOTE;
+ } else if (c == '(') {
+ save_state = state;
+ state = MIME_COMMENT;
+ }
+ break;
+
+ case MIME_QUOTE:
+ if (c == '"') {
+ mime_debug("Found Match Quote\n");
+ state = MIME_VALUE;
+ }
+ break;
+ }
+ }
+
+ if (state == MIME_TYPE) {
+ new_hdr = mime_hdr_new(ntmp, strip_ends(q));
+ if (new_hdr == NULL)
+ goto err;
+ if (!sk_MIME_HEADER_push(headers, new_hdr))
+ goto err;
+ mhdr = new_hdr;
+ new_hdr = NULL;
+ } else if (state == MIME_VALUE)
+ mime_hdr_addparam(mhdr, ntmp, strip_ends(q));
+ if (p == linebuf)
+ break; /* Blank line means end of headers */
+ }
+
+ return headers;
+
+err:
+ mime_hdr_free(new_hdr);
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
+ return NULL;
+}
+
+static char *strip_ends(char *name)
+{
+ return strip_end(strip_start(name));
+}
+
+/* Strip a parameter of whitespace from start of param */
+static char *strip_start(char *name)
+{
+ char *p, c;
+ /* Look for first non white space or quote */
+ for (p = name; (c = *p); p++) {
+ if (c == '"') {
+ /* Next char is start of string if non null */
+ if (p[1])
+ return p + 1;
+ /* Else null string */
+ return NULL;
+ }
+ if (!isspace((unsigned char)c))
+ return p;
+ }
+ return NULL;
+}
+
+/* As above but strip from end of string : maybe should handle brackets? */
+static char *strip_end(char *name)
+{
+ char *p, c;
+ if (!name)
+ return NULL;
+ /* Look for first non white space or quote */
+ for (p = name + strlen(name) - 1; p >= name; p--) {
+ c = *p;
+ if (c == '"') {
+ if (p - 1 == name)
+ return NULL;
+ *p = 0;
+ return name;
+ }
+ if (isspace((unsigned char)c))
+ *p = 0;
+ else
+ return name;
+ }
+ return NULL;
+}
+
+static MIME_HEADER *mime_hdr_new(const char *name, const char *value)
+{
+ MIME_HEADER *mhdr = NULL;
+ char *tmpname = NULL, *tmpval = NULL, *p;
+ int c;
+
+ if (name) {
+ if ((tmpname = OPENSSL_strdup(name)) == NULL)
+ return NULL;
+ for (p = tmpname; *p; p++) {
+ c = (unsigned char)*p;
+ if (isupper(c)) {
+ c = tolower(c);
+ *p = c;
+ }
+ }
+ }
+ if (value) {
+ if ((tmpval = OPENSSL_strdup(value)) == NULL)
+ goto err;
+ for (p = tmpval; *p; p++) {
+ c = (unsigned char)*p;
+ if (isupper(c)) {
+ c = tolower(c);
+ *p = c;
+ }
+ }
+ }
+ mhdr = OPENSSL_malloc(sizeof(*mhdr));
+ if (mhdr == NULL)
+ goto err;
+ mhdr->name = tmpname;
+ mhdr->value = tmpval;
+ if ((mhdr->params = sk_MIME_PARAM_new(mime_param_cmp)) == NULL)
+ goto err;
+ return mhdr;
+
+ err:
+ OPENSSL_free(tmpname);
+ OPENSSL_free(tmpval);
+ OPENSSL_free(mhdr);
+ return NULL;
+}
+
+static int mime_hdr_addparam(MIME_HEADER *mhdr, const char *name, const char *value)
+{
+ char *tmpname = NULL, *tmpval = NULL, *p;
+ int c;
+ MIME_PARAM *mparam = NULL;
+ if (name) {
+ tmpname = OPENSSL_strdup(name);
+ if (!tmpname)
+ goto err;
+ for (p = tmpname; *p; p++) {
+ c = (unsigned char)*p;
+ if (isupper(c)) {
+ c = tolower(c);
+ *p = c;
+ }
+ }
+ }
+ if (value) {
+ tmpval = OPENSSL_strdup(value);
+ if (!tmpval)
+ goto err;
+ }
+ /* Parameter values are case sensitive so leave as is */
+ mparam = OPENSSL_malloc(sizeof(*mparam));
+ if (mparam == NULL)
+ goto err;
+ mparam->param_name = tmpname;
+ mparam->param_value = tmpval;
+ if (!sk_MIME_PARAM_push(mhdr->params, mparam))
+ goto err;
+ return 1;
+ err:
+ OPENSSL_free(tmpname);
+ OPENSSL_free(tmpval);
+ OPENSSL_free(mparam);
+ return 0;
+}
+
+static int mime_hdr_cmp(const MIME_HEADER *const *a,
+ const MIME_HEADER *const *b)
+{
+ if (!(*a)->name || !(*b)->name)
+ return ! !(*a)->name - ! !(*b)->name;
+
+ return (strcmp((*a)->name, (*b)->name));
+}
+
+static int mime_param_cmp(const MIME_PARAM *const *a,
+ const MIME_PARAM *const *b)
+{
+ if (!(*a)->param_name || !(*b)->param_name)
+ return ! !(*a)->param_name - ! !(*b)->param_name;
+ return (strcmp((*a)->param_name, (*b)->param_name));
+}
+
+/* Find a header with a given name (if possible) */
+
+static MIME_HEADER *mime_hdr_find(STACK_OF(MIME_HEADER) *hdrs, const char *name)
+{
+ MIME_HEADER htmp;
+ int idx;
+
+ htmp.name = (char *)name;
+ htmp.value = NULL;
+ htmp.params = NULL;
+
+ idx = sk_MIME_HEADER_find(hdrs, &htmp);
+ if (idx < 0)
+ return NULL;
+ return sk_MIME_HEADER_value(hdrs, idx);
+}
+
+static MIME_PARAM *mime_param_find(MIME_HEADER *hdr, const char *name)
+{
+ MIME_PARAM param;
+ int idx;
+
+ param.param_name = (char *)name;
+ param.param_value = NULL;
+ idx = sk_MIME_PARAM_find(hdr->params, &param);
+ if (idx < 0)
+ return NULL;
+ return sk_MIME_PARAM_value(hdr->params, idx);
+}
+
+static void mime_hdr_free(MIME_HEADER *hdr)
+{
+ if (hdr == NULL)
+ return;
+ OPENSSL_free(hdr->name);
+ OPENSSL_free(hdr->value);
+ if (hdr->params)
+ sk_MIME_PARAM_pop_free(hdr->params, mime_param_free);
+ OPENSSL_free(hdr);
+}
+
+static void mime_param_free(MIME_PARAM *param)
+{
+ OPENSSL_free(param->param_name);
+ OPENSSL_free(param->param_value);
+ OPENSSL_free(param);
+}
+
+/*-
+ * Check for a multipart boundary. Returns:
+ * 0 : no boundary
+ * 1 : part boundary
+ * 2 : final boundary
+ */
+static int mime_bound_check(char *line, int linelen, const char *bound, int blen)
+{
+ if (linelen == -1)
+ linelen = strlen(line);
+ if (blen == -1)
+ blen = strlen(bound);
+ /* Quickly eliminate if line length too short */
+ if (blen + 2 > linelen)
+ return 0;
+ /* Check for part boundary */
+ if ((strncmp(line, "--", 2) == 0)
+ && strncmp(line + 2, bound, blen) == 0) {
+ if (strncmp(line + blen + 2, "--", 2) == 0)
+ return 2;
+ else
+ return 1;
+ }
+ return 0;
+}
+
+static int strip_eol(char *linebuf, int *plen, int flags)
+{
+ int len = *plen;
+ char *p, c;
+ int is_eol = 0;
+ p = linebuf + len - 1;
+ for (p = linebuf + len - 1; len > 0; len--, p--) {
+ c = *p;
+ if (c == '\n')
+ is_eol = 1;
+ else if (is_eol && flags & SMIME_ASCIICRLF && c < 33)
+ continue;
+ else if (c != '\r')
+ break;
+ }
+ *plen = len;
+ return is_eol;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/asn_moid.c b/openssl-1.1.0h/crypto/asn1/asn_moid.c
new file mode 100644
index 0000000..8176b76
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/asn_moid.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include <openssl/crypto.h>
+#include "internal/cryptlib.h"
+#include <openssl/conf.h>
+#include <openssl/x509.h>
+#include "internal/asn1_int.h"
+#include "internal/objects.h"
+
+/* Simple ASN1 OID module: add all objects in a given section */
+
+static int do_create(const char *value, const char *name);
+
+static int oid_module_init(CONF_IMODULE *md, const CONF *cnf)
+{
+ int i;
+ const char *oid_section;
+ STACK_OF(CONF_VALUE) *sktmp;
+ CONF_VALUE *oval;
+
+ oid_section = CONF_imodule_get_value(md);
+ if ((sktmp = NCONF_get_section(cnf, oid_section)) == NULL) {
+ ASN1err(ASN1_F_OID_MODULE_INIT, ASN1_R_ERROR_LOADING_SECTION);
+ return 0;
+ }
+ for (i = 0; i < sk_CONF_VALUE_num(sktmp); i++) {
+ oval = sk_CONF_VALUE_value(sktmp, i);
+ if (!do_create(oval->value, oval->name)) {
+ ASN1err(ASN1_F_OID_MODULE_INIT, ASN1_R_ADDING_OBJECT);
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static void oid_module_finish(CONF_IMODULE *md)
+{
+}
+
+void ASN1_add_oid_module(void)
+{
+ CONF_module_add("oid_section", oid_module_init, oid_module_finish);
+}
+
+/*-
+ * Create an OID based on a name value pair. Accept two formats.
+ * shortname = 1.2.3.4
+ * shortname = some long name, 1.2.3.4
+ */
+
+static int do_create(const char *value, const char *name)
+{
+ int nid;
+ ASN1_OBJECT *oid;
+ const char *ln, *ostr, *p;
+ char *lntmp;
+ p = strrchr(value, ',');
+ if (!p) {
+ ln = name;
+ ostr = value;
+ } else {
+ ln = NULL;
+ ostr = p + 1;
+ if (!*ostr)
+ return 0;
+ while (isspace((unsigned char)*ostr))
+ ostr++;
+ }
+
+ nid = OBJ_create(ostr, name, ln);
+
+ if (nid == NID_undef)
+ return 0;
+
+ if (p) {
+ ln = value;
+ while (isspace((unsigned char)*ln))
+ ln++;
+ p--;
+ while (isspace((unsigned char)*p)) {
+ if (p == ln)
+ return 0;
+ p--;
+ }
+ p++;
+ lntmp = OPENSSL_malloc((p - ln) + 1);
+ if (lntmp == NULL)
+ return 0;
+ memcpy(lntmp, ln, p - ln);
+ lntmp[p - ln] = 0;
+ oid = OBJ_nid2obj(nid);
+ oid->ln = lntmp;
+ }
+
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/asn_mstbl.c b/openssl-1.1.0h/crypto/asn1/asn_mstbl.c
new file mode 100644
index 0000000..8260939
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/asn_mstbl.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include <openssl/crypto.h>
+#include "internal/cryptlib.h"
+#include <openssl/conf.h>
+#include <openssl/x509v3.h>
+
+/* Multi string module: add table entries from a given section */
+
+static int do_tcreate(const char *value, const char *name);
+
+static int stbl_module_init(CONF_IMODULE *md, const CONF *cnf)
+{
+ int i;
+ const char *stbl_section;
+ STACK_OF(CONF_VALUE) *sktmp;
+ CONF_VALUE *mval;
+
+ stbl_section = CONF_imodule_get_value(md);
+ if ((sktmp = NCONF_get_section(cnf, stbl_section)) == NULL) {
+ ASN1err(ASN1_F_STBL_MODULE_INIT, ASN1_R_ERROR_LOADING_SECTION);
+ return 0;
+ }
+ for (i = 0; i < sk_CONF_VALUE_num(sktmp); i++) {
+ mval = sk_CONF_VALUE_value(sktmp, i);
+ if (!do_tcreate(mval->value, mval->name)) {
+ ASN1err(ASN1_F_STBL_MODULE_INIT, ASN1_R_INVALID_VALUE);
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static void stbl_module_finish(CONF_IMODULE *md)
+{
+ ASN1_STRING_TABLE_cleanup();
+}
+
+void ASN1_add_stable_module(void)
+{
+ CONF_module_add("stbl_section", stbl_module_init, stbl_module_finish);
+}
+
+/*
+ * Create an table entry based on a name value pair. format is oid_name =
+ * n1:v1, n2:v2,... where name is "min", "max", "mask" or "flags".
+ */
+
+static int do_tcreate(const char *value, const char *name)
+{
+ char *eptr;
+ int nid, i, rv = 0;
+ long tbl_min = -1, tbl_max = -1;
+ unsigned long tbl_mask = 0, tbl_flags = 0;
+ STACK_OF(CONF_VALUE) *lst = NULL;
+ CONF_VALUE *cnf = NULL;
+ nid = OBJ_sn2nid(name);
+ if (nid == NID_undef)
+ nid = OBJ_ln2nid(name);
+ if (nid == NID_undef)
+ goto err;
+ lst = X509V3_parse_list(value);
+ if (!lst)
+ goto err;
+ for (i = 0; i < sk_CONF_VALUE_num(lst); i++) {
+ cnf = sk_CONF_VALUE_value(lst, i);
+ if (strcmp(cnf->name, "min") == 0) {
+ tbl_min = strtoul(cnf->value, &eptr, 0);
+ if (*eptr)
+ goto err;
+ } else if (strcmp(cnf->name, "max") == 0) {
+ tbl_max = strtoul(cnf->value, &eptr, 0);
+ if (*eptr)
+ goto err;
+ } else if (strcmp(cnf->name, "mask") == 0) {
+ if (!ASN1_str2mask(cnf->value, &tbl_mask) || !tbl_mask)
+ goto err;
+ } else if (strcmp(cnf->name, "flags") == 0) {
+ if (strcmp(cnf->value, "nomask") == 0)
+ tbl_flags = STABLE_NO_MASK;
+ else if (strcmp(cnf->value, "none") == 0)
+ tbl_flags = STABLE_FLAGS_CLEAR;
+ else
+ goto err;
+ } else
+ goto err;
+ }
+ rv = 1;
+ err:
+ if (rv == 0) {
+ ASN1err(ASN1_F_DO_TCREATE, ASN1_R_INVALID_STRING_TABLE_VALUE);
+ if (cnf)
+ ERR_add_error_data(4, "field=", cnf->name,
+ ", value=", cnf->value);
+ else
+ ERR_add_error_data(4, "name=", name, ", value=", value);
+ } else {
+ rv = ASN1_STRING_TABLE_add(nid, tbl_min, tbl_max,
+ tbl_mask, tbl_flags);
+ if (!rv)
+ ASN1err(ASN1_F_DO_TCREATE, ERR_R_MALLOC_FAILURE);
+ }
+ sk_CONF_VALUE_pop_free(lst, X509V3_conf_free);
+ return rv;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/asn_pack.c b/openssl-1.1.0h/crypto/asn1/asn_pack.c
new file mode 100644
index 0000000..63bc306
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/asn_pack.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+
+/* ASN1 packing and unpacking functions */
+
+ASN1_STRING *ASN1_item_pack(void *obj, const ASN1_ITEM *it, ASN1_STRING **oct)
+{
+ ASN1_STRING *octmp;
+
+ if (oct == NULL || *oct == NULL) {
+ if ((octmp = ASN1_STRING_new()) == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_PACK, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+ } else {
+ octmp = *oct;
+ }
+
+ OPENSSL_free(octmp->data);
+ octmp->data = NULL;
+
+ if ((octmp->length = ASN1_item_i2d(obj, &octmp->data, it)) == 0) {
+ ASN1err(ASN1_F_ASN1_ITEM_PACK, ASN1_R_ENCODE_ERROR);
+ goto err;
+ }
+ if (octmp->data == NULL) {
+ ASN1err(ASN1_F_ASN1_ITEM_PACK, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (oct != NULL && *oct == NULL)
+ *oct = octmp;
+
+ return octmp;
+ err:
+ if (oct == NULL || *oct == NULL)
+ ASN1_STRING_free(octmp);
+ return NULL;
+}
+
+/* Extract an ASN1 object from an ASN1_STRING */
+
+void *ASN1_item_unpack(const ASN1_STRING *oct, const ASN1_ITEM *it)
+{
+ const unsigned char *p;
+ void *ret;
+
+ p = oct->data;
+ if ((ret = ASN1_item_d2i(NULL, &p, oct->length, it)) == NULL)
+ ASN1err(ASN1_F_ASN1_ITEM_UNPACK, ASN1_R_DECODE_ERROR);
+ return ret;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/bio_asn1.c b/openssl-1.1.0h/crypto/asn1/bio_asn1.c
new file mode 100644
index 0000000..2a8a41f
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/bio_asn1.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Experimental ASN1 BIO. When written through the data is converted to an
+ * ASN1 string type: default is OCTET STRING. Additional functions can be
+ * provided to add prefix and suffix data.
+ */
+
+#include <string.h>
+#include <internal/bio.h>
+#include <openssl/asn1.h>
+
+/* Must be large enough for biggest tag+length */
+#define DEFAULT_ASN1_BUF_SIZE 20
+
+typedef enum {
+ ASN1_STATE_START,
+ ASN1_STATE_PRE_COPY,
+ ASN1_STATE_HEADER,
+ ASN1_STATE_HEADER_COPY,
+ ASN1_STATE_DATA_COPY,
+ ASN1_STATE_POST_COPY,
+ ASN1_STATE_DONE
+} asn1_bio_state_t;
+
+typedef struct BIO_ASN1_EX_FUNCS_st {
+ asn1_ps_func *ex_func;
+ asn1_ps_func *ex_free_func;
+} BIO_ASN1_EX_FUNCS;
+
+typedef struct BIO_ASN1_BUF_CTX_t {
+ /* Internal state */
+ asn1_bio_state_t state;
+ /* Internal buffer */
+ unsigned char *buf;
+ /* Size of buffer */
+ int bufsize;
+ /* Current position in buffer */
+ int bufpos;
+ /* Current buffer length */
+ int buflen;
+ /* Amount of data to copy */
+ int copylen;
+ /* Class and tag to use */
+ int asn1_class, asn1_tag;
+ asn1_ps_func *prefix, *prefix_free, *suffix, *suffix_free;
+ /* Extra buffer for prefix and suffix data */
+ unsigned char *ex_buf;
+ int ex_len;
+ int ex_pos;
+ void *ex_arg;
+} BIO_ASN1_BUF_CTX;
+
+static int asn1_bio_write(BIO *h, const char *buf, int num);
+static int asn1_bio_read(BIO *h, char *buf, int size);
+static int asn1_bio_puts(BIO *h, const char *str);
+static int asn1_bio_gets(BIO *h, char *str, int size);
+static long asn1_bio_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int asn1_bio_new(BIO *h);
+static int asn1_bio_free(BIO *data);
+static long asn1_bio_callback_ctrl(BIO *h, int cmd, BIO_info_cb *fp);
+
+static int asn1_bio_init(BIO_ASN1_BUF_CTX *ctx, int size);
+static int asn1_bio_flush_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
+ asn1_ps_func *cleanup, asn1_bio_state_t next);
+static int asn1_bio_setup_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
+ asn1_ps_func *setup,
+ asn1_bio_state_t ex_state,
+ asn1_bio_state_t other_state);
+
+static const BIO_METHOD methods_asn1 = {
+ BIO_TYPE_ASN1,
+ "asn1",
+ asn1_bio_write,
+ asn1_bio_read,
+ asn1_bio_puts,
+ asn1_bio_gets,
+ asn1_bio_ctrl,
+ asn1_bio_new,
+ asn1_bio_free,
+ asn1_bio_callback_ctrl,
+};
+
+const BIO_METHOD *BIO_f_asn1(void)
+{
+ return (&methods_asn1);
+}
+
+static int asn1_bio_new(BIO *b)
+{
+ BIO_ASN1_BUF_CTX *ctx = OPENSSL_zalloc(sizeof(*ctx));
+
+ if (ctx == NULL)
+ return 0;
+ if (!asn1_bio_init(ctx, DEFAULT_ASN1_BUF_SIZE)) {
+ OPENSSL_free(ctx);
+ return 0;
+ }
+ BIO_set_data(b, ctx);
+ BIO_set_init(b, 1);
+
+ return 1;
+}
+
+static int asn1_bio_init(BIO_ASN1_BUF_CTX *ctx, int size)
+{
+ ctx->buf = OPENSSL_malloc(size);
+ if (ctx->buf == NULL)
+ return 0;
+ ctx->bufsize = size;
+ ctx->asn1_class = V_ASN1_UNIVERSAL;
+ ctx->asn1_tag = V_ASN1_OCTET_STRING;
+ ctx->state = ASN1_STATE_START;
+ return 1;
+}
+
+static int asn1_bio_free(BIO *b)
+{
+ BIO_ASN1_BUF_CTX *ctx;
+
+ if (b == NULL)
+ return 0;
+
+ ctx = BIO_get_data(b);
+ if (ctx == NULL)
+ return 0;
+
+ OPENSSL_free(ctx->buf);
+ OPENSSL_free(ctx);
+ BIO_set_data(b, NULL);
+ BIO_set_init(b, 0);
+
+ return 1;
+}
+
+static int asn1_bio_write(BIO *b, const char *in, int inl)
+{
+ BIO_ASN1_BUF_CTX *ctx;
+ int wrmax, wrlen, ret;
+ unsigned char *p;
+ BIO *next;
+
+ ctx = BIO_get_data(b);
+ next = BIO_next(b);
+ if (in == NULL || inl < 0 || ctx == NULL || next == NULL)
+ return 0;
+
+ wrlen = 0;
+ ret = -1;
+
+ for (;;) {
+ switch (ctx->state) {
+
+ /* Setup prefix data, call it */
+ case ASN1_STATE_START:
+ if (!asn1_bio_setup_ex(b, ctx, ctx->prefix,
+ ASN1_STATE_PRE_COPY, ASN1_STATE_HEADER))
+ return 0;
+ break;
+
+ /* Copy any pre data first */
+ case ASN1_STATE_PRE_COPY:
+
+ ret = asn1_bio_flush_ex(b, ctx, ctx->prefix_free,
+ ASN1_STATE_HEADER);
+
+ if (ret <= 0)
+ goto done;
+
+ break;
+
+ case ASN1_STATE_HEADER:
+ ctx->buflen = ASN1_object_size(0, inl, ctx->asn1_tag) - inl;
+ OPENSSL_assert(ctx->buflen <= ctx->bufsize);
+ p = ctx->buf;
+ ASN1_put_object(&p, 0, inl, ctx->asn1_tag, ctx->asn1_class);
+ ctx->copylen = inl;
+ ctx->state = ASN1_STATE_HEADER_COPY;
+
+ break;
+
+ case ASN1_STATE_HEADER_COPY:
+ ret = BIO_write(next, ctx->buf + ctx->bufpos, ctx->buflen);
+ if (ret <= 0)
+ goto done;
+
+ ctx->buflen -= ret;
+ if (ctx->buflen)
+ ctx->bufpos += ret;
+ else {
+ ctx->bufpos = 0;
+ ctx->state = ASN1_STATE_DATA_COPY;
+ }
+
+ break;
+
+ case ASN1_STATE_DATA_COPY:
+
+ if (inl > ctx->copylen)
+ wrmax = ctx->copylen;
+ else
+ wrmax = inl;
+ ret = BIO_write(next, in, wrmax);
+ if (ret <= 0)
+ goto done;
+ wrlen += ret;
+ ctx->copylen -= ret;
+ in += ret;
+ inl -= ret;
+
+ if (ctx->copylen == 0)
+ ctx->state = ASN1_STATE_HEADER;
+
+ if (inl == 0)
+ goto done;
+
+ break;
+
+ default:
+ BIO_clear_retry_flags(b);
+ return 0;
+
+ }
+
+ }
+
+ done:
+ BIO_clear_retry_flags(b);
+ BIO_copy_next_retry(b);
+
+ return (wrlen > 0) ? wrlen : ret;
+
+}
+
+static int asn1_bio_flush_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
+ asn1_ps_func *cleanup, asn1_bio_state_t next)
+{
+ int ret;
+
+ if (ctx->ex_len <= 0)
+ return 1;
+ for (;;) {
+ ret = BIO_write(BIO_next(b), ctx->ex_buf + ctx->ex_pos, ctx->ex_len);
+ if (ret <= 0)
+ break;
+ ctx->ex_len -= ret;
+ if (ctx->ex_len > 0)
+ ctx->ex_pos += ret;
+ else {
+ if (cleanup)
+ cleanup(b, &ctx->ex_buf, &ctx->ex_len, &ctx->ex_arg);
+ ctx->state = next;
+ ctx->ex_pos = 0;
+ break;
+ }
+ }
+ return ret;
+}
+
+static int asn1_bio_setup_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
+ asn1_ps_func *setup,
+ asn1_bio_state_t ex_state,
+ asn1_bio_state_t other_state)
+{
+ if (setup && !setup(b, &ctx->ex_buf, &ctx->ex_len, &ctx->ex_arg)) {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
+ if (ctx->ex_len > 0)
+ ctx->state = ex_state;
+ else
+ ctx->state = other_state;
+ return 1;
+}
+
+static int asn1_bio_read(BIO *b, char *in, int inl)
+{
+ BIO *next = BIO_next(b);
+ if (next == NULL)
+ return 0;
+ return BIO_read(next, in, inl);
+}
+
+static int asn1_bio_puts(BIO *b, const char *str)
+{
+ return asn1_bio_write(b, str, strlen(str));
+}
+
+static int asn1_bio_gets(BIO *b, char *str, int size)
+{
+ BIO *next = BIO_next(b);
+ if (next == NULL)
+ return 0;
+ return BIO_gets(next, str, size);
+}
+
+static long asn1_bio_callback_ctrl(BIO *b, int cmd, BIO_info_cb *fp)
+{
+ BIO *next = BIO_next(b);
+ if (next == NULL)
+ return 0;
+ return BIO_callback_ctrl(next, cmd, fp);
+}
+
+static long asn1_bio_ctrl(BIO *b, int cmd, long arg1, void *arg2)
+{
+ BIO_ASN1_BUF_CTX *ctx;
+ BIO_ASN1_EX_FUNCS *ex_func;
+ long ret = 1;
+ BIO *next;
+
+ ctx = BIO_get_data(b);
+ if (ctx == NULL)
+ return 0;
+ next = BIO_next(b);
+ switch (cmd) {
+
+ case BIO_C_SET_PREFIX:
+ ex_func = arg2;
+ ctx->prefix = ex_func->ex_func;
+ ctx->prefix_free = ex_func->ex_free_func;
+ break;
+
+ case BIO_C_GET_PREFIX:
+ ex_func = arg2;
+ ex_func->ex_func = ctx->prefix;
+ ex_func->ex_free_func = ctx->prefix_free;
+ break;
+
+ case BIO_C_SET_SUFFIX:
+ ex_func = arg2;
+ ctx->suffix = ex_func->ex_func;
+ ctx->suffix_free = ex_func->ex_free_func;
+ break;
+
+ case BIO_C_GET_SUFFIX:
+ ex_func = arg2;
+ ex_func->ex_func = ctx->suffix;
+ ex_func->ex_free_func = ctx->suffix_free;
+ break;
+
+ case BIO_C_SET_EX_ARG:
+ ctx->ex_arg = arg2;
+ break;
+
+ case BIO_C_GET_EX_ARG:
+ *(void **)arg2 = ctx->ex_arg;
+ break;
+
+ case BIO_CTRL_FLUSH:
+ if (next == NULL)
+ return 0;
+
+ /* Call post function if possible */
+ if (ctx->state == ASN1_STATE_HEADER) {
+ if (!asn1_bio_setup_ex(b, ctx, ctx->suffix,
+ ASN1_STATE_POST_COPY, ASN1_STATE_DONE))
+ return 0;
+ }
+
+ if (ctx->state == ASN1_STATE_POST_COPY) {
+ ret = asn1_bio_flush_ex(b, ctx, ctx->suffix_free,
+ ASN1_STATE_DONE);
+ if (ret <= 0)
+ return ret;
+ }
+
+ if (ctx->state == ASN1_STATE_DONE)
+ return BIO_ctrl(next, cmd, arg1, arg2);
+ else {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
+
+ default:
+ if (next == NULL)
+ return 0;
+ return BIO_ctrl(next, cmd, arg1, arg2);
+
+ }
+
+ return ret;
+}
+
+static int asn1_bio_set_ex(BIO *b, int cmd,
+ asn1_ps_func *ex_func, asn1_ps_func *ex_free_func)
+{
+ BIO_ASN1_EX_FUNCS extmp;
+ extmp.ex_func = ex_func;
+ extmp.ex_free_func = ex_free_func;
+ return BIO_ctrl(b, cmd, 0, &extmp);
+}
+
+static int asn1_bio_get_ex(BIO *b, int cmd,
+ asn1_ps_func **ex_func,
+ asn1_ps_func **ex_free_func)
+{
+ BIO_ASN1_EX_FUNCS extmp;
+ int ret;
+ ret = BIO_ctrl(b, cmd, 0, &extmp);
+ if (ret > 0) {
+ *ex_func = extmp.ex_func;
+ *ex_free_func = extmp.ex_free_func;
+ }
+ return ret;
+}
+
+int BIO_asn1_set_prefix(BIO *b, asn1_ps_func *prefix,
+ asn1_ps_func *prefix_free)
+{
+ return asn1_bio_set_ex(b, BIO_C_SET_PREFIX, prefix, prefix_free);
+}
+
+int BIO_asn1_get_prefix(BIO *b, asn1_ps_func **pprefix,
+ asn1_ps_func **pprefix_free)
+{
+ return asn1_bio_get_ex(b, BIO_C_GET_PREFIX, pprefix, pprefix_free);
+}
+
+int BIO_asn1_set_suffix(BIO *b, asn1_ps_func *suffix,
+ asn1_ps_func *suffix_free)
+{
+ return asn1_bio_set_ex(b, BIO_C_SET_SUFFIX, suffix, suffix_free);
+}
+
+int BIO_asn1_get_suffix(BIO *b, asn1_ps_func **psuffix,
+ asn1_ps_func **psuffix_free)
+{
+ return asn1_bio_get_ex(b, BIO_C_GET_SUFFIX, psuffix, psuffix_free);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/bio_ndef.c b/openssl-1.1.0h/crypto/asn1/bio_ndef.c
new file mode 100644
index 0000000..0f206b2
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/bio_ndef.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+#include <openssl/bio.h>
+#include <openssl/err.h>
+
+#include <stdio.h>
+
+/* Experimental NDEF ASN1 BIO support routines */
+
+/*
+ * The usage is quite simple, initialize an ASN1 structure, get a BIO from it
+ * then any data written through the BIO will end up translated to
+ * appropriate format on the fly. The data is streamed out and does *not*
+ * need to be all held in memory at once. When the BIO is flushed the output
+ * is finalized and any signatures etc written out. The BIO is a 'proper'
+ * BIO and can handle non blocking I/O correctly. The usage is simple. The
+ * implementation is *not*...
+ */
+
+/* BIO support data stored in the ASN1 BIO ex_arg */
+
+typedef struct ndef_aux_st {
+ /* ASN1 structure this BIO refers to */
+ ASN1_VALUE *val;
+ const ASN1_ITEM *it;
+ /* Top of the BIO chain */
+ BIO *ndef_bio;
+ /* Output BIO */
+ BIO *out;
+ /* Boundary where content is inserted */
+ unsigned char **boundary;
+ /* DER buffer start */
+ unsigned char *derbuf;
+} NDEF_SUPPORT;
+
+static int ndef_prefix(BIO *b, unsigned char **pbuf, int *plen, void *parg);
+static int ndef_prefix_free(BIO *b, unsigned char **pbuf, int *plen,
+ void *parg);
+static int ndef_suffix(BIO *b, unsigned char **pbuf, int *plen, void *parg);
+static int ndef_suffix_free(BIO *b, unsigned char **pbuf, int *plen,
+ void *parg);
+
+BIO *BIO_new_NDEF(BIO *out, ASN1_VALUE *val, const ASN1_ITEM *it)
+{
+ NDEF_SUPPORT *ndef_aux = NULL;
+ BIO *asn_bio = NULL;
+ const ASN1_AUX *aux = it->funcs;
+ ASN1_STREAM_ARG sarg;
+
+ if (!aux || !aux->asn1_cb) {
+ ASN1err(ASN1_F_BIO_NEW_NDEF, ASN1_R_STREAMING_NOT_SUPPORTED);
+ return NULL;
+ }
+ ndef_aux = OPENSSL_zalloc(sizeof(*ndef_aux));
+ asn_bio = BIO_new(BIO_f_asn1());
+ if (ndef_aux == NULL || asn_bio == NULL)
+ goto err;
+
+ /* ASN1 bio needs to be next to output BIO */
+ out = BIO_push(asn_bio, out);
+ if (out == NULL)
+ goto err;
+
+ BIO_asn1_set_prefix(asn_bio, ndef_prefix, ndef_prefix_free);
+ BIO_asn1_set_suffix(asn_bio, ndef_suffix, ndef_suffix_free);
+
+ /*
+ * Now let callback prepends any digest, cipher etc BIOs ASN1 structure
+ * needs.
+ */
+
+ sarg.out = out;
+ sarg.ndef_bio = NULL;
+ sarg.boundary = NULL;
+
+ if (aux->asn1_cb(ASN1_OP_STREAM_PRE, &val, it, &sarg) <= 0)
+ goto err;
+
+ ndef_aux->val = val;
+ ndef_aux->it = it;
+ ndef_aux->ndef_bio = sarg.ndef_bio;
+ ndef_aux->boundary = sarg.boundary;
+ ndef_aux->out = out;
+
+ BIO_ctrl(asn_bio, BIO_C_SET_EX_ARG, 0, ndef_aux);
+
+ return sarg.ndef_bio;
+
+ err:
+ BIO_free(asn_bio);
+ OPENSSL_free(ndef_aux);
+ return NULL;
+}
+
+static int ndef_prefix(BIO *b, unsigned char **pbuf, int *plen, void *parg)
+{
+ NDEF_SUPPORT *ndef_aux;
+ unsigned char *p;
+ int derlen;
+
+ if (!parg)
+ return 0;
+
+ ndef_aux = *(NDEF_SUPPORT **)parg;
+
+ derlen = ASN1_item_ndef_i2d(ndef_aux->val, NULL, ndef_aux->it);
+ p = OPENSSL_malloc(derlen);
+ if (p == NULL)
+ return 0;
+
+ ndef_aux->derbuf = p;
+ *pbuf = p;
+ derlen = ASN1_item_ndef_i2d(ndef_aux->val, &p, ndef_aux->it);
+
+ if (!*ndef_aux->boundary)
+ return 0;
+
+ *plen = *ndef_aux->boundary - *pbuf;
+
+ return 1;
+}
+
+static int ndef_prefix_free(BIO *b, unsigned char **pbuf, int *plen,
+ void *parg)
+{
+ NDEF_SUPPORT *ndef_aux;
+
+ if (!parg)
+ return 0;
+
+ ndef_aux = *(NDEF_SUPPORT **)parg;
+
+ OPENSSL_free(ndef_aux->derbuf);
+
+ ndef_aux->derbuf = NULL;
+ *pbuf = NULL;
+ *plen = 0;
+ return 1;
+}
+
+static int ndef_suffix_free(BIO *b, unsigned char **pbuf, int *plen,
+ void *parg)
+{
+ NDEF_SUPPORT **pndef_aux = (NDEF_SUPPORT **)parg;
+ if (!ndef_prefix_free(b, pbuf, plen, parg))
+ return 0;
+ OPENSSL_free(*pndef_aux);
+ *pndef_aux = NULL;
+ return 1;
+}
+
+static int ndef_suffix(BIO *b, unsigned char **pbuf, int *plen, void *parg)
+{
+ NDEF_SUPPORT *ndef_aux;
+ unsigned char *p;
+ int derlen;
+ const ASN1_AUX *aux;
+ ASN1_STREAM_ARG sarg;
+
+ if (!parg)
+ return 0;
+
+ ndef_aux = *(NDEF_SUPPORT **)parg;
+
+ aux = ndef_aux->it->funcs;
+
+ /* Finalize structures */
+ sarg.ndef_bio = ndef_aux->ndef_bio;
+ sarg.out = ndef_aux->out;
+ sarg.boundary = ndef_aux->boundary;
+ if (aux->asn1_cb(ASN1_OP_STREAM_POST,
+ &ndef_aux->val, ndef_aux->it, &sarg) <= 0)
+ return 0;
+
+ derlen = ASN1_item_ndef_i2d(ndef_aux->val, NULL, ndef_aux->it);
+ p = OPENSSL_malloc(derlen);
+ if (p == NULL)
+ return 0;
+
+ ndef_aux->derbuf = p;
+ *pbuf = p;
+ derlen = ASN1_item_ndef_i2d(ndef_aux->val, &p, ndef_aux->it);
+
+ if (!*ndef_aux->boundary)
+ return 0;
+ *pbuf = *ndef_aux->boundary;
+ *plen = derlen - (*ndef_aux->boundary - ndef_aux->derbuf);
+
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/build.info b/openssl-1.1.0h/crypto/asn1/build.info
new file mode 100644
index 0000000..c1afb71
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/build.info
@@ -0,0 +1,16 @@
+LIBS=../../libcrypto
+SOURCE[../../libcrypto]=\
+ a_object.c a_bitstr.c a_utctm.c a_gentm.c a_time.c a_int.c a_octet.c \
+ a_print.c a_type.c a_dup.c a_d2i_fp.c a_i2d_fp.c \
+ a_utf8.c a_sign.c a_digest.c a_verify.c a_mbstr.c a_strex.c \
+ x_algor.c x_val.c x_sig.c x_bignum.c \
+ x_long.c x_int64.c x_info.c x_spki.c nsseq.c \
+ d2i_pu.c d2i_pr.c i2d_pu.c i2d_pr.c\
+ t_pkey.c t_spki.c t_bitst.c \
+ tasn_new.c tasn_fre.c tasn_enc.c tasn_dec.c tasn_utl.c tasn_typ.c \
+ tasn_prn.c tasn_scn.c ameth_lib.c \
+ f_int.c f_string.c n_pkey.c \
+ x_pkey.c bio_asn1.c bio_ndef.c asn_mime.c \
+ asn1_gen.c asn1_par.c asn1_lib.c asn1_err.c a_strnid.c \
+ evp_asn1.c asn_pack.c p5_pbe.c p5_pbev2.c p5_scrypt.c p8_pkey.c \
+ asn_moid.c asn_mstbl.c
diff --git a/openssl-1.1.0h/crypto/asn1/charmap.h b/openssl-1.1.0h/crypto/asn1/charmap.h
new file mode 100644
index 0000000..2a75925
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/charmap.h
@@ -0,0 +1,34 @@
+/*
+ * WARNING: do not edit!
+ * Generated by crypto/asn1/charmap.pl
+ *
+ * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#define CHARTYPE_HOST_ANY 4096
+#define CHARTYPE_HOST_DOT 8192
+#define CHARTYPE_HOST_HYPHEN 16384
+#define CHARTYPE_HOST_WILD 32768
+
+/*
+ * Mask of various character properties
+ */
+
+static const unsigned short char_type[] = {
+ 1026, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 120, 0, 1, 40,
+ 0, 0, 0, 16, 1040, 1040, 33792, 25, 25, 16400, 8208, 16,
+ 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 16, 9,
+ 9, 16, 9, 16, 0, 4112, 4112, 4112, 4112, 4112, 4112, 4112,
+ 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112,
+ 4112, 4112, 4112, 4112, 4112, 4112, 4112, 0, 1025, 0, 0, 0,
+ 0, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112,
+ 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112, 4112,
+ 4112, 4112, 4112, 0, 0, 0, 0, 2
+};
diff --git a/openssl-1.1.0h/crypto/asn1/charmap.pl b/openssl-1.1.0h/crypto/asn1/charmap.pl
new file mode 100644
index 0000000..26ca325
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/charmap.pl
@@ -0,0 +1,117 @@
+#! /usr/bin/env perl
+# Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+use strict;
+
+my ($i, @arr);
+
+# Set up an array with the type of ASCII characters
+# Each set bit represents a character property.
+
+# RFC2253 character properties
+my $RFC2253_ESC = 1; # Character escaped with \
+my $ESC_CTRL = 2; # Escaped control character
+# These are used with RFC1779 quoting using "
+my $NOESC_QUOTE = 8; # Not escaped if quoted
+my $PSTRING_CHAR = 0x10; # Valid PrintableString character
+my $RFC2253_FIRST_ESC = 0x20; # Escaped with \ if first character
+my $RFC2253_LAST_ESC = 0x40; # Escaped with \ if last character
+my $RFC2254_ESC = 0x400; # Character escaped \XX
+my $HOST_ANY = 0x1000; # Valid hostname character anywhere in label
+my $HOST_DOT = 0x2000; # Dot: hostname label separator
+my $HOST_HYPHEN = 0x4000; # Hyphen: not valid at start or end.
+my $HOST_WILD = 0x8000; # Wildcard character
+
+for($i = 0; $i < 128; $i++) {
+ # Set the RFC2253 escape characters (control)
+ $arr[$i] = 0;
+ if(($i < 32) || ($i > 126)) {
+ $arr[$i] |= $ESC_CTRL;
+ }
+
+ # Some PrintableString characters
+ if( ( ( $i >= ord("a")) && ( $i <= ord("z")) )
+ || ( ( $i >= ord("A")) && ( $i <= ord("Z")) )
+ || ( ( $i >= ord("0")) && ( $i <= ord("9")) ) ) {
+ $arr[$i] |= $PSTRING_CHAR | $HOST_ANY;
+ }
+}
+
+# Now setup the rest
+
+# Remaining RFC2253 escaped characters
+
+$arr[ord(" ")] |= $NOESC_QUOTE | $RFC2253_FIRST_ESC | $RFC2253_LAST_ESC;
+$arr[ord("#")] |= $NOESC_QUOTE | $RFC2253_FIRST_ESC;
+
+$arr[ord(",")] |= $NOESC_QUOTE | $RFC2253_ESC;
+$arr[ord("+")] |= $NOESC_QUOTE | $RFC2253_ESC;
+$arr[ord("\"")] |= $RFC2253_ESC;
+$arr[ord("\\")] |= $RFC2253_ESC;
+$arr[ord("<")] |= $NOESC_QUOTE | $RFC2253_ESC;
+$arr[ord(">")] |= $NOESC_QUOTE | $RFC2253_ESC;
+$arr[ord(";")] |= $NOESC_QUOTE | $RFC2253_ESC;
+
+# Remaining RFC2254 characters
+
+$arr[0] |= $RFC2254_ESC;
+$arr[ord("(")] |= $RFC2254_ESC;
+$arr[ord(")")] |= $RFC2254_ESC;
+$arr[ord("*")] |= $RFC2254_ESC | $HOST_WILD;
+$arr[ord("\\")] |= $RFC2254_ESC;
+
+# Remaining PrintableString characters
+
+$arr[ord(" ")] |= $PSTRING_CHAR;
+$arr[ord("'")] |= $PSTRING_CHAR;
+$arr[ord("(")] |= $PSTRING_CHAR;
+$arr[ord(")")] |= $PSTRING_CHAR;
+$arr[ord("+")] |= $PSTRING_CHAR;
+$arr[ord(",")] |= $PSTRING_CHAR;
+$arr[ord("-")] |= $PSTRING_CHAR | $HOST_HYPHEN;
+$arr[ord(".")] |= $PSTRING_CHAR | $HOST_DOT;
+$arr[ord("/")] |= $PSTRING_CHAR;
+$arr[ord(":")] |= $PSTRING_CHAR;
+$arr[ord("=")] |= $PSTRING_CHAR;
+$arr[ord("?")] |= $PSTRING_CHAR;
+
+# Now generate the C code
+
+print <<EOF;
+/*
+ * WARNING: do not edit!
+ * Generated by crypto/asn1/charmap.pl
+ *
+ * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#define CHARTYPE_HOST_ANY $HOST_ANY
+#define CHARTYPE_HOST_DOT $HOST_DOT
+#define CHARTYPE_HOST_HYPHEN $HOST_HYPHEN
+#define CHARTYPE_HOST_WILD $HOST_WILD
+
+/*
+ * Mask of various character properties
+ */
+
+static const unsigned short char_type[] = {
+EOF
+
+print " ";
+for($i = 0; $i < 128; $i++) {
+ print("\n ") if($i && (($i % 12) == 0));
+ printf(" %4d", $arr[$i]);
+ print(",") if ($i != 127);
+}
+print("\n};\n");
+
diff --git a/openssl-1.1.0h/crypto/asn1/d2i_pr.c b/openssl-1.1.0h/crypto/asn1/d2i_pr.c
new file mode 100644
index 0000000..e311b90
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/d2i_pr.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/bn.h>
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/engine.h>
+#include <openssl/x509.h>
+#include <openssl/asn1.h>
+#include "internal/asn1_int.h"
+#include "internal/evp_int.h"
+
+EVP_PKEY *d2i_PrivateKey(int type, EVP_PKEY **a, const unsigned char **pp,
+ long length)
+{
+ EVP_PKEY *ret;
+ const unsigned char *p = *pp;
+
+ if ((a == NULL) || (*a == NULL)) {
+ if ((ret = EVP_PKEY_new()) == NULL) {
+ ASN1err(ASN1_F_D2I_PRIVATEKEY, ERR_R_EVP_LIB);
+ return (NULL);
+ }
+ } else {
+ ret = *a;
+#ifndef OPENSSL_NO_ENGINE
+ ENGINE_finish(ret->engine);
+ ret->engine = NULL;
+#endif
+ }
+
+ if (!EVP_PKEY_set_type(ret, type)) {
+ ASN1err(ASN1_F_D2I_PRIVATEKEY, ASN1_R_UNKNOWN_PUBLIC_KEY_TYPE);
+ goto err;
+ }
+
+ if (!ret->ameth->old_priv_decode ||
+ !ret->ameth->old_priv_decode(ret, &p, length)) {
+ if (ret->ameth->priv_decode) {
+ EVP_PKEY *tmp;
+ PKCS8_PRIV_KEY_INFO *p8 = NULL;
+ p8 = d2i_PKCS8_PRIV_KEY_INFO(NULL, &p, length);
+ if (!p8)
+ goto err;
+ tmp = EVP_PKCS82PKEY(p8);
+ PKCS8_PRIV_KEY_INFO_free(p8);
+ if (tmp == NULL)
+ goto err;
+ EVP_PKEY_free(ret);
+ ret = tmp;
+ } else {
+ ASN1err(ASN1_F_D2I_PRIVATEKEY, ERR_R_ASN1_LIB);
+ goto err;
+ }
+ }
+ *pp = p;
+ if (a != NULL)
+ (*a) = ret;
+ return (ret);
+ err:
+ if (a == NULL || *a != ret)
+ EVP_PKEY_free(ret);
+ return (NULL);
+}
+
+/*
+ * This works like d2i_PrivateKey() except it automatically works out the
+ * type
+ */
+
+EVP_PKEY *d2i_AutoPrivateKey(EVP_PKEY **a, const unsigned char **pp,
+ long length)
+{
+ STACK_OF(ASN1_TYPE) *inkey;
+ const unsigned char *p;
+ int keytype;
+ p = *pp;
+ /*
+ * Dirty trick: read in the ASN1 data into a STACK_OF(ASN1_TYPE): by
+ * analyzing it we can determine the passed structure: this assumes the
+ * input is surrounded by an ASN1 SEQUENCE.
+ */
+ inkey = d2i_ASN1_SEQUENCE_ANY(NULL, &p, length);
+ p = *pp;
+ /*
+ * Since we only need to discern "traditional format" RSA and DSA keys we
+ * can just count the elements.
+ */
+ if (sk_ASN1_TYPE_num(inkey) == 6)
+ keytype = EVP_PKEY_DSA;
+ else if (sk_ASN1_TYPE_num(inkey) == 4)
+ keytype = EVP_PKEY_EC;
+ else if (sk_ASN1_TYPE_num(inkey) == 3) { /* This seems to be PKCS8, not
+ * traditional format */
+ PKCS8_PRIV_KEY_INFO *p8 = d2i_PKCS8_PRIV_KEY_INFO(NULL, &p, length);
+ EVP_PKEY *ret;
+
+ sk_ASN1_TYPE_pop_free(inkey, ASN1_TYPE_free);
+ if (!p8) {
+ ASN1err(ASN1_F_D2I_AUTOPRIVATEKEY,
+ ASN1_R_UNSUPPORTED_PUBLIC_KEY_TYPE);
+ return NULL;
+ }
+ ret = EVP_PKCS82PKEY(p8);
+ PKCS8_PRIV_KEY_INFO_free(p8);
+ if (ret == NULL)
+ return NULL;
+ *pp = p;
+ if (a) {
+ *a = ret;
+ }
+ return ret;
+ } else
+ keytype = EVP_PKEY_RSA;
+ sk_ASN1_TYPE_pop_free(inkey, ASN1_TYPE_free);
+ return d2i_PrivateKey(keytype, a, pp, length);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/d2i_pu.c b/openssl-1.1.0h/crypto/asn1/d2i_pu.c
new file mode 100644
index 0000000..dfdc1a6
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/d2i_pu.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/bn.h>
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/asn1.h>
+#include <openssl/rsa.h>
+#include <openssl/dsa.h>
+#include <openssl/ec.h>
+
+#include "internal/evp_int.h"
+
+EVP_PKEY *d2i_PublicKey(int type, EVP_PKEY **a, const unsigned char **pp,
+ long length)
+{
+ EVP_PKEY *ret;
+
+ if ((a == NULL) || (*a == NULL)) {
+ if ((ret = EVP_PKEY_new()) == NULL) {
+ ASN1err(ASN1_F_D2I_PUBLICKEY, ERR_R_EVP_LIB);
+ return (NULL);
+ }
+ } else
+ ret = *a;
+
+ if (!EVP_PKEY_set_type(ret, type)) {
+ ASN1err(ASN1_F_D2I_PUBLICKEY, ERR_R_EVP_LIB);
+ goto err;
+ }
+
+ switch (EVP_PKEY_id(ret)) {
+#ifndef OPENSSL_NO_RSA
+ case EVP_PKEY_RSA:
+ if ((ret->pkey.rsa = d2i_RSAPublicKey(NULL, pp, length)) == NULL) {
+ ASN1err(ASN1_F_D2I_PUBLICKEY, ERR_R_ASN1_LIB);
+ goto err;
+ }
+ break;
+#endif
+#ifndef OPENSSL_NO_DSA
+ case EVP_PKEY_DSA:
+ /* TMP UGLY CAST */
+ if (!d2i_DSAPublicKey(&ret->pkey.dsa, pp, length)) {
+ ASN1err(ASN1_F_D2I_PUBLICKEY, ERR_R_ASN1_LIB);
+ goto err;
+ }
+ break;
+#endif
+#ifndef OPENSSL_NO_EC
+ case EVP_PKEY_EC:
+ if (!o2i_ECPublicKey(&ret->pkey.ec, pp, length)) {
+ ASN1err(ASN1_F_D2I_PUBLICKEY, ERR_R_ASN1_LIB);
+ goto err;
+ }
+ break;
+#endif
+ default:
+ ASN1err(ASN1_F_D2I_PUBLICKEY, ASN1_R_UNKNOWN_PUBLIC_KEY_TYPE);
+ goto err;
+ /* break; */
+ }
+ if (a != NULL)
+ (*a) = ret;
+ return (ret);
+ err:
+ if (a == NULL || *a != ret)
+ EVP_PKEY_free(ret);
+ return (NULL);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/evp_asn1.c b/openssl-1.1.0h/crypto/asn1/evp_asn1.c
new file mode 100644
index 0000000..a458367
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/evp_asn1.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+
+int ASN1_TYPE_set_octetstring(ASN1_TYPE *a, unsigned char *data, int len)
+{
+ ASN1_STRING *os;
+
+ if ((os = ASN1_OCTET_STRING_new()) == NULL)
+ return (0);
+ if (!ASN1_OCTET_STRING_set(os, data, len)) {
+ ASN1_OCTET_STRING_free(os);
+ return 0;
+ }
+ ASN1_TYPE_set(a, V_ASN1_OCTET_STRING, os);
+ return (1);
+}
+
+/* int max_len: for returned value */
+int ASN1_TYPE_get_octetstring(const ASN1_TYPE *a, unsigned char *data, int max_len)
+{
+ int ret, num;
+ const unsigned char *p;
+
+ if ((a->type != V_ASN1_OCTET_STRING) || (a->value.octet_string == NULL)) {
+ ASN1err(ASN1_F_ASN1_TYPE_GET_OCTETSTRING, ASN1_R_DATA_IS_WRONG);
+ return (-1);
+ }
+ p = ASN1_STRING_get0_data(a->value.octet_string);
+ ret = ASN1_STRING_length(a->value.octet_string);
+ if (ret < max_len)
+ num = ret;
+ else
+ num = max_len;
+ memcpy(data, p, num);
+ return (ret);
+}
+
+typedef struct {
+ long num;
+ ASN1_OCTET_STRING *oct;
+} asn1_int_oct;
+
+ASN1_SEQUENCE(asn1_int_oct) = {
+ ASN1_SIMPLE(asn1_int_oct, num, LONG),
+ ASN1_SIMPLE(asn1_int_oct, oct, ASN1_OCTET_STRING)
+} static_ASN1_SEQUENCE_END(asn1_int_oct)
+
+DECLARE_ASN1_ITEM(asn1_int_oct)
+
+int ASN1_TYPE_set_int_octetstring(ASN1_TYPE *a, long num, unsigned char *data,
+ int len)
+{
+ asn1_int_oct atmp;
+ ASN1_OCTET_STRING oct;
+
+ atmp.num = num;
+ atmp.oct = &oct;
+ oct.data = data;
+ oct.type = V_ASN1_OCTET_STRING;
+ oct.length = len;
+ oct.flags = 0;
+
+ if (ASN1_TYPE_pack_sequence(ASN1_ITEM_rptr(asn1_int_oct), &atmp, &a))
+ return 1;
+ return 0;
+}
+
+/*
+ * we return the actual length...
+ */
+/* int max_len: for returned value */
+int ASN1_TYPE_get_int_octetstring(const ASN1_TYPE *a, long *num,
+ unsigned char *data, int max_len)
+{
+ asn1_int_oct *atmp = NULL;
+ int ret = -1, n;
+
+ if ((a->type != V_ASN1_SEQUENCE) || (a->value.sequence == NULL)) {
+ goto err;
+ }
+
+ atmp = ASN1_TYPE_unpack_sequence(ASN1_ITEM_rptr(asn1_int_oct), a);
+
+ if (atmp == NULL)
+ goto err;
+
+ if (num != NULL)
+ *num = atmp->num;
+
+ ret = ASN1_STRING_length(atmp->oct);
+ if (max_len > ret)
+ n = ret;
+ else
+ n = max_len;
+
+ if (data != NULL)
+ memcpy(data, ASN1_STRING_get0_data(atmp->oct), n);
+ if (ret == -1) {
+ err:
+ ASN1err(ASN1_F_ASN1_TYPE_GET_INT_OCTETSTRING, ASN1_R_DATA_IS_WRONG);
+ }
+ M_ASN1_free_of(atmp, asn1_int_oct);
+ return ret;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/f_int.c b/openssl-1.1.0h/crypto/asn1/f_int.c
new file mode 100644
index 0000000..ec556c9
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/f_int.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include "internal/cryptlib.h"
+#include <openssl/buffer.h>
+#include <openssl/asn1.h>
+
+int i2a_ASN1_INTEGER(BIO *bp, const ASN1_INTEGER *a)
+{
+ int i, n = 0;
+ static const char *h = "0123456789ABCDEF";
+ char buf[2];
+
+ if (a == NULL)
+ return (0);
+
+ if (a->type & V_ASN1_NEG) {
+ if (BIO_write(bp, "-", 1) != 1)
+ goto err;
+ n = 1;
+ }
+
+ if (a->length == 0) {
+ if (BIO_write(bp, "00", 2) != 2)
+ goto err;
+ n += 2;
+ } else {
+ for (i = 0; i < a->length; i++) {
+ if ((i != 0) && (i % 35 == 0)) {
+ if (BIO_write(bp, "\\\n", 2) != 2)
+ goto err;
+ n += 2;
+ }
+ buf[0] = h[((unsigned char)a->data[i] >> 4) & 0x0f];
+ buf[1] = h[((unsigned char)a->data[i]) & 0x0f];
+ if (BIO_write(bp, buf, 2) != 2)
+ goto err;
+ n += 2;
+ }
+ }
+ return (n);
+ err:
+ return (-1);
+}
+
+int a2i_ASN1_INTEGER(BIO *bp, ASN1_INTEGER *bs, char *buf, int size)
+{
+ int i, j, k, m, n, again, bufsize;
+ unsigned char *s = NULL, *sp;
+ unsigned char *bufp;
+ int num = 0, slen = 0, first = 1;
+
+ bs->type = V_ASN1_INTEGER;
+
+ bufsize = BIO_gets(bp, buf, size);
+ for (;;) {
+ if (bufsize < 1)
+ goto err;
+ i = bufsize;
+ if (buf[i - 1] == '\n')
+ buf[--i] = '\0';
+ if (i == 0)
+ goto err;
+ if (buf[i - 1] == '\r')
+ buf[--i] = '\0';
+ if (i == 0)
+ goto err;
+ again = (buf[i - 1] == '\\');
+
+ for (j = 0; j < i; j++) {
+#ifndef CHARSET_EBCDIC
+ if (!(((buf[j] >= '0') && (buf[j] <= '9')) ||
+ ((buf[j] >= 'a') && (buf[j] <= 'f')) ||
+ ((buf[j] >= 'A') && (buf[j] <= 'F'))))
+#else
+ /*
+ * This #ifdef is not strictly necessary, since the characters
+ * A...F a...f 0...9 are contiguous (yes, even in EBCDIC - but
+ * not the whole alphabet). Nevertheless, isxdigit() is faster.
+ */
+ if (!isxdigit(buf[j]))
+#endif
+ {
+ i = j;
+ break;
+ }
+ }
+ buf[i] = '\0';
+ /*
+ * We have now cleared all the crap off the end of the line
+ */
+ if (i < 2)
+ goto err;
+
+ bufp = (unsigned char *)buf;
+ if (first) {
+ first = 0;
+ if ((bufp[0] == '0') && (bufp[1] == '0')) {
+ bufp += 2;
+ i -= 2;
+ }
+ }
+ k = 0;
+ i -= again;
+ if (i % 2 != 0) {
+ ASN1err(ASN1_F_A2I_ASN1_INTEGER, ASN1_R_ODD_NUMBER_OF_CHARS);
+ OPENSSL_free(s);
+ return 0;
+ }
+ i /= 2;
+ if (num + i > slen) {
+ sp = OPENSSL_clear_realloc(s, slen, num + i * 2);
+ if (sp == NULL) {
+ ASN1err(ASN1_F_A2I_ASN1_INTEGER, ERR_R_MALLOC_FAILURE);
+ OPENSSL_free(s);
+ return 0;
+ }
+ s = sp;
+ slen = num + i * 2;
+ }
+ for (j = 0; j < i; j++, k += 2) {
+ for (n = 0; n < 2; n++) {
+ m = OPENSSL_hexchar2int(bufp[k + n]);
+ if (m < 0) {
+ ASN1err(ASN1_F_A2I_ASN1_INTEGER,
+ ASN1_R_NON_HEX_CHARACTERS);
+ goto err;
+ }
+ s[num + j] <<= 4;
+ s[num + j] |= m;
+ }
+ }
+ num += i;
+ if (again)
+ bufsize = BIO_gets(bp, buf, size);
+ else
+ break;
+ }
+ bs->length = num;
+ bs->data = s;
+ return 1;
+ err:
+ ASN1err(ASN1_F_A2I_ASN1_INTEGER, ASN1_R_SHORT_LINE);
+ OPENSSL_free(s);
+ return 0;
+}
+
+int i2a_ASN1_ENUMERATED(BIO *bp, const ASN1_ENUMERATED *a)
+{
+ return i2a_ASN1_INTEGER(bp, a);
+}
+
+int a2i_ASN1_ENUMERATED(BIO *bp, ASN1_ENUMERATED *bs, char *buf, int size)
+{
+ int rv = a2i_ASN1_INTEGER(bp, bs, buf, size);
+ if (rv == 1)
+ bs->type = V_ASN1_INTEGER | (bs->type & V_ASN1_NEG);
+ return rv;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/f_string.c b/openssl-1.1.0h/crypto/asn1/f_string.c
new file mode 100644
index 0000000..b9258bb
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/f_string.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include "internal/cryptlib.h"
+#include <openssl/buffer.h>
+#include <openssl/asn1.h>
+
+int i2a_ASN1_STRING(BIO *bp, const ASN1_STRING *a, int type)
+{
+ int i, n = 0;
+ static const char *h = "0123456789ABCDEF";
+ char buf[2];
+
+ if (a == NULL)
+ return (0);
+
+ if (a->length == 0) {
+ if (BIO_write(bp, "0", 1) != 1)
+ goto err;
+ n = 1;
+ } else {
+ for (i = 0; i < a->length; i++) {
+ if ((i != 0) && (i % 35 == 0)) {
+ if (BIO_write(bp, "\\\n", 2) != 2)
+ goto err;
+ n += 2;
+ }
+ buf[0] = h[((unsigned char)a->data[i] >> 4) & 0x0f];
+ buf[1] = h[((unsigned char)a->data[i]) & 0x0f];
+ if (BIO_write(bp, buf, 2) != 2)
+ goto err;
+ n += 2;
+ }
+ }
+ return (n);
+ err:
+ return (-1);
+}
+
+int a2i_ASN1_STRING(BIO *bp, ASN1_STRING *bs, char *buf, int size)
+{
+ int i, j, k, m, n, again, bufsize, spec_char;
+ unsigned char *s = NULL, *sp;
+ unsigned char *bufp;
+ int num = 0, slen = 0, first = 1;
+
+ bufsize = BIO_gets(bp, buf, size);
+ for (;;) {
+ if (bufsize < 1) {
+ if (first)
+ break;
+ else
+ goto err;
+ }
+ first = 0;
+
+ i = bufsize;
+ if (buf[i - 1] == '\n')
+ buf[--i] = '\0';
+ if (i == 0)
+ goto err;
+ if (buf[i - 1] == '\r')
+ buf[--i] = '\0';
+ if (i == 0)
+ goto err;
+ again = (buf[i - 1] == '\\');
+
+ for (j = i - 1; j > 0; j--) {
+#ifndef CHARSET_EBCDIC
+ spec_char = (!(((buf[j] >= '0') && (buf[j] <= '9')) ||
+ ((buf[j] >= 'a') && (buf[j] <= 'f')) ||
+ ((buf[j] >= 'A') && (buf[j] <= 'F'))));
+#else
+ /*
+ * This #ifdef is not strictly necessary, since the characters
+ * A...F a...f 0...9 are contiguous (yes, even in EBCDIC - but
+ * not the whole alphabet). Nevertheless, isxdigit() is faster.
+ */
+ spec_char = (!isxdigit(buf[j]));
+#endif
+ if (spec_char) {
+ i = j;
+ break;
+ }
+ }
+ buf[i] = '\0';
+ /*
+ * We have now cleared all the crap off the end of the line
+ */
+ if (i < 2)
+ goto err;
+
+ bufp = (unsigned char *)buf;
+
+ k = 0;
+ i -= again;
+ if (i % 2 != 0) {
+ ASN1err(ASN1_F_A2I_ASN1_STRING, ASN1_R_ODD_NUMBER_OF_CHARS);
+ OPENSSL_free(s);
+ return 0;
+ }
+ i /= 2;
+ if (num + i > slen) {
+ sp = OPENSSL_realloc(s, (unsigned int)num + i * 2);
+ if (sp == NULL) {
+ ASN1err(ASN1_F_A2I_ASN1_STRING, ERR_R_MALLOC_FAILURE);
+ OPENSSL_free(s);
+ return 0;
+ }
+ s = sp;
+ slen = num + i * 2;
+ }
+ for (j = 0; j < i; j++, k += 2) {
+ for (n = 0; n < 2; n++) {
+ m = OPENSSL_hexchar2int(bufp[k + n]);
+ if (m < 0) {
+ ASN1err(ASN1_F_A2I_ASN1_STRING,
+ ASN1_R_NON_HEX_CHARACTERS);
+ OPENSSL_free(s);
+ return 0;
+ }
+ s[num + j] <<= 4;
+ s[num + j] |= m;
+ }
+ }
+ num += i;
+ if (again)
+ bufsize = BIO_gets(bp, buf, size);
+ else
+ break;
+ }
+ bs->length = num;
+ bs->data = s;
+ return 1;
+
+ err:
+ ASN1err(ASN1_F_A2I_ASN1_STRING, ASN1_R_SHORT_LINE);
+ OPENSSL_free(s);
+ return 0;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/i2d_pr.c b/openssl-1.1.0h/crypto/asn1/i2d_pr.c
new file mode 100644
index 0000000..445b0c8
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/i2d_pr.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/evp.h>
+#include <openssl/x509.h>
+#include "internal/asn1_int.h"
+#include "internal/evp_int.h"
+
+int i2d_PrivateKey(EVP_PKEY *a, unsigned char **pp)
+{
+ if (a->ameth && a->ameth->old_priv_encode) {
+ return a->ameth->old_priv_encode(a, pp);
+ }
+ if (a->ameth && a->ameth->priv_encode) {
+ PKCS8_PRIV_KEY_INFO *p8 = EVP_PKEY2PKCS8(a);
+ int ret = 0;
+ if (p8 != NULL) {
+ ret = i2d_PKCS8_PRIV_KEY_INFO(p8, pp);
+ PKCS8_PRIV_KEY_INFO_free(p8);
+ }
+ return ret;
+ }
+ ASN1err(ASN1_F_I2D_PRIVATEKEY, ASN1_R_UNSUPPORTED_PUBLIC_KEY_TYPE);
+ return -1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/i2d_pu.c b/openssl-1.1.0h/crypto/asn1/i2d_pu.c
new file mode 100644
index 0000000..8986c43
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/i2d_pu.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/bn.h>
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/rsa.h>
+#include <openssl/dsa.h>
+#include <openssl/ec.h>
+
+int i2d_PublicKey(EVP_PKEY *a, unsigned char **pp)
+{
+ switch (EVP_PKEY_id(a)) {
+#ifndef OPENSSL_NO_RSA
+ case EVP_PKEY_RSA:
+ return i2d_RSAPublicKey(EVP_PKEY_get0_RSA(a), pp);
+#endif
+#ifndef OPENSSL_NO_DSA
+ case EVP_PKEY_DSA:
+ return i2d_DSAPublicKey(EVP_PKEY_get0_DSA(a), pp);
+#endif
+#ifndef OPENSSL_NO_EC
+ case EVP_PKEY_EC:
+ return i2o_ECPublicKey(EVP_PKEY_get0_EC_KEY(a), pp);
+#endif
+ default:
+ ASN1err(ASN1_F_I2D_PUBLICKEY, ASN1_R_UNSUPPORTED_PUBLIC_KEY_TYPE);
+ return -1;
+ }
+}
diff --git a/openssl-1.1.0h/crypto/asn1/n_pkey.c b/openssl-1.1.0h/crypto/asn1/n_pkey.c
new file mode 100644
index 0000000..267ce60
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/n_pkey.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include "openssl/opensslconf.h"
+#ifdef OPENSSL_NO_RSA
+NON_EMPTY_TRANSLATION_UNIT
+#else
+
+# include "internal/cryptlib.h"
+# include <stdio.h>
+# include <openssl/rsa.h>
+# include <openssl/objects.h>
+# include <openssl/asn1t.h>
+# include <openssl/evp.h>
+# include <openssl/x509.h>
+
+# ifndef OPENSSL_NO_RC4
+
+typedef struct netscape_pkey_st {
+ long version;
+ X509_ALGOR *algor;
+ ASN1_OCTET_STRING *private_key;
+} NETSCAPE_PKEY;
+
+typedef struct netscape_encrypted_pkey_st {
+ ASN1_OCTET_STRING *os;
+ /*
+ * This is the same structure as DigestInfo so use it: although this
+ * isn't really anything to do with digests.
+ */
+ X509_SIG *enckey;
+} NETSCAPE_ENCRYPTED_PKEY;
+
+
+ASN1_BROKEN_SEQUENCE(NETSCAPE_ENCRYPTED_PKEY) = {
+ ASN1_SIMPLE(NETSCAPE_ENCRYPTED_PKEY, os, ASN1_OCTET_STRING),
+ ASN1_SIMPLE(NETSCAPE_ENCRYPTED_PKEY, enckey, X509_SIG)
+} static_ASN1_BROKEN_SEQUENCE_END(NETSCAPE_ENCRYPTED_PKEY)
+
+DECLARE_ASN1_FUNCTIONS_const(NETSCAPE_ENCRYPTED_PKEY)
+DECLARE_ASN1_ENCODE_FUNCTIONS_const(NETSCAPE_ENCRYPTED_PKEY,NETSCAPE_ENCRYPTED_PKEY)
+IMPLEMENT_ASN1_FUNCTIONS_const(NETSCAPE_ENCRYPTED_PKEY)
+
+ASN1_SEQUENCE(NETSCAPE_PKEY) = {
+ ASN1_SIMPLE(NETSCAPE_PKEY, version, LONG),
+ ASN1_SIMPLE(NETSCAPE_PKEY, algor, X509_ALGOR),
+ ASN1_SIMPLE(NETSCAPE_PKEY, private_key, ASN1_OCTET_STRING)
+} static_ASN1_SEQUENCE_END(NETSCAPE_PKEY)
+
+DECLARE_ASN1_FUNCTIONS_const(NETSCAPE_PKEY)
+DECLARE_ASN1_ENCODE_FUNCTIONS_const(NETSCAPE_PKEY,NETSCAPE_PKEY)
+IMPLEMENT_ASN1_FUNCTIONS_const(NETSCAPE_PKEY)
+
+# endif /* OPENSSL_NO_RC4 */
+
+#endif
diff --git a/openssl-1.1.0h/crypto/asn1/nsseq.c b/openssl-1.1.0h/crypto/asn1/nsseq.c
new file mode 100644
index 0000000..c7baf40
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/nsseq.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include <openssl/objects.h>
+
+static int nsseq_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ void *exarg)
+{
+ if (operation == ASN1_OP_NEW_POST) {
+ NETSCAPE_CERT_SEQUENCE *nsseq;
+ nsseq = (NETSCAPE_CERT_SEQUENCE *)*pval;
+ nsseq->type = OBJ_nid2obj(NID_netscape_cert_sequence);
+ }
+ return 1;
+}
+
+/* Netscape certificate sequence structure */
+
+ASN1_SEQUENCE_cb(NETSCAPE_CERT_SEQUENCE, nsseq_cb) = {
+ ASN1_SIMPLE(NETSCAPE_CERT_SEQUENCE, type, ASN1_OBJECT),
+ ASN1_EXP_SEQUENCE_OF_OPT(NETSCAPE_CERT_SEQUENCE, certs, X509, 0)
+} ASN1_SEQUENCE_END_cb(NETSCAPE_CERT_SEQUENCE, NETSCAPE_CERT_SEQUENCE)
+
+IMPLEMENT_ASN1_FUNCTIONS(NETSCAPE_CERT_SEQUENCE)
diff --git a/openssl-1.1.0h/crypto/asn1/p5_pbe.c b/openssl-1.1.0h/crypto/asn1/p5_pbe.c
new file mode 100644
index 0000000..ab7e168
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/p5_pbe.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include <openssl/rand.h>
+
+/* PKCS#5 password based encryption structure */
+
+ASN1_SEQUENCE(PBEPARAM) = {
+ ASN1_SIMPLE(PBEPARAM, salt, ASN1_OCTET_STRING),
+ ASN1_SIMPLE(PBEPARAM, iter, ASN1_INTEGER)
+} ASN1_SEQUENCE_END(PBEPARAM)
+
+IMPLEMENT_ASN1_FUNCTIONS(PBEPARAM)
+
+/* Set an algorithm identifier for a PKCS#5 PBE algorithm */
+
+int PKCS5_pbe_set0_algor(X509_ALGOR *algor, int alg, int iter,
+ const unsigned char *salt, int saltlen)
+{
+ PBEPARAM *pbe = NULL;
+ ASN1_STRING *pbe_str = NULL;
+ unsigned char *sstr = NULL;
+
+ pbe = PBEPARAM_new();
+ if (pbe == NULL) {
+ ASN1err(ASN1_F_PKCS5_PBE_SET0_ALGOR, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (iter <= 0)
+ iter = PKCS5_DEFAULT_ITER;
+ if (!ASN1_INTEGER_set(pbe->iter, iter)) {
+ ASN1err(ASN1_F_PKCS5_PBE_SET0_ALGOR, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (!saltlen)
+ saltlen = PKCS5_SALT_LEN;
+
+ sstr = OPENSSL_malloc(saltlen);
+ if (sstr == NULL) {
+ ASN1err(ASN1_F_PKCS5_PBE_SET0_ALGOR, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (salt)
+ memcpy(sstr, salt, saltlen);
+ else if (RAND_bytes(sstr, saltlen) <= 0)
+ goto err;
+
+ ASN1_STRING_set0(pbe->salt, sstr, saltlen);
+ sstr = NULL;
+
+ if (!ASN1_item_pack(pbe, ASN1_ITEM_rptr(PBEPARAM), &pbe_str)) {
+ ASN1err(ASN1_F_PKCS5_PBE_SET0_ALGOR, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ PBEPARAM_free(pbe);
+ pbe = NULL;
+
+ if (X509_ALGOR_set0(algor, OBJ_nid2obj(alg), V_ASN1_SEQUENCE, pbe_str))
+ return 1;
+
+ err:
+ OPENSSL_free(sstr);
+ PBEPARAM_free(pbe);
+ ASN1_STRING_free(pbe_str);
+ return 0;
+}
+
+/* Return an algorithm identifier for a PKCS#5 PBE algorithm */
+
+X509_ALGOR *PKCS5_pbe_set(int alg, int iter,
+ const unsigned char *salt, int saltlen)
+{
+ X509_ALGOR *ret;
+ ret = X509_ALGOR_new();
+ if (ret == NULL) {
+ ASN1err(ASN1_F_PKCS5_PBE_SET, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ if (PKCS5_pbe_set0_algor(ret, alg, iter, salt, saltlen))
+ return ret;
+
+ X509_ALGOR_free(ret);
+ return NULL;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/p5_pbev2.c b/openssl-1.1.0h/crypto/asn1/p5_pbev2.c
new file mode 100644
index 0000000..14e8700
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/p5_pbev2.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include <openssl/rand.h>
+
+/* PKCS#5 v2.0 password based encryption structures */
+
+ASN1_SEQUENCE(PBE2PARAM) = {
+ ASN1_SIMPLE(PBE2PARAM, keyfunc, X509_ALGOR),
+ ASN1_SIMPLE(PBE2PARAM, encryption, X509_ALGOR)
+} ASN1_SEQUENCE_END(PBE2PARAM)
+
+IMPLEMENT_ASN1_FUNCTIONS(PBE2PARAM)
+
+ASN1_SEQUENCE(PBKDF2PARAM) = {
+ ASN1_SIMPLE(PBKDF2PARAM, salt, ASN1_ANY),
+ ASN1_SIMPLE(PBKDF2PARAM, iter, ASN1_INTEGER),
+ ASN1_OPT(PBKDF2PARAM, keylength, ASN1_INTEGER),
+ ASN1_OPT(PBKDF2PARAM, prf, X509_ALGOR)
+} ASN1_SEQUENCE_END(PBKDF2PARAM)
+
+IMPLEMENT_ASN1_FUNCTIONS(PBKDF2PARAM)
+
+/*
+ * Return an algorithm identifier for a PKCS#5 v2.0 PBE algorithm: yes I know
+ * this is horrible! Extended version to allow application supplied PRF NID
+ * and IV.
+ */
+
+X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
+ unsigned char *salt, int saltlen,
+ unsigned char *aiv, int prf_nid)
+{
+ X509_ALGOR *scheme = NULL, *ret = NULL;
+ int alg_nid, keylen;
+ EVP_CIPHER_CTX *ctx = NULL;
+ unsigned char iv[EVP_MAX_IV_LENGTH];
+ PBE2PARAM *pbe2 = NULL;
+
+ alg_nid = EVP_CIPHER_type(cipher);
+ if (alg_nid == NID_undef) {
+ ASN1err(ASN1_F_PKCS5_PBE2_SET_IV,
+ ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER);
+ goto err;
+ }
+
+ if ((pbe2 = PBE2PARAM_new()) == NULL)
+ goto merr;
+
+ /* Setup the AlgorithmIdentifier for the encryption scheme */
+ scheme = pbe2->encryption;
+ scheme->algorithm = OBJ_nid2obj(alg_nid);
+ if ((scheme->parameter = ASN1_TYPE_new()) == NULL)
+ goto merr;
+
+ /* Create random IV */
+ if (EVP_CIPHER_iv_length(cipher)) {
+ if (aiv)
+ memcpy(iv, aiv, EVP_CIPHER_iv_length(cipher));
+ else if (RAND_bytes(iv, EVP_CIPHER_iv_length(cipher)) <= 0)
+ goto err;
+ }
+
+ ctx = EVP_CIPHER_CTX_new();
+ if (ctx == NULL)
+ goto merr;
+
+ /* Dummy cipherinit to just setup the IV, and PRF */
+ if (!EVP_CipherInit_ex(ctx, cipher, NULL, NULL, iv, 0))
+ goto err;
+ if (EVP_CIPHER_param_to_asn1(ctx, scheme->parameter) < 0) {
+ ASN1err(ASN1_F_PKCS5_PBE2_SET_IV, ASN1_R_ERROR_SETTING_CIPHER_PARAMS);
+ goto err;
+ }
+ /*
+ * If prf NID unspecified see if cipher has a preference. An error is OK
+ * here: just means use default PRF.
+ */
+ if ((prf_nid == -1) &&
+ EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_PBE_PRF_NID, 0, &prf_nid) <= 0) {
+ ERR_clear_error();
+ prf_nid = NID_hmacWithSHA256;
+ }
+ EVP_CIPHER_CTX_free(ctx);
+ ctx = NULL;
+
+ /* If its RC2 then we'd better setup the key length */
+
+ if (alg_nid == NID_rc2_cbc)
+ keylen = EVP_CIPHER_key_length(cipher);
+ else
+ keylen = -1;
+
+ /* Setup keyfunc */
+
+ X509_ALGOR_free(pbe2->keyfunc);
+
+ pbe2->keyfunc = PKCS5_pbkdf2_set(iter, salt, saltlen, prf_nid, keylen);
+
+ if (!pbe2->keyfunc)
+ goto merr;
+
+ /* Now set up top level AlgorithmIdentifier */
+
+ if ((ret = X509_ALGOR_new()) == NULL)
+ goto merr;
+
+ ret->algorithm = OBJ_nid2obj(NID_pbes2);
+
+ /* Encode PBE2PARAM into parameter */
+
+ if (!ASN1_TYPE_pack_sequence(ASN1_ITEM_rptr(PBE2PARAM), pbe2,
+ &ret->parameter))
+ goto merr;
+
+ PBE2PARAM_free(pbe2);
+ pbe2 = NULL;
+
+ return ret;
+
+ merr:
+ ASN1err(ASN1_F_PKCS5_PBE2_SET_IV, ERR_R_MALLOC_FAILURE);
+
+ err:
+ EVP_CIPHER_CTX_free(ctx);
+ PBE2PARAM_free(pbe2);
+ /* Note 'scheme' is freed as part of pbe2 */
+ X509_ALGOR_free(ret);
+
+ return NULL;
+}
+
+X509_ALGOR *PKCS5_pbe2_set(const EVP_CIPHER *cipher, int iter,
+ unsigned char *salt, int saltlen)
+{
+ return PKCS5_pbe2_set_iv(cipher, iter, salt, saltlen, NULL, -1);
+}
+
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+ int prf_nid, int keylen)
+{
+ X509_ALGOR *keyfunc = NULL;
+ PBKDF2PARAM *kdf = NULL;
+ ASN1_OCTET_STRING *osalt = NULL;
+
+ if ((kdf = PBKDF2PARAM_new()) == NULL)
+ goto merr;
+ if ((osalt = ASN1_OCTET_STRING_new()) == NULL)
+ goto merr;
+
+ kdf->salt->value.octet_string = osalt;
+ kdf->salt->type = V_ASN1_OCTET_STRING;
+
+ if (saltlen == 0)
+ saltlen = PKCS5_SALT_LEN;
+ if ((osalt->data = OPENSSL_malloc(saltlen)) == NULL)
+ goto merr;
+
+ osalt->length = saltlen;
+
+ if (salt)
+ memcpy(osalt->data, salt, saltlen);
+ else if (RAND_bytes(osalt->data, saltlen) <= 0)
+ goto merr;
+
+ if (iter <= 0)
+ iter = PKCS5_DEFAULT_ITER;
+
+ if (!ASN1_INTEGER_set(kdf->iter, iter))
+ goto merr;
+
+ /* If have a key len set it up */
+
+ if (keylen > 0) {
+ if ((kdf->keylength = ASN1_INTEGER_new()) == NULL)
+ goto merr;
+ if (!ASN1_INTEGER_set(kdf->keylength, keylen))
+ goto merr;
+ }
+
+ /* prf can stay NULL if we are using hmacWithSHA1 */
+ if (prf_nid > 0 && prf_nid != NID_hmacWithSHA1) {
+ kdf->prf = X509_ALGOR_new();
+ if (kdf->prf == NULL)
+ goto merr;
+ X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid), V_ASN1_NULL, NULL);
+ }
+
+ /* Finally setup the keyfunc structure */
+
+ keyfunc = X509_ALGOR_new();
+ if (keyfunc == NULL)
+ goto merr;
+
+ keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+
+ /* Encode PBKDF2PARAM into parameter of pbe2 */
+
+ if (!ASN1_TYPE_pack_sequence(ASN1_ITEM_rptr(PBKDF2PARAM), kdf,
+ &keyfunc->parameter))
+ goto merr;
+
+ PBKDF2PARAM_free(kdf);
+ return keyfunc;
+
+ merr:
+ ASN1err(ASN1_F_PKCS5_PBKDF2_SET, ERR_R_MALLOC_FAILURE);
+ PBKDF2PARAM_free(kdf);
+ X509_ALGOR_free(keyfunc);
+ return NULL;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/p5_scrypt.c b/openssl-1.1.0h/crypto/asn1/p5_scrypt.c
new file mode 100644
index 0000000..4cb7837
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/p5_scrypt.c
@@ -0,0 +1,283 @@
+/*
+ * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/err.h>
+#include <openssl/evp.h>
+#include <openssl/x509.h>
+#include <openssl/rand.h>
+
+#ifndef OPENSSL_NO_SCRYPT
+/* PKCS#5 scrypt password based encryption structures */
+
+typedef struct {
+ ASN1_OCTET_STRING *salt;
+ ASN1_INTEGER *costParameter;
+ ASN1_INTEGER *blockSize;
+ ASN1_INTEGER *parallelizationParameter;
+ ASN1_INTEGER *keyLength;
+} SCRYPT_PARAMS;
+
+ASN1_SEQUENCE(SCRYPT_PARAMS) = {
+ ASN1_SIMPLE(SCRYPT_PARAMS, salt, ASN1_OCTET_STRING),
+ ASN1_SIMPLE(SCRYPT_PARAMS, costParameter, ASN1_INTEGER),
+ ASN1_SIMPLE(SCRYPT_PARAMS, blockSize, ASN1_INTEGER),
+ ASN1_SIMPLE(SCRYPT_PARAMS, parallelizationParameter, ASN1_INTEGER),
+ ASN1_OPT(SCRYPT_PARAMS, keyLength, ASN1_INTEGER),
+} static_ASN1_SEQUENCE_END(SCRYPT_PARAMS)
+
+DECLARE_ASN1_ALLOC_FUNCTIONS(SCRYPT_PARAMS)
+IMPLEMENT_ASN1_ALLOC_FUNCTIONS(SCRYPT_PARAMS)
+
+static X509_ALGOR *pkcs5_scrypt_set(const unsigned char *salt, size_t saltlen,
+ size_t keylen, uint64_t N, uint64_t r,
+ uint64_t p);
+
+/*
+ * Return an algorithm identifier for a PKCS#5 v2.0 PBE algorithm using scrypt
+ */
+
+X509_ALGOR *PKCS5_pbe2_set_scrypt(const EVP_CIPHER *cipher,
+ const unsigned char *salt, int saltlen,
+ unsigned char *aiv, uint64_t N, uint64_t r,
+ uint64_t p)
+{
+ X509_ALGOR *scheme = NULL, *ret = NULL;
+ int alg_nid;
+ size_t keylen = 0;
+ EVP_CIPHER_CTX *ctx = NULL;
+ unsigned char iv[EVP_MAX_IV_LENGTH];
+ PBE2PARAM *pbe2 = NULL;
+
+ if (!cipher) {
+ ASN1err(ASN1_F_PKCS5_PBE2_SET_SCRYPT, ERR_R_PASSED_NULL_PARAMETER);
+ goto err;
+ }
+
+ if (EVP_PBE_scrypt(NULL, 0, NULL, 0, N, r, p, 0, NULL, 0) == 0) {
+ ASN1err(ASN1_F_PKCS5_PBE2_SET_SCRYPT,
+ ASN1_R_INVALID_SCRYPT_PARAMETERS);
+ goto err;
+ }
+
+ alg_nid = EVP_CIPHER_type(cipher);
+ if (alg_nid == NID_undef) {
+ ASN1err(ASN1_F_PKCS5_PBE2_SET_SCRYPT,
+ ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER);
+ goto err;
+ }
+
+ pbe2 = PBE2PARAM_new();
+ if (pbe2 == NULL)
+ goto merr;
+
+ /* Setup the AlgorithmIdentifier for the encryption scheme */
+ scheme = pbe2->encryption;
+
+ scheme->algorithm = OBJ_nid2obj(alg_nid);
+ scheme->parameter = ASN1_TYPE_new();
+ if (scheme->parameter == NULL)
+ goto merr;
+
+ /* Create random IV */
+ if (EVP_CIPHER_iv_length(cipher)) {
+ if (aiv)
+ memcpy(iv, aiv, EVP_CIPHER_iv_length(cipher));
+ else if (RAND_bytes(iv, EVP_CIPHER_iv_length(cipher)) < 0)
+ goto err;
+ }
+
+ ctx = EVP_CIPHER_CTX_new();
+ if (ctx == NULL)
+ goto merr;
+
+ /* Dummy cipherinit to just setup the IV */
+ if (EVP_CipherInit_ex(ctx, cipher, NULL, NULL, iv, 0) == 0)
+ goto err;
+ if (EVP_CIPHER_param_to_asn1(ctx, scheme->parameter) < 0) {
+ ASN1err(ASN1_F_PKCS5_PBE2_SET_SCRYPT,
+ ASN1_R_ERROR_SETTING_CIPHER_PARAMS);
+ goto err;
+ }
+ EVP_CIPHER_CTX_free(ctx);
+ ctx = NULL;
+
+ /* If its RC2 then we'd better setup the key length */
+
+ if (alg_nid == NID_rc2_cbc)
+ keylen = EVP_CIPHER_key_length(cipher);
+
+ /* Setup keyfunc */
+
+ X509_ALGOR_free(pbe2->keyfunc);
+
+ pbe2->keyfunc = pkcs5_scrypt_set(salt, saltlen, keylen, N, r, p);
+
+ if (pbe2->keyfunc == NULL)
+ goto merr;
+
+ /* Now set up top level AlgorithmIdentifier */
+
+ ret = X509_ALGOR_new();
+ if (ret == NULL)
+ goto merr;
+
+ ret->algorithm = OBJ_nid2obj(NID_pbes2);
+
+ /* Encode PBE2PARAM into parameter */
+
+ if (ASN1_TYPE_pack_sequence(ASN1_ITEM_rptr(PBE2PARAM), pbe2,
+ &ret->parameter) == NULL)
+ goto merr;
+
+ PBE2PARAM_free(pbe2);
+ pbe2 = NULL;
+
+ return ret;
+
+ merr:
+ ASN1err(ASN1_F_PKCS5_PBE2_SET_SCRYPT, ERR_R_MALLOC_FAILURE);
+
+ err:
+ PBE2PARAM_free(pbe2);
+ X509_ALGOR_free(ret);
+ EVP_CIPHER_CTX_free(ctx);
+
+ return NULL;
+}
+
+static X509_ALGOR *pkcs5_scrypt_set(const unsigned char *salt, size_t saltlen,
+ size_t keylen, uint64_t N, uint64_t r,
+ uint64_t p)
+{
+ X509_ALGOR *keyfunc = NULL;
+ SCRYPT_PARAMS *sparam = SCRYPT_PARAMS_new();
+
+ if (sparam == NULL)
+ goto merr;
+
+ if (!saltlen)
+ saltlen = PKCS5_SALT_LEN;
+
+ /* This will either copy salt or grow the buffer */
+ if (ASN1_STRING_set(sparam->salt, salt, saltlen) == 0)
+ goto merr;
+
+ if (salt == NULL && RAND_bytes(sparam->salt->data, saltlen) <= 0)
+ goto err;
+
+ if (ASN1_INTEGER_set_uint64(sparam->costParameter, N) == 0)
+ goto merr;
+
+ if (ASN1_INTEGER_set_uint64(sparam->blockSize, r) == 0)
+ goto merr;
+
+ if (ASN1_INTEGER_set_uint64(sparam->parallelizationParameter, p) == 0)
+ goto merr;
+
+ /* If have a key len set it up */
+
+ if (keylen > 0) {
+ sparam->keyLength = ASN1_INTEGER_new();
+ if (sparam->keyLength == NULL)
+ goto merr;
+ if (ASN1_INTEGER_set_int64(sparam->keyLength, keylen) == 0)
+ goto merr;
+ }
+
+ /* Finally setup the keyfunc structure */
+
+ keyfunc = X509_ALGOR_new();
+ if (keyfunc == NULL)
+ goto merr;
+
+ keyfunc->algorithm = OBJ_nid2obj(NID_id_scrypt);
+
+ /* Encode SCRYPT_PARAMS into parameter of pbe2 */
+
+ if (ASN1_TYPE_pack_sequence(ASN1_ITEM_rptr(SCRYPT_PARAMS), sparam,
+ &keyfunc->parameter) == NULL)
+ goto merr;
+
+ SCRYPT_PARAMS_free(sparam);
+ return keyfunc;
+
+ merr:
+ ASN1err(ASN1_F_PKCS5_SCRYPT_SET, ERR_R_MALLOC_FAILURE);
+ err:
+ SCRYPT_PARAMS_free(sparam);
+ X509_ALGOR_free(keyfunc);
+ return NULL;
+}
+
+int PKCS5_v2_scrypt_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass,
+ int passlen, ASN1_TYPE *param,
+ const EVP_CIPHER *c, const EVP_MD *md, int en_de)
+{
+ unsigned char *salt, key[EVP_MAX_KEY_LENGTH];
+ uint64_t p, r, N;
+ size_t saltlen;
+ size_t keylen = 0;
+ int rv = 0;
+ SCRYPT_PARAMS *sparam = NULL;
+
+ if (EVP_CIPHER_CTX_cipher(ctx) == NULL) {
+ EVPerr(EVP_F_PKCS5_V2_SCRYPT_KEYIVGEN, EVP_R_NO_CIPHER_SET);
+ goto err;
+ }
+
+ /* Decode parameter */
+
+ sparam = ASN1_TYPE_unpack_sequence(ASN1_ITEM_rptr(SCRYPT_PARAMS), param);
+
+ if (sparam == NULL) {
+ EVPerr(EVP_F_PKCS5_V2_SCRYPT_KEYIVGEN, EVP_R_DECODE_ERROR);
+ goto err;
+ }
+
+ keylen = EVP_CIPHER_CTX_key_length(ctx);
+
+ /* Now check the parameters of sparam */
+
+ if (sparam->keyLength) {
+ uint64_t spkeylen;
+ if ((ASN1_INTEGER_get_uint64(&spkeylen, sparam->keyLength) == 0)
+ || (spkeylen != keylen)) {
+ EVPerr(EVP_F_PKCS5_V2_SCRYPT_KEYIVGEN,
+ EVP_R_UNSUPPORTED_KEYLENGTH);
+ goto err;
+ }
+ }
+ /* Check all parameters fit in uint64_t and are acceptable to scrypt */
+ if (ASN1_INTEGER_get_uint64(&N, sparam->costParameter) == 0
+ || ASN1_INTEGER_get_uint64(&r, sparam->blockSize) == 0
+ || ASN1_INTEGER_get_uint64(&p, sparam->parallelizationParameter) == 0
+ || EVP_PBE_scrypt(NULL, 0, NULL, 0, N, r, p, 0, NULL, 0) == 0) {
+ EVPerr(EVP_F_PKCS5_V2_SCRYPT_KEYIVGEN,
+ EVP_R_ILLEGAL_SCRYPT_PARAMETERS);
+ goto err;
+ }
+
+ /* it seems that its all OK */
+
+ salt = sparam->salt->data;
+ saltlen = sparam->salt->length;
+ if (EVP_PBE_scrypt(pass, passlen, salt, saltlen, N, r, p, 0, key, keylen)
+ == 0)
+ goto err;
+ rv = EVP_CipherInit_ex(ctx, NULL, NULL, key, NULL, en_de);
+ err:
+ if (keylen)
+ OPENSSL_cleanse(key, keylen);
+ SCRYPT_PARAMS_free(sparam);
+ return rv;
+}
+#endif /* OPENSSL_NO_SCRYPT */
diff --git a/openssl-1.1.0h/crypto/asn1/p8_pkey.c b/openssl-1.1.0h/crypto/asn1/p8_pkey.c
new file mode 100644
index 0000000..dbee827
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/p8_pkey.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include "internal/x509_int.h"
+
+/* Minor tweak to operation: zero private key data */
+static int pkey_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ void *exarg)
+{
+ /* Since the structure must still be valid use ASN1_OP_FREE_PRE */
+ if (operation == ASN1_OP_FREE_PRE) {
+ PKCS8_PRIV_KEY_INFO *key = (PKCS8_PRIV_KEY_INFO *)*pval;
+ if (key->pkey)
+ OPENSSL_cleanse(key->pkey->data, key->pkey->length);
+ }
+ return 1;
+}
+
+ASN1_SEQUENCE_cb(PKCS8_PRIV_KEY_INFO, pkey_cb) = {
+ ASN1_SIMPLE(PKCS8_PRIV_KEY_INFO, version, ASN1_INTEGER),
+ ASN1_SIMPLE(PKCS8_PRIV_KEY_INFO, pkeyalg, X509_ALGOR),
+ ASN1_SIMPLE(PKCS8_PRIV_KEY_INFO, pkey, ASN1_OCTET_STRING),
+ ASN1_IMP_SET_OF_OPT(PKCS8_PRIV_KEY_INFO, attributes, X509_ATTRIBUTE, 0)
+} ASN1_SEQUENCE_END_cb(PKCS8_PRIV_KEY_INFO, PKCS8_PRIV_KEY_INFO)
+
+IMPLEMENT_ASN1_FUNCTIONS(PKCS8_PRIV_KEY_INFO)
+
+int PKCS8_pkey_set0(PKCS8_PRIV_KEY_INFO *priv, ASN1_OBJECT *aobj,
+ int version,
+ int ptype, void *pval, unsigned char *penc, int penclen)
+{
+ if (version >= 0) {
+ if (!ASN1_INTEGER_set(priv->version, version))
+ return 0;
+ }
+ if (!X509_ALGOR_set0(priv->pkeyalg, aobj, ptype, pval))
+ return 0;
+ if (penc)
+ ASN1_STRING_set0(priv->pkey, penc, penclen);
+ return 1;
+}
+
+int PKCS8_pkey_get0(const ASN1_OBJECT **ppkalg,
+ const unsigned char **pk, int *ppklen,
+ const X509_ALGOR **pa, const PKCS8_PRIV_KEY_INFO *p8)
+{
+ if (ppkalg)
+ *ppkalg = p8->pkeyalg->algorithm;
+ if (pk) {
+ *pk = ASN1_STRING_get0_data(p8->pkey);
+ *ppklen = ASN1_STRING_length(p8->pkey);
+ }
+ if (pa)
+ *pa = p8->pkeyalg;
+ return 1;
+}
+
+const STACK_OF(X509_ATTRIBUTE) *
+PKCS8_pkey_get0_attrs(const PKCS8_PRIV_KEY_INFO *p8)
+{
+ return p8->attributes;
+}
+
+int PKCS8_pkey_add1_attr_by_NID(PKCS8_PRIV_KEY_INFO *p8, int nid, int type,
+ const unsigned char *bytes, int len)
+{
+ if (X509at_add1_attr_by_NID(&p8->attributes, nid, type, bytes, len) != NULL)
+ return 1;
+ return 0;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/t_bitst.c b/openssl-1.1.0h/crypto/asn1/t_bitst.c
new file mode 100644
index 0000000..c0aeca4
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/t_bitst.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/conf.h>
+#include <openssl/x509v3.h>
+
+int ASN1_BIT_STRING_name_print(BIO *out, ASN1_BIT_STRING *bs,
+ BIT_STRING_BITNAME *tbl, int indent)
+{
+ BIT_STRING_BITNAME *bnam;
+ char first = 1;
+ BIO_printf(out, "%*s", indent, "");
+ for (bnam = tbl; bnam->lname; bnam++) {
+ if (ASN1_BIT_STRING_get_bit(bs, bnam->bitnum)) {
+ if (!first)
+ BIO_puts(out, ", ");
+ BIO_puts(out, bnam->lname);
+ first = 0;
+ }
+ }
+ BIO_puts(out, "\n");
+ return 1;
+}
+
+int ASN1_BIT_STRING_set_asc(ASN1_BIT_STRING *bs, const char *name, int value,
+ BIT_STRING_BITNAME *tbl)
+{
+ int bitnum;
+ bitnum = ASN1_BIT_STRING_num_asc(name, tbl);
+ if (bitnum < 0)
+ return 0;
+ if (bs) {
+ if (!ASN1_BIT_STRING_set_bit(bs, bitnum, value))
+ return 0;
+ }
+ return 1;
+}
+
+int ASN1_BIT_STRING_num_asc(const char *name, BIT_STRING_BITNAME *tbl)
+{
+ BIT_STRING_BITNAME *bnam;
+ for (bnam = tbl; bnam->lname; bnam++) {
+ if ((strcmp(bnam->sname, name) == 0)
+ || (strcmp(bnam->lname, name) == 0))
+ return bnam->bitnum;
+ }
+ return -1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/t_pkey.c b/openssl-1.1.0h/crypto/asn1/t_pkey.c
new file mode 100644
index 0000000..3b2c9df
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/t_pkey.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/objects.h>
+#include <openssl/buffer.h>
+#include "internal/bn_int.h"
+
+/* Number of octets per line */
+#define ASN1_BUF_PRINT_WIDTH 15
+/* Maximum indent */
+#define ASN1_PRINT_MAX_INDENT 128
+
+int ASN1_buf_print(BIO *bp, const unsigned char *buf, size_t buflen, int indent)
+{
+ size_t i;
+
+ for (i = 0; i < buflen; i++) {
+ if ((i % ASN1_BUF_PRINT_WIDTH) == 0) {
+ if (i > 0 && BIO_puts(bp, "\n") <= 0)
+ return 0;
+ if (!BIO_indent(bp, indent, ASN1_PRINT_MAX_INDENT))
+ return 0;
+ }
+ /*
+ * Use colon separators for each octet for compatibility as
+ * this function is used to print out key components.
+ */
+ if (BIO_printf(bp, "%02x%s", buf[i],
+ (i == buflen - 1) ? "" : ":") <= 0)
+ return 0;
+ }
+ if (BIO_write(bp, "\n", 1) <= 0)
+ return 0;
+ return 1;
+}
+
+int ASN1_bn_print(BIO *bp, const char *number, const BIGNUM *num,
+ unsigned char *ign, int indent)
+{
+ int n, rv = 0;
+ const char *neg;
+ unsigned char *buf = NULL, *tmp = NULL;
+ int buflen;
+
+ if (num == NULL)
+ return 1;
+ neg = BN_is_negative(num) ? "-" : "";
+ if (!BIO_indent(bp, indent, ASN1_PRINT_MAX_INDENT))
+ return 0;
+ if (BN_is_zero(num)) {
+ if (BIO_printf(bp, "%s 0\n", number) <= 0)
+ return 0;
+ return 1;
+ }
+
+ if (BN_num_bytes(num) <= BN_BYTES) {
+ if (BIO_printf(bp, "%s %s%lu (%s0x%lx)\n", number, neg,
+ (unsigned long)bn_get_words(num)[0], neg,
+ (unsigned long)bn_get_words(num)[0]) <= 0)
+ return 0;
+ return 1;
+ }
+
+ buflen = BN_num_bytes(num) + 1;
+ buf = tmp = OPENSSL_malloc(buflen);
+ if (buf == NULL)
+ goto err;
+ buf[0] = 0;
+ if (BIO_printf(bp, "%s%s\n", number,
+ (neg[0] == '-') ? " (Negative)" : "") <= 0)
+ goto err;
+ n = BN_bn2bin(num, buf + 1);
+
+ if (buf[1] & 0x80)
+ n++;
+ else
+ tmp++;
+
+ if (ASN1_buf_print(bp, tmp, n, indent + 4) == 0)
+ goto err;
+ rv = 1;
+ err:
+ OPENSSL_clear_free(buf, buflen);
+ return rv;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/t_spki.c b/openssl-1.1.0h/crypto/asn1/t_spki.c
new file mode 100644
index 0000000..51b56d0
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/t_spki.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/x509.h>
+#include <openssl/asn1.h>
+#include <openssl/rsa.h>
+#include <openssl/dsa.h>
+#include <openssl/bn.h>
+
+/* Print out an SPKI */
+
+int NETSCAPE_SPKI_print(BIO *out, NETSCAPE_SPKI *spki)
+{
+ EVP_PKEY *pkey;
+ ASN1_IA5STRING *chal;
+ ASN1_OBJECT *spkioid;
+ int i, n;
+ char *s;
+ BIO_printf(out, "Netscape SPKI:\n");
+ X509_PUBKEY_get0_param(&spkioid, NULL, NULL, NULL, spki->spkac->pubkey);
+ i = OBJ_obj2nid(spkioid);
+ BIO_printf(out, " Public Key Algorithm: %s\n",
+ (i == NID_undef) ? "UNKNOWN" : OBJ_nid2ln(i));
+ pkey = X509_PUBKEY_get(spki->spkac->pubkey);
+ if (!pkey)
+ BIO_printf(out, " Unable to load public key\n");
+ else {
+ EVP_PKEY_print_public(out, pkey, 4, NULL);
+ EVP_PKEY_free(pkey);
+ }
+ chal = spki->spkac->challenge;
+ if (chal->length)
+ BIO_printf(out, " Challenge String: %s\n", chal->data);
+ i = OBJ_obj2nid(spki->sig_algor.algorithm);
+ BIO_printf(out, " Signature Algorithm: %s",
+ (i == NID_undef) ? "UNKNOWN" : OBJ_nid2ln(i));
+
+ n = spki->signature->length;
+ s = (char *)spki->signature->data;
+ for (i = 0; i < n; i++) {
+ if ((i % 18) == 0)
+ BIO_write(out, "\n ", 7);
+ BIO_printf(out, "%02x%s", (unsigned char)s[i],
+ ((i + 1) == n) ? "" : ":");
+ }
+ BIO_write(out, "\n", 1);
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/tasn_dec.c b/openssl-1.1.0h/crypto/asn1/tasn_dec.c
new file mode 100644
index 0000000..af8641e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/tasn_dec.c
@@ -0,0 +1,1159 @@
+/*
+ * Copyright 2000-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+#include <openssl/objects.h>
+#include <openssl/buffer.h>
+#include <openssl/err.h>
+#include "internal/numbers.h"
+#include "asn1_locl.h"
+
+/*
+ * Constructed types with a recursive definition (such as can be found in PKCS7)
+ * could eventually exceed the stack given malicious input with excessive
+ * recursion. Therefore we limit the stack depth. This is the maximum number of
+ * recursive invocations of asn1_item_embed_d2i().
+ */
+#define ASN1_MAX_CONSTRUCTED_NEST 30
+
+static int asn1_item_embed_d2i(ASN1_VALUE **pval, const unsigned char **in,
+ long len, const ASN1_ITEM *it,
+ int tag, int aclass, char opt, ASN1_TLC *ctx,
+ int depth);
+
+static int asn1_check_eoc(const unsigned char **in, long len);
+static int asn1_find_end(const unsigned char **in, long len, char inf);
+
+static int asn1_collect(BUF_MEM *buf, const unsigned char **in, long len,
+ char inf, int tag, int aclass, int depth);
+
+static int collect_data(BUF_MEM *buf, const unsigned char **p, long plen);
+
+static int asn1_check_tlen(long *olen, int *otag, unsigned char *oclass,
+ char *inf, char *cst,
+ const unsigned char **in, long len,
+ int exptag, int expclass, char opt, ASN1_TLC *ctx);
+
+static int asn1_template_ex_d2i(ASN1_VALUE **pval,
+ const unsigned char **in, long len,
+ const ASN1_TEMPLATE *tt, char opt,
+ ASN1_TLC *ctx, int depth);
+static int asn1_template_noexp_d2i(ASN1_VALUE **val,
+ const unsigned char **in, long len,
+ const ASN1_TEMPLATE *tt, char opt,
+ ASN1_TLC *ctx, int depth);
+static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
+ const unsigned char **in, long len,
+ const ASN1_ITEM *it,
+ int tag, int aclass, char opt,
+ ASN1_TLC *ctx);
+static int asn1_ex_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
+ int utype, char *free_cont, const ASN1_ITEM *it);
+
+/* Table to convert tags to bit values, used for MSTRING type */
+static const unsigned long tag2bit[32] = {
+ /* tags 0 - 3 */
+ 0, 0, 0, B_ASN1_BIT_STRING,
+ /* tags 4- 7 */
+ B_ASN1_OCTET_STRING, 0, 0, B_ASN1_UNKNOWN,
+ /* tags 8-11 */
+ B_ASN1_UNKNOWN, B_ASN1_UNKNOWN, B_ASN1_UNKNOWN, B_ASN1_UNKNOWN,
+ /* tags 12-15 */
+ B_ASN1_UTF8STRING, B_ASN1_UNKNOWN, B_ASN1_UNKNOWN, B_ASN1_UNKNOWN,
+ /* tags 16-19 */
+ B_ASN1_SEQUENCE, 0, B_ASN1_NUMERICSTRING, B_ASN1_PRINTABLESTRING,
+ /* tags 20-22 */
+ B_ASN1_T61STRING, B_ASN1_VIDEOTEXSTRING, B_ASN1_IA5STRING,
+ /* tags 23-24 */
+ B_ASN1_UTCTIME, B_ASN1_GENERALIZEDTIME,
+ /* tags 25-27 */
+ B_ASN1_GRAPHICSTRING, B_ASN1_ISO64STRING, B_ASN1_GENERALSTRING,
+ /* tags 28-31 */
+ B_ASN1_UNIVERSALSTRING, B_ASN1_UNKNOWN, B_ASN1_BMPSTRING, B_ASN1_UNKNOWN,
+};
+
+unsigned long ASN1_tag2bit(int tag)
+{
+ if ((tag < 0) || (tag > 30))
+ return 0;
+ return tag2bit[tag];
+}
+
+/* Macro to initialize and invalidate the cache */
+
+#define asn1_tlc_clear(c) if (c) (c)->valid = 0
+/* Version to avoid compiler warning about 'c' always non-NULL */
+#define asn1_tlc_clear_nc(c) (c)->valid = 0
+
+/*
+ * Decode an ASN1 item, this currently behaves just like a standard 'd2i'
+ * function. 'in' points to a buffer to read the data from, in future we
+ * will have more advanced versions that can input data a piece at a time and
+ * this will simply be a special case.
+ */
+
+ASN1_VALUE *ASN1_item_d2i(ASN1_VALUE **pval,
+ const unsigned char **in, long len,
+ const ASN1_ITEM *it)
+{
+ ASN1_TLC c;
+ ASN1_VALUE *ptmpval = NULL;
+ if (!pval)
+ pval = &ptmpval;
+ asn1_tlc_clear_nc(&c);
+ if (ASN1_item_ex_d2i(pval, in, len, it, -1, 0, 0, &c) > 0)
+ return *pval;
+ return NULL;
+}
+
+int ASN1_item_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, long len,
+ const ASN1_ITEM *it,
+ int tag, int aclass, char opt, ASN1_TLC *ctx)
+{
+ int rv;
+ rv = asn1_item_embed_d2i(pval, in, len, it, tag, aclass, opt, ctx, 0);
+ if (rv <= 0)
+ ASN1_item_ex_free(pval, it);
+ return rv;
+}
+
+/*
+ * Decode an item, taking care of IMPLICIT tagging, if any. If 'opt' set and
+ * tag mismatch return -1 to handle OPTIONAL
+ */
+
+static int asn1_item_embed_d2i(ASN1_VALUE **pval, const unsigned char **in,
+ long len, const ASN1_ITEM *it,
+ int tag, int aclass, char opt, ASN1_TLC *ctx,
+ int depth)
+{
+ const ASN1_TEMPLATE *tt, *errtt = NULL;
+ const ASN1_EXTERN_FUNCS *ef;
+ const ASN1_AUX *aux = it->funcs;
+ ASN1_aux_cb *asn1_cb;
+ const unsigned char *p = NULL, *q;
+ unsigned char oclass;
+ char seq_eoc, seq_nolen, cst, isopt;
+ long tmplen;
+ int i;
+ int otag;
+ int ret = 0;
+ ASN1_VALUE **pchptr;
+ if (!pval)
+ return 0;
+ if (aux && aux->asn1_cb)
+ asn1_cb = aux->asn1_cb;
+ else
+ asn1_cb = 0;
+
+ if (++depth > ASN1_MAX_CONSTRUCTED_NEST) {
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ASN1_R_NESTED_TOO_DEEP);
+ goto err;
+ }
+
+ switch (it->itype) {
+ case ASN1_ITYPE_PRIMITIVE:
+ if (it->templates) {
+ /*
+ * tagging or OPTIONAL is currently illegal on an item template
+ * because the flags can't get passed down. In practice this
+ * isn't a problem: we include the relevant flags from the item
+ * template in the template itself.
+ */
+ if ((tag != -1) || opt) {
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I,
+ ASN1_R_ILLEGAL_OPTIONS_ON_ITEM_TEMPLATE);
+ goto err;
+ }
+ return asn1_template_ex_d2i(pval, in, len,
+ it->templates, opt, ctx, depth);
+ }
+ return asn1_d2i_ex_primitive(pval, in, len, it,
+ tag, aclass, opt, ctx);
+
+ case ASN1_ITYPE_MSTRING:
+ p = *in;
+ /* Just read in tag and class */
+ ret = asn1_check_tlen(NULL, &otag, &oclass, NULL, NULL,
+ &p, len, -1, 0, 1, ctx);
+ if (!ret) {
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ERR_R_NESTED_ASN1_ERROR);
+ goto err;
+ }
+
+ /* Must be UNIVERSAL class */
+ if (oclass != V_ASN1_UNIVERSAL) {
+ /* If OPTIONAL, assume this is OK */
+ if (opt)
+ return -1;
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ASN1_R_MSTRING_NOT_UNIVERSAL);
+ goto err;
+ }
+ /* Check tag matches bit map */
+ if (!(ASN1_tag2bit(otag) & it->utype)) {
+ /* If OPTIONAL, assume this is OK */
+ if (opt)
+ return -1;
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ASN1_R_MSTRING_WRONG_TAG);
+ goto err;
+ }
+ return asn1_d2i_ex_primitive(pval, in, len, it, otag, 0, 0, ctx);
+
+ case ASN1_ITYPE_EXTERN:
+ /* Use new style d2i */
+ ef = it->funcs;
+ return ef->asn1_ex_d2i(pval, in, len, it, tag, aclass, opt, ctx);
+
+ case ASN1_ITYPE_CHOICE:
+ if (asn1_cb && !asn1_cb(ASN1_OP_D2I_PRE, pval, it, NULL))
+ goto auxerr;
+ if (*pval) {
+ /* Free up and zero CHOICE value if initialised */
+ i = asn1_get_choice_selector(pval, it);
+ if ((i >= 0) && (i < it->tcount)) {
+ tt = it->templates + i;
+ pchptr = asn1_get_field_ptr(pval, tt);
+ asn1_template_free(pchptr, tt);
+ asn1_set_choice_selector(pval, -1, it);
+ }
+ } else if (!ASN1_item_ex_new(pval, it)) {
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ERR_R_NESTED_ASN1_ERROR);
+ goto err;
+ }
+ /* CHOICE type, try each possibility in turn */
+ p = *in;
+ for (i = 0, tt = it->templates; i < it->tcount; i++, tt++) {
+ pchptr = asn1_get_field_ptr(pval, tt);
+ /*
+ * We mark field as OPTIONAL so its absence can be recognised.
+ */
+ ret = asn1_template_ex_d2i(pchptr, &p, len, tt, 1, ctx, depth);
+ /* If field not present, try the next one */
+ if (ret == -1)
+ continue;
+ /* If positive return, read OK, break loop */
+ if (ret > 0)
+ break;
+ /*
+ * Must be an ASN1 parsing error.
+ * Free up any partial choice value
+ */
+ asn1_template_free(pchptr, tt);
+ errtt = tt;
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ERR_R_NESTED_ASN1_ERROR);
+ goto err;
+ }
+
+ /* Did we fall off the end without reading anything? */
+ if (i == it->tcount) {
+ /* If OPTIONAL, this is OK */
+ if (opt) {
+ /* Free and zero it */
+ ASN1_item_ex_free(pval, it);
+ return -1;
+ }
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ASN1_R_NO_MATCHING_CHOICE_TYPE);
+ goto err;
+ }
+
+ asn1_set_choice_selector(pval, i, it);
+
+ if (asn1_cb && !asn1_cb(ASN1_OP_D2I_POST, pval, it, NULL))
+ goto auxerr;
+ *in = p;
+ return 1;
+
+ case ASN1_ITYPE_NDEF_SEQUENCE:
+ case ASN1_ITYPE_SEQUENCE:
+ p = *in;
+ tmplen = len;
+
+ /* If no IMPLICIT tagging set to SEQUENCE, UNIVERSAL */
+ if (tag == -1) {
+ tag = V_ASN1_SEQUENCE;
+ aclass = V_ASN1_UNIVERSAL;
+ }
+ /* Get SEQUENCE length and update len, p */
+ ret = asn1_check_tlen(&len, NULL, NULL, &seq_eoc, &cst,
+ &p, len, tag, aclass, opt, ctx);
+ if (!ret) {
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ERR_R_NESTED_ASN1_ERROR);
+ goto err;
+ } else if (ret == -1)
+ return -1;
+ if (aux && (aux->flags & ASN1_AFLG_BROKEN)) {
+ len = tmplen - (p - *in);
+ seq_nolen = 1;
+ }
+ /* If indefinite we don't do a length check */
+ else
+ seq_nolen = seq_eoc;
+ if (!cst) {
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ASN1_R_SEQUENCE_NOT_CONSTRUCTED);
+ goto err;
+ }
+
+ if (!*pval && !ASN1_item_ex_new(pval, it)) {
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ERR_R_NESTED_ASN1_ERROR);
+ goto err;
+ }
+
+ if (asn1_cb && !asn1_cb(ASN1_OP_D2I_PRE, pval, it, NULL))
+ goto auxerr;
+
+ /* Free up and zero any ADB found */
+ for (i = 0, tt = it->templates; i < it->tcount; i++, tt++) {
+ if (tt->flags & ASN1_TFLG_ADB_MASK) {
+ const ASN1_TEMPLATE *seqtt;
+ ASN1_VALUE **pseqval;
+ seqtt = asn1_do_adb(pval, tt, 0);
+ if (seqtt == NULL)
+ continue;
+ pseqval = asn1_get_field_ptr(pval, seqtt);
+ asn1_template_free(pseqval, seqtt);
+ }
+ }
+
+ /* Get each field entry */
+ for (i = 0, tt = it->templates; i < it->tcount; i++, tt++) {
+ const ASN1_TEMPLATE *seqtt;
+ ASN1_VALUE **pseqval;
+ seqtt = asn1_do_adb(pval, tt, 1);
+ if (seqtt == NULL)
+ goto err;
+ pseqval = asn1_get_field_ptr(pval, seqtt);
+ /* Have we ran out of data? */
+ if (!len)
+ break;
+ q = p;
+ if (asn1_check_eoc(&p, len)) {
+ if (!seq_eoc) {
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ASN1_R_UNEXPECTED_EOC);
+ goto err;
+ }
+ len -= p - q;
+ seq_eoc = 0;
+ q = p;
+ break;
+ }
+ /*
+ * This determines the OPTIONAL flag value. The field cannot be
+ * omitted if it is the last of a SEQUENCE and there is still
+ * data to be read. This isn't strictly necessary but it
+ * increases efficiency in some cases.
+ */
+ if (i == (it->tcount - 1))
+ isopt = 0;
+ else
+ isopt = (char)(seqtt->flags & ASN1_TFLG_OPTIONAL);
+ /*
+ * attempt to read in field, allowing each to be OPTIONAL
+ */
+
+ ret = asn1_template_ex_d2i(pseqval, &p, len, seqtt, isopt, ctx,
+ depth);
+ if (!ret) {
+ errtt = seqtt;
+ goto err;
+ } else if (ret == -1) {
+ /*
+ * OPTIONAL component absent. Free and zero the field.
+ */
+ asn1_template_free(pseqval, seqtt);
+ continue;
+ }
+ /* Update length */
+ len -= p - q;
+ }
+
+ /* Check for EOC if expecting one */
+ if (seq_eoc && !asn1_check_eoc(&p, len)) {
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ASN1_R_MISSING_EOC);
+ goto err;
+ }
+ /* Check all data read */
+ if (!seq_nolen && len) {
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ASN1_R_SEQUENCE_LENGTH_MISMATCH);
+ goto err;
+ }
+
+ /*
+ * If we get here we've got no more data in the SEQUENCE, however we
+ * may not have read all fields so check all remaining are OPTIONAL
+ * and clear any that are.
+ */
+ for (; i < it->tcount; tt++, i++) {
+ const ASN1_TEMPLATE *seqtt;
+ seqtt = asn1_do_adb(pval, tt, 1);
+ if (seqtt == NULL)
+ goto err;
+ if (seqtt->flags & ASN1_TFLG_OPTIONAL) {
+ ASN1_VALUE **pseqval;
+ pseqval = asn1_get_field_ptr(pval, seqtt);
+ asn1_template_free(pseqval, seqtt);
+ } else {
+ errtt = seqtt;
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ASN1_R_FIELD_MISSING);
+ goto err;
+ }
+ }
+ /* Save encoding */
+ if (!asn1_enc_save(pval, *in, p - *in, it))
+ goto auxerr;
+ if (asn1_cb && !asn1_cb(ASN1_OP_D2I_POST, pval, it, NULL))
+ goto auxerr;
+ *in = p;
+ return 1;
+
+ default:
+ return 0;
+ }
+ auxerr:
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_D2I, ASN1_R_AUX_ERROR);
+ err:
+ if (errtt)
+ ERR_add_error_data(4, "Field=", errtt->field_name,
+ ", Type=", it->sname);
+ else
+ ERR_add_error_data(2, "Type=", it->sname);
+ return 0;
+}
+
+/*
+ * Templates are handled with two separate functions. One handles any
+ * EXPLICIT tag and the other handles the rest.
+ */
+
+static int asn1_template_ex_d2i(ASN1_VALUE **val,
+ const unsigned char **in, long inlen,
+ const ASN1_TEMPLATE *tt, char opt,
+ ASN1_TLC *ctx, int depth)
+{
+ int flags, aclass;
+ int ret;
+ long len;
+ const unsigned char *p, *q;
+ char exp_eoc;
+ if (!val)
+ return 0;
+ flags = tt->flags;
+ aclass = flags & ASN1_TFLG_TAG_CLASS;
+
+ p = *in;
+
+ /* Check if EXPLICIT tag expected */
+ if (flags & ASN1_TFLG_EXPTAG) {
+ char cst;
+ /*
+ * Need to work out amount of data available to the inner content and
+ * where it starts: so read in EXPLICIT header to get the info.
+ */
+ ret = asn1_check_tlen(&len, NULL, NULL, &exp_eoc, &cst,
+ &p, inlen, tt->tag, aclass, opt, ctx);
+ q = p;
+ if (!ret) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_EX_D2I, ERR_R_NESTED_ASN1_ERROR);
+ return 0;
+ } else if (ret == -1)
+ return -1;
+ if (!cst) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_EX_D2I,
+ ASN1_R_EXPLICIT_TAG_NOT_CONSTRUCTED);
+ return 0;
+ }
+ /* We've found the field so it can't be OPTIONAL now */
+ ret = asn1_template_noexp_d2i(val, &p, len, tt, 0, ctx, depth);
+ if (!ret) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_EX_D2I, ERR_R_NESTED_ASN1_ERROR);
+ return 0;
+ }
+ /* We read the field in OK so update length */
+ len -= p - q;
+ if (exp_eoc) {
+ /* If NDEF we must have an EOC here */
+ if (!asn1_check_eoc(&p, len)) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_EX_D2I, ASN1_R_MISSING_EOC);
+ goto err;
+ }
+ } else {
+ /*
+ * Otherwise we must hit the EXPLICIT tag end or its an error
+ */
+ if (len) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_EX_D2I,
+ ASN1_R_EXPLICIT_LENGTH_MISMATCH);
+ goto err;
+ }
+ }
+ } else
+ return asn1_template_noexp_d2i(val, in, inlen, tt, opt, ctx, depth);
+
+ *in = p;
+ return 1;
+
+ err:
+ return 0;
+}
+
+static int asn1_template_noexp_d2i(ASN1_VALUE **val,
+ const unsigned char **in, long len,
+ const ASN1_TEMPLATE *tt, char opt,
+ ASN1_TLC *ctx, int depth)
+{
+ int flags, aclass;
+ int ret;
+ ASN1_VALUE *tval;
+ const unsigned char *p, *q;
+ if (!val)
+ return 0;
+ flags = tt->flags;
+ aclass = flags & ASN1_TFLG_TAG_CLASS;
+
+ p = *in;
+ q = p;
+
+ /*
+ * If field is embedded then val needs fixing so it is a pointer to
+ * a pointer to a field.
+ */
+ if (tt->flags & ASN1_TFLG_EMBED) {
+ tval = (ASN1_VALUE *)val;
+ val = &tval;
+ }
+
+ if (flags & ASN1_TFLG_SK_MASK) {
+ /* SET OF, SEQUENCE OF */
+ int sktag, skaclass;
+ char sk_eoc;
+ /* First work out expected inner tag value */
+ if (flags & ASN1_TFLG_IMPTAG) {
+ sktag = tt->tag;
+ skaclass = aclass;
+ } else {
+ skaclass = V_ASN1_UNIVERSAL;
+ if (flags & ASN1_TFLG_SET_OF)
+ sktag = V_ASN1_SET;
+ else
+ sktag = V_ASN1_SEQUENCE;
+ }
+ /* Get the tag */
+ ret = asn1_check_tlen(&len, NULL, NULL, &sk_eoc, NULL,
+ &p, len, sktag, skaclass, opt, ctx);
+ if (!ret) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_NOEXP_D2I, ERR_R_NESTED_ASN1_ERROR);
+ return 0;
+ } else if (ret == -1)
+ return -1;
+ if (!*val)
+ *val = (ASN1_VALUE *)OPENSSL_sk_new_null();
+ else {
+ /*
+ * We've got a valid STACK: free up any items present
+ */
+ STACK_OF(ASN1_VALUE) *sktmp = (STACK_OF(ASN1_VALUE) *)*val;
+ ASN1_VALUE *vtmp;
+ while (sk_ASN1_VALUE_num(sktmp) > 0) {
+ vtmp = sk_ASN1_VALUE_pop(sktmp);
+ ASN1_item_ex_free(&vtmp, ASN1_ITEM_ptr(tt->item));
+ }
+ }
+
+ if (!*val) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_NOEXP_D2I, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ /* Read as many items as we can */
+ while (len > 0) {
+ ASN1_VALUE *skfield;
+ q = p;
+ /* See if EOC found */
+ if (asn1_check_eoc(&p, len)) {
+ if (!sk_eoc) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_NOEXP_D2I,
+ ASN1_R_UNEXPECTED_EOC);
+ goto err;
+ }
+ len -= p - q;
+ sk_eoc = 0;
+ break;
+ }
+ skfield = NULL;
+ if (!asn1_item_embed_d2i(&skfield, &p, len,
+ ASN1_ITEM_ptr(tt->item), -1, 0, 0, ctx,
+ depth)) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_NOEXP_D2I,
+ ERR_R_NESTED_ASN1_ERROR);
+ /* |skfield| may be partially allocated despite failure. */
+ ASN1_item_free(skfield, ASN1_ITEM_ptr(tt->item));
+ goto err;
+ }
+ len -= p - q;
+ if (!sk_ASN1_VALUE_push((STACK_OF(ASN1_VALUE) *)*val, skfield)) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_NOEXP_D2I, ERR_R_MALLOC_FAILURE);
+ ASN1_item_free(skfield, ASN1_ITEM_ptr(tt->item));
+ goto err;
+ }
+ }
+ if (sk_eoc) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_NOEXP_D2I, ASN1_R_MISSING_EOC);
+ goto err;
+ }
+ } else if (flags & ASN1_TFLG_IMPTAG) {
+ /* IMPLICIT tagging */
+ ret = asn1_item_embed_d2i(val, &p, len,
+ ASN1_ITEM_ptr(tt->item), tt->tag, aclass, opt,
+ ctx, depth);
+ if (!ret) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_NOEXP_D2I, ERR_R_NESTED_ASN1_ERROR);
+ goto err;
+ } else if (ret == -1)
+ return -1;
+ } else {
+ /* Nothing special */
+ ret = asn1_item_embed_d2i(val, &p, len, ASN1_ITEM_ptr(tt->item),
+ -1, 0, opt, ctx, depth);
+ if (!ret) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_NOEXP_D2I, ERR_R_NESTED_ASN1_ERROR);
+ goto err;
+ } else if (ret == -1)
+ return -1;
+ }
+
+ *in = p;
+ return 1;
+
+ err:
+ return 0;
+}
+
+static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
+ const unsigned char **in, long inlen,
+ const ASN1_ITEM *it,
+ int tag, int aclass, char opt, ASN1_TLC *ctx)
+{
+ int ret = 0, utype;
+ long plen;
+ char cst, inf, free_cont = 0;
+ const unsigned char *p;
+ BUF_MEM buf = { 0, NULL, 0, 0 };
+ const unsigned char *cont = NULL;
+ long len;
+ if (!pval) {
+ ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ASN1_R_ILLEGAL_NULL);
+ return 0; /* Should never happen */
+ }
+
+ if (it->itype == ASN1_ITYPE_MSTRING) {
+ utype = tag;
+ tag = -1;
+ } else
+ utype = it->utype;
+
+ if (utype == V_ASN1_ANY) {
+ /* If type is ANY need to figure out type from tag */
+ unsigned char oclass;
+ if (tag >= 0) {
+ ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ASN1_R_ILLEGAL_TAGGED_ANY);
+ return 0;
+ }
+ if (opt) {
+ ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE,
+ ASN1_R_ILLEGAL_OPTIONAL_ANY);
+ return 0;
+ }
+ p = *in;
+ ret = asn1_check_tlen(NULL, &utype, &oclass, NULL, NULL,
+ &p, inlen, -1, 0, 0, ctx);
+ if (!ret) {
+ ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ERR_R_NESTED_ASN1_ERROR);
+ return 0;
+ }
+ if (oclass != V_ASN1_UNIVERSAL)
+ utype = V_ASN1_OTHER;
+ }
+ if (tag == -1) {
+ tag = utype;
+ aclass = V_ASN1_UNIVERSAL;
+ }
+ p = *in;
+ /* Check header */
+ ret = asn1_check_tlen(&plen, NULL, NULL, &inf, &cst,
+ &p, inlen, tag, aclass, opt, ctx);
+ if (!ret) {
+ ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ERR_R_NESTED_ASN1_ERROR);
+ return 0;
+ } else if (ret == -1)
+ return -1;
+ ret = 0;
+ /* SEQUENCE, SET and "OTHER" are left in encoded form */
+ if ((utype == V_ASN1_SEQUENCE)
+ || (utype == V_ASN1_SET) || (utype == V_ASN1_OTHER)) {
+ /*
+ * Clear context cache for type OTHER because the auto clear when we
+ * have a exact match won't work
+ */
+ if (utype == V_ASN1_OTHER) {
+ asn1_tlc_clear(ctx);
+ }
+ /* SEQUENCE and SET must be constructed */
+ else if (!cst) {
+ ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE,
+ ASN1_R_TYPE_NOT_CONSTRUCTED);
+ return 0;
+ }
+
+ cont = *in;
+ /* If indefinite length constructed find the real end */
+ if (inf) {
+ if (!asn1_find_end(&p, plen, inf))
+ goto err;
+ len = p - cont;
+ } else {
+ len = p - cont + plen;
+ p += plen;
+ }
+ } else if (cst) {
+ if (utype == V_ASN1_NULL || utype == V_ASN1_BOOLEAN
+ || utype == V_ASN1_OBJECT || utype == V_ASN1_INTEGER
+ || utype == V_ASN1_ENUMERATED) {
+ ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ASN1_R_TYPE_NOT_PRIMITIVE);
+ return 0;
+ }
+
+ /* Free any returned 'buf' content */
+ free_cont = 1;
+ /*
+ * Should really check the internal tags are correct but some things
+ * may get this wrong. The relevant specs say that constructed string
+ * types should be OCTET STRINGs internally irrespective of the type.
+ * So instead just check for UNIVERSAL class and ignore the tag.
+ */
+ if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL, 0)) {
+ goto err;
+ }
+ len = buf.length;
+ /* Append a final null to string */
+ if (!BUF_MEM_grow_clean(&buf, len + 1)) {
+ ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ buf.data[len] = 0;
+ cont = (const unsigned char *)buf.data;
+ } else {
+ cont = p;
+ len = plen;
+ p += plen;
+ }
+
+ /* We now have content length and type: translate into a structure */
+ /* asn1_ex_c2i may reuse allocated buffer, and so sets free_cont to 0 */
+ if (!asn1_ex_c2i(pval, cont, len, utype, &free_cont, it))
+ goto err;
+
+ *in = p;
+ ret = 1;
+ err:
+ if (free_cont)
+ OPENSSL_free(buf.data);
+ return ret;
+}
+
+/* Translate ASN1 content octets into a structure */
+
+static int asn1_ex_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
+ int utype, char *free_cont, const ASN1_ITEM *it)
+{
+ ASN1_VALUE **opval = NULL;
+ ASN1_STRING *stmp;
+ ASN1_TYPE *typ = NULL;
+ int ret = 0;
+ const ASN1_PRIMITIVE_FUNCS *pf;
+ ASN1_INTEGER **tint;
+ pf = it->funcs;
+
+ if (pf && pf->prim_c2i)
+ return pf->prim_c2i(pval, cont, len, utype, free_cont, it);
+ /* If ANY type clear type and set pointer to internal value */
+ if (it->utype == V_ASN1_ANY) {
+ if (!*pval) {
+ typ = ASN1_TYPE_new();
+ if (typ == NULL)
+ goto err;
+ *pval = (ASN1_VALUE *)typ;
+ } else
+ typ = (ASN1_TYPE *)*pval;
+
+ if (utype != typ->type)
+ ASN1_TYPE_set(typ, utype, NULL);
+ opval = pval;
+ pval = &typ->value.asn1_value;
+ }
+ switch (utype) {
+ case V_ASN1_OBJECT:
+ if (!c2i_ASN1_OBJECT((ASN1_OBJECT **)pval, &cont, len))
+ goto err;
+ break;
+
+ case V_ASN1_NULL:
+ if (len) {
+ ASN1err(ASN1_F_ASN1_EX_C2I, ASN1_R_NULL_IS_WRONG_LENGTH);
+ goto err;
+ }
+ *pval = (ASN1_VALUE *)1;
+ break;
+
+ case V_ASN1_BOOLEAN:
+ if (len != 1) {
+ ASN1err(ASN1_F_ASN1_EX_C2I, ASN1_R_BOOLEAN_IS_WRONG_LENGTH);
+ goto err;
+ } else {
+ ASN1_BOOLEAN *tbool;
+ tbool = (ASN1_BOOLEAN *)pval;
+ *tbool = *cont;
+ }
+ break;
+
+ case V_ASN1_BIT_STRING:
+ if (!c2i_ASN1_BIT_STRING((ASN1_BIT_STRING **)pval, &cont, len))
+ goto err;
+ break;
+
+ case V_ASN1_INTEGER:
+ case V_ASN1_ENUMERATED:
+ tint = (ASN1_INTEGER **)pval;
+ if (!c2i_ASN1_INTEGER(tint, &cont, len))
+ goto err;
+ /* Fixup type to match the expected form */
+ (*tint)->type = utype | ((*tint)->type & V_ASN1_NEG);
+ break;
+
+ case V_ASN1_OCTET_STRING:
+ case V_ASN1_NUMERICSTRING:
+ case V_ASN1_PRINTABLESTRING:
+ case V_ASN1_T61STRING:
+ case V_ASN1_VIDEOTEXSTRING:
+ case V_ASN1_IA5STRING:
+ case V_ASN1_UTCTIME:
+ case V_ASN1_GENERALIZEDTIME:
+ case V_ASN1_GRAPHICSTRING:
+ case V_ASN1_VISIBLESTRING:
+ case V_ASN1_GENERALSTRING:
+ case V_ASN1_UNIVERSALSTRING:
+ case V_ASN1_BMPSTRING:
+ case V_ASN1_UTF8STRING:
+ case V_ASN1_OTHER:
+ case V_ASN1_SET:
+ case V_ASN1_SEQUENCE:
+ default:
+ if (utype == V_ASN1_BMPSTRING && (len & 1)) {
+ ASN1err(ASN1_F_ASN1_EX_C2I, ASN1_R_BMPSTRING_IS_WRONG_LENGTH);
+ goto err;
+ }
+ if (utype == V_ASN1_UNIVERSALSTRING && (len & 3)) {
+ ASN1err(ASN1_F_ASN1_EX_C2I,
+ ASN1_R_UNIVERSALSTRING_IS_WRONG_LENGTH);
+ goto err;
+ }
+ /* All based on ASN1_STRING and handled the same */
+ if (!*pval) {
+ stmp = ASN1_STRING_type_new(utype);
+ if (stmp == NULL) {
+ ASN1err(ASN1_F_ASN1_EX_C2I, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ *pval = (ASN1_VALUE *)stmp;
+ } else {
+ stmp = (ASN1_STRING *)*pval;
+ stmp->type = utype;
+ }
+ /* If we've already allocated a buffer use it */
+ if (*free_cont) {
+ OPENSSL_free(stmp->data);
+ stmp->data = (unsigned char *)cont; /* UGLY CAST! RL */
+ stmp->length = len;
+ *free_cont = 0;
+ } else {
+ if (!ASN1_STRING_set(stmp, cont, len)) {
+ ASN1err(ASN1_F_ASN1_EX_C2I, ERR_R_MALLOC_FAILURE);
+ ASN1_STRING_free(stmp);
+ *pval = NULL;
+ goto err;
+ }
+ }
+ break;
+ }
+ /* If ASN1_ANY and NULL type fix up value */
+ if (typ && (utype == V_ASN1_NULL))
+ typ->value.ptr = NULL;
+
+ ret = 1;
+ err:
+ if (!ret) {
+ ASN1_TYPE_free(typ);
+ if (opval)
+ *opval = NULL;
+ }
+ return ret;
+}
+
+/*
+ * This function finds the end of an ASN1 structure when passed its maximum
+ * length, whether it is indefinite length and a pointer to the content. This
+ * is more efficient than calling asn1_collect because it does not recurse on
+ * each indefinite length header.
+ */
+
+static int asn1_find_end(const unsigned char **in, long len, char inf)
+{
+ uint32_t expected_eoc;
+ long plen;
+ const unsigned char *p = *in, *q;
+ /* If not indefinite length constructed just add length */
+ if (inf == 0) {
+ *in += len;
+ return 1;
+ }
+ expected_eoc = 1;
+ /*
+ * Indefinite length constructed form. Find the end when enough EOCs are
+ * found. If more indefinite length constructed headers are encountered
+ * increment the expected eoc count otherwise just skip to the end of the
+ * data.
+ */
+ while (len > 0) {
+ if (asn1_check_eoc(&p, len)) {
+ expected_eoc--;
+ if (expected_eoc == 0)
+ break;
+ len -= 2;
+ continue;
+ }
+ q = p;
+ /* Just read in a header: only care about the length */
+ if (!asn1_check_tlen(&plen, NULL, NULL, &inf, NULL, &p, len,
+ -1, 0, 0, NULL)) {
+ ASN1err(ASN1_F_ASN1_FIND_END, ERR_R_NESTED_ASN1_ERROR);
+ return 0;
+ }
+ if (inf) {
+ if (expected_eoc == UINT32_MAX) {
+ ASN1err(ASN1_F_ASN1_FIND_END, ERR_R_NESTED_ASN1_ERROR);
+ return 0;
+ }
+ expected_eoc++;
+ } else {
+ p += plen;
+ }
+ len -= p - q;
+ }
+ if (expected_eoc) {
+ ASN1err(ASN1_F_ASN1_FIND_END, ASN1_R_MISSING_EOC);
+ return 0;
+ }
+ *in = p;
+ return 1;
+}
+
+/*
+ * This function collects the asn1 data from a constructed string type into
+ * a buffer. The values of 'in' and 'len' should refer to the contents of the
+ * constructed type and 'inf' should be set if it is indefinite length.
+ */
+
+#ifndef ASN1_MAX_STRING_NEST
+/*
+ * This determines how many levels of recursion are permitted in ASN1 string
+ * types. If it is not limited stack overflows can occur. If set to zero no
+ * recursion is allowed at all. Although zero should be adequate examples
+ * exist that require a value of 1. So 5 should be more than enough.
+ */
+# define ASN1_MAX_STRING_NEST 5
+#endif
+
+static int asn1_collect(BUF_MEM *buf, const unsigned char **in, long len,
+ char inf, int tag, int aclass, int depth)
+{
+ const unsigned char *p, *q;
+ long plen;
+ char cst, ininf;
+ p = *in;
+ inf &= 1;
+ /*
+ * If no buffer and not indefinite length constructed just pass over the
+ * encoded data
+ */
+ if (!buf && !inf) {
+ *in += len;
+ return 1;
+ }
+ while (len > 0) {
+ q = p;
+ /* Check for EOC */
+ if (asn1_check_eoc(&p, len)) {
+ /*
+ * EOC is illegal outside indefinite length constructed form
+ */
+ if (!inf) {
+ ASN1err(ASN1_F_ASN1_COLLECT, ASN1_R_UNEXPECTED_EOC);
+ return 0;
+ }
+ inf = 0;
+ break;
+ }
+
+ if (!asn1_check_tlen(&plen, NULL, NULL, &ininf, &cst, &p,
+ len, tag, aclass, 0, NULL)) {
+ ASN1err(ASN1_F_ASN1_COLLECT, ERR_R_NESTED_ASN1_ERROR);
+ return 0;
+ }
+
+ /* If indefinite length constructed update max length */
+ if (cst) {
+ if (depth >= ASN1_MAX_STRING_NEST) {
+ ASN1err(ASN1_F_ASN1_COLLECT, ASN1_R_NESTED_ASN1_STRING);
+ return 0;
+ }
+ if (!asn1_collect(buf, &p, plen, ininf, tag, aclass, depth + 1))
+ return 0;
+ } else if (plen && !collect_data(buf, &p, plen))
+ return 0;
+ len -= p - q;
+ }
+ if (inf) {
+ ASN1err(ASN1_F_ASN1_COLLECT, ASN1_R_MISSING_EOC);
+ return 0;
+ }
+ *in = p;
+ return 1;
+}
+
+static int collect_data(BUF_MEM *buf, const unsigned char **p, long plen)
+{
+ int len;
+ if (buf) {
+ len = buf->length;
+ if (!BUF_MEM_grow_clean(buf, len + plen)) {
+ ASN1err(ASN1_F_COLLECT_DATA, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ memcpy(buf->data + len, *p, plen);
+ }
+ *p += plen;
+ return 1;
+}
+
+/* Check for ASN1 EOC and swallow it if found */
+
+static int asn1_check_eoc(const unsigned char **in, long len)
+{
+ const unsigned char *p;
+ if (len < 2)
+ return 0;
+ p = *in;
+ if (!p[0] && !p[1]) {
+ *in += 2;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Check an ASN1 tag and length: a bit like ASN1_get_object but it sets the
+ * length for indefinite length constructed form, we don't know the exact
+ * length but we can set an upper bound to the amount of data available minus
+ * the header length just read.
+ */
+
+static int asn1_check_tlen(long *olen, int *otag, unsigned char *oclass,
+ char *inf, char *cst,
+ const unsigned char **in, long len,
+ int exptag, int expclass, char opt, ASN1_TLC *ctx)
+{
+ int i;
+ int ptag, pclass;
+ long plen;
+ const unsigned char *p, *q;
+ p = *in;
+ q = p;
+
+ if (ctx && ctx->valid) {
+ i = ctx->ret;
+ plen = ctx->plen;
+ pclass = ctx->pclass;
+ ptag = ctx->ptag;
+ p += ctx->hdrlen;
+ } else {
+ i = ASN1_get_object(&p, &plen, &ptag, &pclass, len);
+ if (ctx) {
+ ctx->ret = i;
+ ctx->plen = plen;
+ ctx->pclass = pclass;
+ ctx->ptag = ptag;
+ ctx->hdrlen = p - q;
+ ctx->valid = 1;
+ /*
+ * If definite length, and no error, length + header can't exceed
+ * total amount of data available.
+ */
+ if (!(i & 0x81) && ((plen + ctx->hdrlen) > len)) {
+ ASN1err(ASN1_F_ASN1_CHECK_TLEN, ASN1_R_TOO_LONG);
+ asn1_tlc_clear(ctx);
+ return 0;
+ }
+ }
+ }
+
+ if (i & 0x80) {
+ ASN1err(ASN1_F_ASN1_CHECK_TLEN, ASN1_R_BAD_OBJECT_HEADER);
+ asn1_tlc_clear(ctx);
+ return 0;
+ }
+ if (exptag >= 0) {
+ if ((exptag != ptag) || (expclass != pclass)) {
+ /*
+ * If type is OPTIONAL, not an error: indicate missing type.
+ */
+ if (opt)
+ return -1;
+ asn1_tlc_clear(ctx);
+ ASN1err(ASN1_F_ASN1_CHECK_TLEN, ASN1_R_WRONG_TAG);
+ return 0;
+ }
+ /*
+ * We have a tag and class match: assume we are going to do something
+ * with it
+ */
+ asn1_tlc_clear(ctx);
+ }
+
+ if (i & 1)
+ plen = len - (p - q);
+
+ if (inf)
+ *inf = i & 1;
+
+ if (cst)
+ *cst = i & V_ASN1_CONSTRUCTED;
+
+ if (olen)
+ *olen = plen;
+
+ if (oclass)
+ *oclass = pclass;
+
+ if (otag)
+ *otag = ptag;
+
+ *in = p;
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/tasn_enc.c b/openssl-1.1.0h/crypto/asn1/tasn_enc.c
new file mode 100644
index 0000000..caa4869
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/tasn_enc.c
@@ -0,0 +1,605 @@
+/*
+ * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+#include <openssl/objects.h>
+#include "internal/asn1_int.h"
+#include "asn1_locl.h"
+
+static int asn1_i2d_ex_primitive(ASN1_VALUE **pval, unsigned char **out,
+ const ASN1_ITEM *it, int tag, int aclass);
+static int asn1_set_seq_out(STACK_OF(ASN1_VALUE) *sk, unsigned char **out,
+ int skcontlen, const ASN1_ITEM *item,
+ int do_sort, int iclass);
+static int asn1_template_ex_i2d(ASN1_VALUE **pval, unsigned char **out,
+ const ASN1_TEMPLATE *tt, int tag, int aclass);
+static int asn1_item_flags_i2d(ASN1_VALUE *val, unsigned char **out,
+ const ASN1_ITEM *it, int flags);
+static int asn1_ex_i2c(ASN1_VALUE **pval, unsigned char *cout, int *putype,
+ const ASN1_ITEM *it);
+
+/*
+ * Top level i2d equivalents: the 'ndef' variant instructs the encoder to use
+ * indefinite length constructed encoding, where appropriate
+ */
+
+int ASN1_item_ndef_i2d(ASN1_VALUE *val, unsigned char **out,
+ const ASN1_ITEM *it)
+{
+ return asn1_item_flags_i2d(val, out, it, ASN1_TFLG_NDEF);
+}
+
+int ASN1_item_i2d(ASN1_VALUE *val, unsigned char **out, const ASN1_ITEM *it)
+{
+ return asn1_item_flags_i2d(val, out, it, 0);
+}
+
+/*
+ * Encode an ASN1 item, this is use by the standard 'i2d' function. 'out'
+ * points to a buffer to output the data to. The new i2d has one additional
+ * feature. If the output buffer is NULL (i.e. *out == NULL) then a buffer is
+ * allocated and populated with the encoding.
+ */
+
+static int asn1_item_flags_i2d(ASN1_VALUE *val, unsigned char **out,
+ const ASN1_ITEM *it, int flags)
+{
+ if (out && !*out) {
+ unsigned char *p, *buf;
+ int len;
+ len = ASN1_item_ex_i2d(&val, NULL, it, -1, flags);
+ if (len <= 0)
+ return len;
+ buf = OPENSSL_malloc(len);
+ if (buf == NULL)
+ return -1;
+ p = buf;
+ ASN1_item_ex_i2d(&val, &p, it, -1, flags);
+ *out = buf;
+ return len;
+ }
+
+ return ASN1_item_ex_i2d(&val, out, it, -1, flags);
+}
+
+/*
+ * Encode an item, taking care of IMPLICIT tagging (if any). This function
+ * performs the normal item handling: it can be used in external types.
+ */
+
+int ASN1_item_ex_i2d(ASN1_VALUE **pval, unsigned char **out,
+ const ASN1_ITEM *it, int tag, int aclass)
+{
+ const ASN1_TEMPLATE *tt = NULL;
+ int i, seqcontlen, seqlen, ndef = 1;
+ const ASN1_EXTERN_FUNCS *ef;
+ const ASN1_AUX *aux = it->funcs;
+ ASN1_aux_cb *asn1_cb = 0;
+
+ if ((it->itype != ASN1_ITYPE_PRIMITIVE) && !*pval)
+ return 0;
+
+ if (aux && aux->asn1_cb)
+ asn1_cb = aux->asn1_cb;
+
+ switch (it->itype) {
+
+ case ASN1_ITYPE_PRIMITIVE:
+ if (it->templates)
+ return asn1_template_ex_i2d(pval, out, it->templates,
+ tag, aclass);
+ return asn1_i2d_ex_primitive(pval, out, it, tag, aclass);
+
+ case ASN1_ITYPE_MSTRING:
+ return asn1_i2d_ex_primitive(pval, out, it, -1, aclass);
+
+ case ASN1_ITYPE_CHOICE:
+ if (asn1_cb && !asn1_cb(ASN1_OP_I2D_PRE, pval, it, NULL))
+ return 0;
+ i = asn1_get_choice_selector(pval, it);
+ if ((i >= 0) && (i < it->tcount)) {
+ ASN1_VALUE **pchval;
+ const ASN1_TEMPLATE *chtt;
+ chtt = it->templates + i;
+ pchval = asn1_get_field_ptr(pval, chtt);
+ return asn1_template_ex_i2d(pchval, out, chtt, -1, aclass);
+ }
+ /* Fixme: error condition if selector out of range */
+ if (asn1_cb && !asn1_cb(ASN1_OP_I2D_POST, pval, it, NULL))
+ return 0;
+ break;
+
+ case ASN1_ITYPE_EXTERN:
+ /* If new style i2d it does all the work */
+ ef = it->funcs;
+ return ef->asn1_ex_i2d(pval, out, it, tag, aclass);
+
+ case ASN1_ITYPE_NDEF_SEQUENCE:
+ /* Use indefinite length constructed if requested */
+ if (aclass & ASN1_TFLG_NDEF)
+ ndef = 2;
+ /* fall through */
+
+ case ASN1_ITYPE_SEQUENCE:
+ i = asn1_enc_restore(&seqcontlen, out, pval, it);
+ /* An error occurred */
+ if (i < 0)
+ return 0;
+ /* We have a valid cached encoding... */
+ if (i > 0)
+ return seqcontlen;
+ /* Otherwise carry on */
+ seqcontlen = 0;
+ /* If no IMPLICIT tagging set to SEQUENCE, UNIVERSAL */
+ if (tag == -1) {
+ tag = V_ASN1_SEQUENCE;
+ /* Retain any other flags in aclass */
+ aclass = (aclass & ~ASN1_TFLG_TAG_CLASS)
+ | V_ASN1_UNIVERSAL;
+ }
+ if (asn1_cb && !asn1_cb(ASN1_OP_I2D_PRE, pval, it, NULL))
+ return 0;
+ /* First work out sequence content length */
+ for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) {
+ const ASN1_TEMPLATE *seqtt;
+ ASN1_VALUE **pseqval;
+ int tmplen;
+ seqtt = asn1_do_adb(pval, tt, 1);
+ if (!seqtt)
+ return 0;
+ pseqval = asn1_get_field_ptr(pval, seqtt);
+ tmplen = asn1_template_ex_i2d(pseqval, NULL, seqtt, -1, aclass);
+ if (tmplen == -1 || (tmplen > INT_MAX - seqcontlen))
+ return -1;
+ seqcontlen += tmplen;
+ }
+
+ seqlen = ASN1_object_size(ndef, seqcontlen, tag);
+ if (!out || seqlen == -1)
+ return seqlen;
+ /* Output SEQUENCE header */
+ ASN1_put_object(out, ndef, seqcontlen, tag, aclass);
+ for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) {
+ const ASN1_TEMPLATE *seqtt;
+ ASN1_VALUE **pseqval;
+ seqtt = asn1_do_adb(pval, tt, 1);
+ if (!seqtt)
+ return 0;
+ pseqval = asn1_get_field_ptr(pval, seqtt);
+ /* FIXME: check for errors in enhanced version */
+ asn1_template_ex_i2d(pseqval, out, seqtt, -1, aclass);
+ }
+ if (ndef == 2)
+ ASN1_put_eoc(out);
+ if (asn1_cb && !asn1_cb(ASN1_OP_I2D_POST, pval, it, NULL))
+ return 0;
+ return seqlen;
+
+ default:
+ return 0;
+
+ }
+ return 0;
+}
+
+static int asn1_template_ex_i2d(ASN1_VALUE **pval, unsigned char **out,
+ const ASN1_TEMPLATE *tt, int tag, int iclass)
+{
+ int i, ret, flags, ttag, tclass, ndef;
+ ASN1_VALUE *tval;
+ flags = tt->flags;
+
+ /*
+ * If field is embedded then val needs fixing so it is a pointer to
+ * a pointer to a field.
+ */
+ if (flags & ASN1_TFLG_EMBED) {
+ tval = (ASN1_VALUE *)pval;
+ pval = &tval;
+ }
+ /*
+ * Work out tag and class to use: tagging may come either from the
+ * template or the arguments, not both because this would create
+ * ambiguity. Additionally the iclass argument may contain some
+ * additional flags which should be noted and passed down to other
+ * levels.
+ */
+ if (flags & ASN1_TFLG_TAG_MASK) {
+ /* Error if argument and template tagging */
+ if (tag != -1)
+ /* FIXME: error code here */
+ return -1;
+ /* Get tagging from template */
+ ttag = tt->tag;
+ tclass = flags & ASN1_TFLG_TAG_CLASS;
+ } else if (tag != -1) {
+ /* No template tagging, get from arguments */
+ ttag = tag;
+ tclass = iclass & ASN1_TFLG_TAG_CLASS;
+ } else {
+ ttag = -1;
+ tclass = 0;
+ }
+ /*
+ * Remove any class mask from iflag.
+ */
+ iclass &= ~ASN1_TFLG_TAG_CLASS;
+
+ /*
+ * At this point 'ttag' contains the outer tag to use, 'tclass' is the
+ * class and iclass is any flags passed to this function.
+ */
+
+ /* if template and arguments require ndef, use it */
+ if ((flags & ASN1_TFLG_NDEF) && (iclass & ASN1_TFLG_NDEF))
+ ndef = 2;
+ else
+ ndef = 1;
+
+ if (flags & ASN1_TFLG_SK_MASK) {
+ /* SET OF, SEQUENCE OF */
+ STACK_OF(ASN1_VALUE) *sk = (STACK_OF(ASN1_VALUE) *)*pval;
+ int isset, sktag, skaclass;
+ int skcontlen, sklen;
+ ASN1_VALUE *skitem;
+
+ if (!*pval)
+ return 0;
+
+ if (flags & ASN1_TFLG_SET_OF) {
+ isset = 1;
+ /* 2 means we reorder */
+ if (flags & ASN1_TFLG_SEQUENCE_OF)
+ isset = 2;
+ } else
+ isset = 0;
+
+ /*
+ * Work out inner tag value: if EXPLICIT or no tagging use underlying
+ * type.
+ */
+ if ((ttag != -1) && !(flags & ASN1_TFLG_EXPTAG)) {
+ sktag = ttag;
+ skaclass = tclass;
+ } else {
+ skaclass = V_ASN1_UNIVERSAL;
+ if (isset)
+ sktag = V_ASN1_SET;
+ else
+ sktag = V_ASN1_SEQUENCE;
+ }
+
+ /* Determine total length of items */
+ skcontlen = 0;
+ for (i = 0; i < sk_ASN1_VALUE_num(sk); i++) {
+ int tmplen;
+ skitem = sk_ASN1_VALUE_value(sk, i);
+ tmplen = ASN1_item_ex_i2d(&skitem, NULL, ASN1_ITEM_ptr(tt->item),
+ -1, iclass);
+ if (tmplen == -1 || (skcontlen > INT_MAX - tmplen))
+ return -1;
+ skcontlen += tmplen;
+ }
+ sklen = ASN1_object_size(ndef, skcontlen, sktag);
+ if (sklen == -1)
+ return -1;
+ /* If EXPLICIT need length of surrounding tag */
+ if (flags & ASN1_TFLG_EXPTAG)
+ ret = ASN1_object_size(ndef, sklen, ttag);
+ else
+ ret = sklen;
+
+ if (!out || ret == -1)
+ return ret;
+
+ /* Now encode this lot... */
+ /* EXPLICIT tag */
+ if (flags & ASN1_TFLG_EXPTAG)
+ ASN1_put_object(out, ndef, sklen, ttag, tclass);
+ /* SET or SEQUENCE and IMPLICIT tag */
+ ASN1_put_object(out, ndef, skcontlen, sktag, skaclass);
+ /* And the stuff itself */
+ asn1_set_seq_out(sk, out, skcontlen, ASN1_ITEM_ptr(tt->item),
+ isset, iclass);
+ if (ndef == 2) {
+ ASN1_put_eoc(out);
+ if (flags & ASN1_TFLG_EXPTAG)
+ ASN1_put_eoc(out);
+ }
+
+ return ret;
+ }
+
+ if (flags & ASN1_TFLG_EXPTAG) {
+ /* EXPLICIT tagging */
+ /* Find length of tagged item */
+ i = ASN1_item_ex_i2d(pval, NULL, ASN1_ITEM_ptr(tt->item), -1, iclass);
+ if (!i)
+ return 0;
+ /* Find length of EXPLICIT tag */
+ ret = ASN1_object_size(ndef, i, ttag);
+ if (out && ret != -1) {
+ /* Output tag and item */
+ ASN1_put_object(out, ndef, i, ttag, tclass);
+ ASN1_item_ex_i2d(pval, out, ASN1_ITEM_ptr(tt->item), -1, iclass);
+ if (ndef == 2)
+ ASN1_put_eoc(out);
+ }
+ return ret;
+ }
+
+ /* Either normal or IMPLICIT tagging: combine class and flags */
+ return ASN1_item_ex_i2d(pval, out, ASN1_ITEM_ptr(tt->item),
+ ttag, tclass | iclass);
+
+}
+
+/* Temporary structure used to hold DER encoding of items for SET OF */
+
+typedef struct {
+ unsigned char *data;
+ int length;
+ ASN1_VALUE *field;
+} DER_ENC;
+
+static int der_cmp(const void *a, const void *b)
+{
+ const DER_ENC *d1 = a, *d2 = b;
+ int cmplen, i;
+ cmplen = (d1->length < d2->length) ? d1->length : d2->length;
+ i = memcmp(d1->data, d2->data, cmplen);
+ if (i)
+ return i;
+ return d1->length - d2->length;
+}
+
+/* Output the content octets of SET OF or SEQUENCE OF */
+
+static int asn1_set_seq_out(STACK_OF(ASN1_VALUE) *sk, unsigned char **out,
+ int skcontlen, const ASN1_ITEM *item,
+ int do_sort, int iclass)
+{
+ int i;
+ ASN1_VALUE *skitem;
+ unsigned char *tmpdat = NULL, *p = NULL;
+ DER_ENC *derlst = NULL, *tder;
+ if (do_sort) {
+ /* Don't need to sort less than 2 items */
+ if (sk_ASN1_VALUE_num(sk) < 2)
+ do_sort = 0;
+ else {
+ derlst = OPENSSL_malloc(sk_ASN1_VALUE_num(sk)
+ * sizeof(*derlst));
+ if (derlst == NULL)
+ return 0;
+ tmpdat = OPENSSL_malloc(skcontlen);
+ if (tmpdat == NULL) {
+ OPENSSL_free(derlst);
+ return 0;
+ }
+ }
+ }
+ /* If not sorting just output each item */
+ if (!do_sort) {
+ for (i = 0; i < sk_ASN1_VALUE_num(sk); i++) {
+ skitem = sk_ASN1_VALUE_value(sk, i);
+ ASN1_item_ex_i2d(&skitem, out, item, -1, iclass);
+ }
+ return 1;
+ }
+ p = tmpdat;
+
+ /* Doing sort: build up a list of each member's DER encoding */
+ for (i = 0, tder = derlst; i < sk_ASN1_VALUE_num(sk); i++, tder++) {
+ skitem = sk_ASN1_VALUE_value(sk, i);
+ tder->data = p;
+ tder->length = ASN1_item_ex_i2d(&skitem, &p, item, -1, iclass);
+ tder->field = skitem;
+ }
+
+ /* Now sort them */
+ qsort(derlst, sk_ASN1_VALUE_num(sk), sizeof(*derlst), der_cmp);
+ /* Output sorted DER encoding */
+ p = *out;
+ for (i = 0, tder = derlst; i < sk_ASN1_VALUE_num(sk); i++, tder++) {
+ memcpy(p, tder->data, tder->length);
+ p += tder->length;
+ }
+ *out = p;
+ /* If do_sort is 2 then reorder the STACK */
+ if (do_sort == 2) {
+ for (i = 0, tder = derlst; i < sk_ASN1_VALUE_num(sk); i++, tder++)
+ (void)sk_ASN1_VALUE_set(sk, i, tder->field);
+ }
+ OPENSSL_free(derlst);
+ OPENSSL_free(tmpdat);
+ return 1;
+}
+
+static int asn1_i2d_ex_primitive(ASN1_VALUE **pval, unsigned char **out,
+ const ASN1_ITEM *it, int tag, int aclass)
+{
+ int len;
+ int utype;
+ int usetag;
+ int ndef = 0;
+
+ utype = it->utype;
+
+ /*
+ * Get length of content octets and maybe find out the underlying type.
+ */
+
+ len = asn1_ex_i2c(pval, NULL, &utype, it);
+
+ /*
+ * If SEQUENCE, SET or OTHER then header is included in pseudo content
+ * octets so don't include tag+length. We need to check here because the
+ * call to asn1_ex_i2c() could change utype.
+ */
+ if ((utype == V_ASN1_SEQUENCE) || (utype == V_ASN1_SET) ||
+ (utype == V_ASN1_OTHER))
+ usetag = 0;
+ else
+ usetag = 1;
+
+ /* -1 means omit type */
+
+ if (len == -1)
+ return 0;
+
+ /* -2 return is special meaning use ndef */
+ if (len == -2) {
+ ndef = 2;
+ len = 0;
+ }
+
+ /* If not implicitly tagged get tag from underlying type */
+ if (tag == -1)
+ tag = utype;
+
+ /* Output tag+length followed by content octets */
+ if (out) {
+ if (usetag)
+ ASN1_put_object(out, ndef, len, tag, aclass);
+ asn1_ex_i2c(pval, *out, &utype, it);
+ if (ndef)
+ ASN1_put_eoc(out);
+ else
+ *out += len;
+ }
+
+ if (usetag)
+ return ASN1_object_size(ndef, len, tag);
+ return len;
+}
+
+/* Produce content octets from a structure */
+
+static int asn1_ex_i2c(ASN1_VALUE **pval, unsigned char *cout, int *putype,
+ const ASN1_ITEM *it)
+{
+ ASN1_BOOLEAN *tbool = NULL;
+ ASN1_STRING *strtmp;
+ ASN1_OBJECT *otmp;
+ int utype;
+ const unsigned char *cont;
+ unsigned char c;
+ int len;
+ const ASN1_PRIMITIVE_FUNCS *pf;
+ pf = it->funcs;
+ if (pf && pf->prim_i2c)
+ return pf->prim_i2c(pval, cout, putype, it);
+
+ /* Should type be omitted? */
+ if ((it->itype != ASN1_ITYPE_PRIMITIVE)
+ || (it->utype != V_ASN1_BOOLEAN)) {
+ if (!*pval)
+ return -1;
+ }
+
+ if (it->itype == ASN1_ITYPE_MSTRING) {
+ /* If MSTRING type set the underlying type */
+ strtmp = (ASN1_STRING *)*pval;
+ utype = strtmp->type;
+ *putype = utype;
+ } else if (it->utype == V_ASN1_ANY) {
+ /* If ANY set type and pointer to value */
+ ASN1_TYPE *typ;
+ typ = (ASN1_TYPE *)*pval;
+ utype = typ->type;
+ *putype = utype;
+ pval = &typ->value.asn1_value;
+ } else
+ utype = *putype;
+
+ switch (utype) {
+ case V_ASN1_OBJECT:
+ otmp = (ASN1_OBJECT *)*pval;
+ cont = otmp->data;
+ len = otmp->length;
+ break;
+
+ case V_ASN1_NULL:
+ cont = NULL;
+ len = 0;
+ break;
+
+ case V_ASN1_BOOLEAN:
+ tbool = (ASN1_BOOLEAN *)pval;
+ if (*tbool == -1)
+ return -1;
+ if (it->utype != V_ASN1_ANY) {
+ /*
+ * Default handling if value == size field then omit
+ */
+ if (*tbool && (it->size > 0))
+ return -1;
+ if (!*tbool && !it->size)
+ return -1;
+ }
+ c = (unsigned char)*tbool;
+ cont = &c;
+ len = 1;
+ break;
+
+ case V_ASN1_BIT_STRING:
+ return i2c_ASN1_BIT_STRING((ASN1_BIT_STRING *)*pval,
+ cout ? &cout : NULL);
+
+ case V_ASN1_INTEGER:
+ case V_ASN1_ENUMERATED:
+ /*
+ * These are all have the same content format as ASN1_INTEGER
+ */
+ return i2c_ASN1_INTEGER((ASN1_INTEGER *)*pval, cout ? &cout : NULL);
+
+ case V_ASN1_OCTET_STRING:
+ case V_ASN1_NUMERICSTRING:
+ case V_ASN1_PRINTABLESTRING:
+ case V_ASN1_T61STRING:
+ case V_ASN1_VIDEOTEXSTRING:
+ case V_ASN1_IA5STRING:
+ case V_ASN1_UTCTIME:
+ case V_ASN1_GENERALIZEDTIME:
+ case V_ASN1_GRAPHICSTRING:
+ case V_ASN1_VISIBLESTRING:
+ case V_ASN1_GENERALSTRING:
+ case V_ASN1_UNIVERSALSTRING:
+ case V_ASN1_BMPSTRING:
+ case V_ASN1_UTF8STRING:
+ case V_ASN1_SEQUENCE:
+ case V_ASN1_SET:
+ default:
+ /* All based on ASN1_STRING and handled the same */
+ strtmp = (ASN1_STRING *)*pval;
+ /* Special handling for NDEF */
+ if ((it->size == ASN1_TFLG_NDEF)
+ && (strtmp->flags & ASN1_STRING_FLAG_NDEF)) {
+ if (cout) {
+ strtmp->data = cout;
+ strtmp->length = 0;
+ }
+ /* Special return code */
+ return -2;
+ }
+ cont = strtmp->data;
+ len = strtmp->length;
+
+ break;
+
+ }
+ if (cout && len)
+ memcpy(cout, cont, len);
+ return len;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/tasn_fre.c b/openssl-1.1.0h/crypto/asn1/tasn_fre.c
new file mode 100644
index 0000000..bbce489
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/tasn_fre.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stddef.h>
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+#include <openssl/objects.h>
+#include "asn1_locl.h"
+
+/* Free up an ASN1 structure */
+
+void ASN1_item_free(ASN1_VALUE *val, const ASN1_ITEM *it)
+{
+ asn1_item_embed_free(&val, it, 0);
+}
+
+void ASN1_item_ex_free(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ asn1_item_embed_free(pval, it, 0);
+}
+
+void asn1_item_embed_free(ASN1_VALUE **pval, const ASN1_ITEM *it, int embed)
+{
+ const ASN1_TEMPLATE *tt = NULL, *seqtt;
+ const ASN1_EXTERN_FUNCS *ef;
+ const ASN1_AUX *aux = it->funcs;
+ ASN1_aux_cb *asn1_cb;
+ int i;
+
+ if (!pval)
+ return;
+ if ((it->itype != ASN1_ITYPE_PRIMITIVE) && !*pval)
+ return;
+ if (aux && aux->asn1_cb)
+ asn1_cb = aux->asn1_cb;
+ else
+ asn1_cb = 0;
+
+ switch (it->itype) {
+
+ case ASN1_ITYPE_PRIMITIVE:
+ if (it->templates)
+ asn1_template_free(pval, it->templates);
+ else
+ asn1_primitive_free(pval, it, embed);
+ break;
+
+ case ASN1_ITYPE_MSTRING:
+ asn1_primitive_free(pval, it, embed);
+ break;
+
+ case ASN1_ITYPE_CHOICE:
+ if (asn1_cb) {
+ i = asn1_cb(ASN1_OP_FREE_PRE, pval, it, NULL);
+ if (i == 2)
+ return;
+ }
+ i = asn1_get_choice_selector(pval, it);
+ if ((i >= 0) && (i < it->tcount)) {
+ ASN1_VALUE **pchval;
+
+ tt = it->templates + i;
+ pchval = asn1_get_field_ptr(pval, tt);
+ asn1_template_free(pchval, tt);
+ }
+ if (asn1_cb)
+ asn1_cb(ASN1_OP_FREE_POST, pval, it, NULL);
+ if (embed == 0) {
+ OPENSSL_free(*pval);
+ *pval = NULL;
+ }
+ break;
+
+ case ASN1_ITYPE_EXTERN:
+ ef = it->funcs;
+ if (ef && ef->asn1_ex_free)
+ ef->asn1_ex_free(pval, it);
+ break;
+
+ case ASN1_ITYPE_NDEF_SEQUENCE:
+ case ASN1_ITYPE_SEQUENCE:
+ if (asn1_do_lock(pval, -1, it) != 0) /* if error or ref-counter > 0 */
+ return;
+ if (asn1_cb) {
+ i = asn1_cb(ASN1_OP_FREE_PRE, pval, it, NULL);
+ if (i == 2)
+ return;
+ }
+ asn1_enc_free(pval, it);
+ /*
+ * If we free up as normal we will invalidate any ANY DEFINED BY
+ * field and we won't be able to determine the type of the field it
+ * defines. So free up in reverse order.
+ */
+ tt = it->templates + it->tcount;
+ for (i = 0; i < it->tcount; i++) {
+ ASN1_VALUE **pseqval;
+
+ tt--;
+ seqtt = asn1_do_adb(pval, tt, 0);
+ if (!seqtt)
+ continue;
+ pseqval = asn1_get_field_ptr(pval, seqtt);
+ asn1_template_free(pseqval, seqtt);
+ }
+ if (asn1_cb)
+ asn1_cb(ASN1_OP_FREE_POST, pval, it, NULL);
+ if (embed == 0) {
+ OPENSSL_free(*pval);
+ *pval = NULL;
+ }
+ break;
+ }
+}
+
+void asn1_template_free(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt)
+{
+ int embed = tt->flags & ASN1_TFLG_EMBED;
+ ASN1_VALUE *tval;
+ if (embed) {
+ tval = (ASN1_VALUE *)pval;
+ pval = &tval;
+ }
+ if (tt->flags & ASN1_TFLG_SK_MASK) {
+ STACK_OF(ASN1_VALUE) *sk = (STACK_OF(ASN1_VALUE) *)*pval;
+ int i;
+
+ for (i = 0; i < sk_ASN1_VALUE_num(sk); i++) {
+ ASN1_VALUE *vtmp = sk_ASN1_VALUE_value(sk, i);
+
+ asn1_item_embed_free(&vtmp, ASN1_ITEM_ptr(tt->item), embed);
+ }
+ sk_ASN1_VALUE_free(sk);
+ *pval = NULL;
+ } else {
+ asn1_item_embed_free(pval, ASN1_ITEM_ptr(tt->item), embed);
+ }
+}
+
+void asn1_primitive_free(ASN1_VALUE **pval, const ASN1_ITEM *it, int embed)
+{
+ int utype;
+
+ /* Special case: if 'it' is a primitive with a free_func, use that. */
+ if (it) {
+ const ASN1_PRIMITIVE_FUNCS *pf = it->funcs;
+
+ if (embed) {
+ if (pf && pf->prim_clear) {
+ pf->prim_clear(pval, it);
+ return;
+ }
+ } else if (pf && pf->prim_free) {
+ pf->prim_free(pval, it);
+ return;
+ }
+ }
+
+ /* Special case: if 'it' is NULL, free contents of ASN1_TYPE */
+ if (!it) {
+ ASN1_TYPE *typ = (ASN1_TYPE *)*pval;
+
+ utype = typ->type;
+ pval = &typ->value.asn1_value;
+ if (!*pval)
+ return;
+ } else if (it->itype == ASN1_ITYPE_MSTRING) {
+ utype = -1;
+ if (!*pval)
+ return;
+ } else {
+ utype = it->utype;
+ if ((utype != V_ASN1_BOOLEAN) && !*pval)
+ return;
+ }
+
+ switch (utype) {
+ case V_ASN1_OBJECT:
+ ASN1_OBJECT_free((ASN1_OBJECT *)*pval);
+ break;
+
+ case V_ASN1_BOOLEAN:
+ if (it)
+ *(ASN1_BOOLEAN *)pval = it->size;
+ else
+ *(ASN1_BOOLEAN *)pval = -1;
+ return;
+
+ case V_ASN1_NULL:
+ break;
+
+ case V_ASN1_ANY:
+ asn1_primitive_free(pval, NULL, 0);
+ OPENSSL_free(*pval);
+ break;
+
+ default:
+ asn1_string_embed_free((ASN1_STRING *)*pval, embed);
+ break;
+ }
+ *pval = NULL;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/tasn_new.c b/openssl-1.1.0h/crypto/asn1/tasn_new.c
new file mode 100644
index 0000000..11c8040
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/tasn_new.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stddef.h>
+#include <openssl/asn1.h>
+#include <openssl/objects.h>
+#include <openssl/err.h>
+#include <openssl/asn1t.h>
+#include <string.h>
+#include "asn1_locl.h"
+
+static int asn1_item_embed_new(ASN1_VALUE **pval, const ASN1_ITEM *it,
+ int embed);
+static int asn1_primitive_new(ASN1_VALUE **pval, const ASN1_ITEM *it,
+ int embed);
+static void asn1_item_clear(ASN1_VALUE **pval, const ASN1_ITEM *it);
+static int asn1_template_new(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt);
+static void asn1_template_clear(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt);
+static void asn1_primitive_clear(ASN1_VALUE **pval, const ASN1_ITEM *it);
+
+ASN1_VALUE *ASN1_item_new(const ASN1_ITEM *it)
+{
+ ASN1_VALUE *ret = NULL;
+ if (ASN1_item_ex_new(&ret, it) > 0)
+ return ret;
+ return NULL;
+}
+
+/* Allocate an ASN1 structure */
+
+int ASN1_item_ex_new(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ return asn1_item_embed_new(pval, it, 0);
+}
+
+int asn1_item_embed_new(ASN1_VALUE **pval, const ASN1_ITEM *it, int embed)
+{
+ const ASN1_TEMPLATE *tt = NULL;
+ const ASN1_EXTERN_FUNCS *ef;
+ const ASN1_AUX *aux = it->funcs;
+ ASN1_aux_cb *asn1_cb;
+ ASN1_VALUE **pseqval;
+ int i;
+ if (aux && aux->asn1_cb)
+ asn1_cb = aux->asn1_cb;
+ else
+ asn1_cb = 0;
+
+#ifndef OPENSSL_NO_CRYPTO_MDEBUG
+ OPENSSL_mem_debug_push(it->sname ? it->sname : "asn1_item_embed_new");
+#endif
+
+ switch (it->itype) {
+
+ case ASN1_ITYPE_EXTERN:
+ ef = it->funcs;
+ if (ef && ef->asn1_ex_new) {
+ if (!ef->asn1_ex_new(pval, it))
+ goto memerr;
+ }
+ break;
+
+ case ASN1_ITYPE_PRIMITIVE:
+ if (it->templates) {
+ if (!asn1_template_new(pval, it->templates))
+ goto memerr;
+ } else if (!asn1_primitive_new(pval, it, embed))
+ goto memerr;
+ break;
+
+ case ASN1_ITYPE_MSTRING:
+ if (!asn1_primitive_new(pval, it, embed))
+ goto memerr;
+ break;
+
+ case ASN1_ITYPE_CHOICE:
+ if (asn1_cb) {
+ i = asn1_cb(ASN1_OP_NEW_PRE, pval, it, NULL);
+ if (!i)
+ goto auxerr;
+ if (i == 2) {
+#ifndef OPENSSL_NO_CRYPTO_MDEBUG
+ OPENSSL_mem_debug_pop();
+#endif
+ return 1;
+ }
+ }
+ if (embed) {
+ memset(*pval, 0, it->size);
+ } else {
+ *pval = OPENSSL_zalloc(it->size);
+ if (*pval == NULL)
+ goto memerr;
+ }
+ asn1_set_choice_selector(pval, -1, it);
+ if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, NULL))
+ goto auxerr2;
+ break;
+
+ case ASN1_ITYPE_NDEF_SEQUENCE:
+ case ASN1_ITYPE_SEQUENCE:
+ if (asn1_cb) {
+ i = asn1_cb(ASN1_OP_NEW_PRE, pval, it, NULL);
+ if (!i)
+ goto auxerr;
+ if (i == 2) {
+#ifndef OPENSSL_NO_CRYPTO_MDEBUG
+ OPENSSL_mem_debug_pop();
+#endif
+ return 1;
+ }
+ }
+ if (embed) {
+ memset(*pval, 0, it->size);
+ } else {
+ *pval = OPENSSL_zalloc(it->size);
+ if (*pval == NULL)
+ goto memerr;
+ }
+ /* 0 : init. lock */
+ if (asn1_do_lock(pval, 0, it) < 0) {
+ if (!embed) {
+ OPENSSL_free(*pval);
+ *pval = NULL;
+ }
+ goto memerr;
+ }
+ asn1_enc_init(pval, it);
+ for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) {
+ pseqval = asn1_get_field_ptr(pval, tt);
+ if (!asn1_template_new(pseqval, tt))
+ goto memerr2;
+ }
+ if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, NULL))
+ goto auxerr2;
+ break;
+ }
+#ifndef OPENSSL_NO_CRYPTO_MDEBUG
+ OPENSSL_mem_debug_pop();
+#endif
+ return 1;
+
+ memerr2:
+ asn1_item_embed_free(pval, it, embed);
+ memerr:
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_NEW, ERR_R_MALLOC_FAILURE);
+#ifndef OPENSSL_NO_CRYPTO_MDEBUG
+ OPENSSL_mem_debug_pop();
+#endif
+ return 0;
+
+ auxerr2:
+ asn1_item_embed_free(pval, it, embed);
+ auxerr:
+ ASN1err(ASN1_F_ASN1_ITEM_EMBED_NEW, ASN1_R_AUX_ERROR);
+#ifndef OPENSSL_NO_CRYPTO_MDEBUG
+ OPENSSL_mem_debug_pop();
+#endif
+ return 0;
+
+}
+
+static void asn1_item_clear(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ const ASN1_EXTERN_FUNCS *ef;
+
+ switch (it->itype) {
+
+ case ASN1_ITYPE_EXTERN:
+ ef = it->funcs;
+ if (ef && ef->asn1_ex_clear)
+ ef->asn1_ex_clear(pval, it);
+ else
+ *pval = NULL;
+ break;
+
+ case ASN1_ITYPE_PRIMITIVE:
+ if (it->templates)
+ asn1_template_clear(pval, it->templates);
+ else
+ asn1_primitive_clear(pval, it);
+ break;
+
+ case ASN1_ITYPE_MSTRING:
+ asn1_primitive_clear(pval, it);
+ break;
+
+ case ASN1_ITYPE_CHOICE:
+ case ASN1_ITYPE_SEQUENCE:
+ case ASN1_ITYPE_NDEF_SEQUENCE:
+ *pval = NULL;
+ break;
+ }
+}
+
+static int asn1_template_new(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt)
+{
+ const ASN1_ITEM *it = ASN1_ITEM_ptr(tt->item);
+ int embed = tt->flags & ASN1_TFLG_EMBED;
+ ASN1_VALUE *tval;
+ int ret;
+ if (embed) {
+ tval = (ASN1_VALUE *)pval;
+ pval = &tval;
+ }
+ if (tt->flags & ASN1_TFLG_OPTIONAL) {
+ asn1_template_clear(pval, tt);
+ return 1;
+ }
+ /* If ANY DEFINED BY nothing to do */
+
+ if (tt->flags & ASN1_TFLG_ADB_MASK) {
+ *pval = NULL;
+ return 1;
+ }
+#ifndef OPENSSL_NO_CRYPTO_MDEBUG
+ OPENSSL_mem_debug_push(tt->field_name
+ ? tt->field_name : "asn1_template_new");
+#endif
+ /* If SET OF or SEQUENCE OF, its a STACK */
+ if (tt->flags & ASN1_TFLG_SK_MASK) {
+ STACK_OF(ASN1_VALUE) *skval;
+ skval = sk_ASN1_VALUE_new_null();
+ if (!skval) {
+ ASN1err(ASN1_F_ASN1_TEMPLATE_NEW, ERR_R_MALLOC_FAILURE);
+ ret = 0;
+ goto done;
+ }
+ *pval = (ASN1_VALUE *)skval;
+ ret = 1;
+ goto done;
+ }
+ /* Otherwise pass it back to the item routine */
+ ret = asn1_item_embed_new(pval, it, embed);
+ done:
+#ifndef OPENSSL_NO_CRYPTO_MDEBUG
+ OPENSSL_mem_debug_pop();
+#endif
+ return ret;
+}
+
+static void asn1_template_clear(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt)
+{
+ /* If ADB or STACK just NULL the field */
+ if (tt->flags & (ASN1_TFLG_ADB_MASK | ASN1_TFLG_SK_MASK))
+ *pval = NULL;
+ else
+ asn1_item_clear(pval, ASN1_ITEM_ptr(tt->item));
+}
+
+/*
+ * NB: could probably combine most of the real XXX_new() behaviour and junk
+ * all the old functions.
+ */
+
+static int asn1_primitive_new(ASN1_VALUE **pval, const ASN1_ITEM *it,
+ int embed)
+{
+ ASN1_TYPE *typ;
+ ASN1_STRING *str;
+ int utype;
+
+ if (!it)
+ return 0;
+
+ if (it->funcs) {
+ const ASN1_PRIMITIVE_FUNCS *pf = it->funcs;
+ if (embed) {
+ if (pf->prim_clear) {
+ pf->prim_clear(pval, it);
+ return 1;
+ }
+ } else if (pf->prim_new) {
+ return pf->prim_new(pval, it);
+ }
+ }
+
+ if (it->itype == ASN1_ITYPE_MSTRING)
+ utype = -1;
+ else
+ utype = it->utype;
+ switch (utype) {
+ case V_ASN1_OBJECT:
+ *pval = (ASN1_VALUE *)OBJ_nid2obj(NID_undef);
+ return 1;
+
+ case V_ASN1_BOOLEAN:
+ *(ASN1_BOOLEAN *)pval = it->size;
+ return 1;
+
+ case V_ASN1_NULL:
+ *pval = (ASN1_VALUE *)1;
+ return 1;
+
+ case V_ASN1_ANY:
+ typ = OPENSSL_malloc(sizeof(*typ));
+ if (typ == NULL)
+ return 0;
+ typ->value.ptr = NULL;
+ typ->type = -1;
+ *pval = (ASN1_VALUE *)typ;
+ break;
+
+ default:
+ if (embed) {
+ str = *(ASN1_STRING **)pval;
+ memset(str, 0, sizeof(*str));
+ str->type = utype;
+ str->flags = ASN1_STRING_FLAG_EMBED;
+ } else {
+ str = ASN1_STRING_type_new(utype);
+ *pval = (ASN1_VALUE *)str;
+ }
+ if (it->itype == ASN1_ITYPE_MSTRING && str)
+ str->flags |= ASN1_STRING_FLAG_MSTRING;
+ break;
+ }
+ if (*pval)
+ return 1;
+ return 0;
+}
+
+static void asn1_primitive_clear(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ int utype;
+ if (it && it->funcs) {
+ const ASN1_PRIMITIVE_FUNCS *pf = it->funcs;
+ if (pf->prim_clear)
+ pf->prim_clear(pval, it);
+ else
+ *pval = NULL;
+ return;
+ }
+ if (!it || (it->itype == ASN1_ITYPE_MSTRING))
+ utype = -1;
+ else
+ utype = it->utype;
+ if (utype == V_ASN1_BOOLEAN)
+ *(ASN1_BOOLEAN *)pval = it->size;
+ else
+ *pval = NULL;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/tasn_prn.c b/openssl-1.1.0h/crypto/asn1/tasn_prn.c
new file mode 100644
index 0000000..53a9ee8
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/tasn_prn.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stddef.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+#include <openssl/objects.h>
+#include <openssl/buffer.h>
+#include <openssl/err.h>
+#include <openssl/x509v3.h>
+#include "internal/asn1_int.h"
+#include "asn1_locl.h"
+
+/*
+ * Print routines.
+ */
+
+/* ASN1_PCTX routines */
+
+static ASN1_PCTX default_pctx = {
+ ASN1_PCTX_FLAGS_SHOW_ABSENT, /* flags */
+ 0, /* nm_flags */
+ 0, /* cert_flags */
+ 0, /* oid_flags */
+ 0 /* str_flags */
+};
+
+ASN1_PCTX *ASN1_PCTX_new(void)
+{
+ ASN1_PCTX *ret;
+
+ ret = OPENSSL_zalloc(sizeof(*ret));
+ if (ret == NULL) {
+ ASN1err(ASN1_F_ASN1_PCTX_NEW, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+ return ret;
+}
+
+void ASN1_PCTX_free(ASN1_PCTX *p)
+{
+ OPENSSL_free(p);
+}
+
+unsigned long ASN1_PCTX_get_flags(const ASN1_PCTX *p)
+{
+ return p->flags;
+}
+
+void ASN1_PCTX_set_flags(ASN1_PCTX *p, unsigned long flags)
+{
+ p->flags = flags;
+}
+
+unsigned long ASN1_PCTX_get_nm_flags(const ASN1_PCTX *p)
+{
+ return p->nm_flags;
+}
+
+void ASN1_PCTX_set_nm_flags(ASN1_PCTX *p, unsigned long flags)
+{
+ p->nm_flags = flags;
+}
+
+unsigned long ASN1_PCTX_get_cert_flags(const ASN1_PCTX *p)
+{
+ return p->cert_flags;
+}
+
+void ASN1_PCTX_set_cert_flags(ASN1_PCTX *p, unsigned long flags)
+{
+ p->cert_flags = flags;
+}
+
+unsigned long ASN1_PCTX_get_oid_flags(const ASN1_PCTX *p)
+{
+ return p->oid_flags;
+}
+
+void ASN1_PCTX_set_oid_flags(ASN1_PCTX *p, unsigned long flags)
+{
+ p->oid_flags = flags;
+}
+
+unsigned long ASN1_PCTX_get_str_flags(const ASN1_PCTX *p)
+{
+ return p->str_flags;
+}
+
+void ASN1_PCTX_set_str_flags(ASN1_PCTX *p, unsigned long flags)
+{
+ p->str_flags = flags;
+}
+
+/* Main print routines */
+
+static int asn1_item_print_ctx(BIO *out, ASN1_VALUE **fld, int indent,
+ const ASN1_ITEM *it,
+ const char *fname, const char *sname,
+ int nohdr, const ASN1_PCTX *pctx);
+
+static int asn1_template_print_ctx(BIO *out, ASN1_VALUE **fld, int indent,
+ const ASN1_TEMPLATE *tt, const ASN1_PCTX *pctx);
+
+static int asn1_primitive_print(BIO *out, ASN1_VALUE **fld,
+ const ASN1_ITEM *it, int indent,
+ const char *fname, const char *sname,
+ const ASN1_PCTX *pctx);
+
+static int asn1_print_fsname(BIO *out, int indent,
+ const char *fname, const char *sname,
+ const ASN1_PCTX *pctx);
+
+int ASN1_item_print(BIO *out, ASN1_VALUE *ifld, int indent,
+ const ASN1_ITEM *it, const ASN1_PCTX *pctx)
+{
+ const char *sname;
+ if (pctx == NULL)
+ pctx = &default_pctx;
+ if (pctx->flags & ASN1_PCTX_FLAGS_NO_STRUCT_NAME)
+ sname = NULL;
+ else
+ sname = it->sname;
+ return asn1_item_print_ctx(out, &ifld, indent, it, NULL, sname, 0, pctx);
+}
+
+static int asn1_item_print_ctx(BIO *out, ASN1_VALUE **fld, int indent,
+ const ASN1_ITEM *it,
+ const char *fname, const char *sname,
+ int nohdr, const ASN1_PCTX *pctx)
+{
+ const ASN1_TEMPLATE *tt;
+ const ASN1_EXTERN_FUNCS *ef;
+ ASN1_VALUE **tmpfld;
+ const ASN1_AUX *aux = it->funcs;
+ ASN1_aux_cb *asn1_cb;
+ ASN1_PRINT_ARG parg;
+ int i;
+ if (aux && aux->asn1_cb) {
+ parg.out = out;
+ parg.indent = indent;
+ parg.pctx = pctx;
+ asn1_cb = aux->asn1_cb;
+ } else
+ asn1_cb = 0;
+
+ if (((it->itype != ASN1_ITYPE_PRIMITIVE)
+ || (it->utype != V_ASN1_BOOLEAN)) && *fld == NULL) {
+ if (pctx->flags & ASN1_PCTX_FLAGS_SHOW_ABSENT) {
+ if (!nohdr && !asn1_print_fsname(out, indent, fname, sname, pctx))
+ return 0;
+ if (BIO_puts(out, "<ABSENT>\n") <= 0)
+ return 0;
+ }
+ return 1;
+ }
+
+ switch (it->itype) {
+ case ASN1_ITYPE_PRIMITIVE:
+ if (it->templates) {
+ if (!asn1_template_print_ctx(out, fld, indent,
+ it->templates, pctx))
+ return 0;
+ break;
+ }
+ /* fall through */
+ case ASN1_ITYPE_MSTRING:
+ if (!asn1_primitive_print(out, fld, it, indent, fname, sname, pctx))
+ return 0;
+ break;
+
+ case ASN1_ITYPE_EXTERN:
+ if (!nohdr && !asn1_print_fsname(out, indent, fname, sname, pctx))
+ return 0;
+ /* Use new style print routine if possible */
+ ef = it->funcs;
+ if (ef && ef->asn1_ex_print) {
+ i = ef->asn1_ex_print(out, fld, indent, "", pctx);
+ if (!i)
+ return 0;
+ if ((i == 2) && (BIO_puts(out, "\n") <= 0))
+ return 0;
+ return 1;
+ } else if (sname &&
+ BIO_printf(out, ":EXTERNAL TYPE %s\n", sname) <= 0)
+ return 0;
+ break;
+
+ case ASN1_ITYPE_CHOICE:
+ /* CHOICE type, get selector */
+ i = asn1_get_choice_selector(fld, it);
+ /* This should never happen... */
+ if ((i < 0) || (i >= it->tcount)) {
+ if (BIO_printf(out, "ERROR: selector [%d] invalid\n", i) <= 0)
+ return 0;
+ return 1;
+ }
+ tt = it->templates + i;
+ tmpfld = asn1_get_field_ptr(fld, tt);
+ if (!asn1_template_print_ctx(out, tmpfld, indent, tt, pctx))
+ return 0;
+ break;
+
+ case ASN1_ITYPE_SEQUENCE:
+ case ASN1_ITYPE_NDEF_SEQUENCE:
+ if (!nohdr && !asn1_print_fsname(out, indent, fname, sname, pctx))
+ return 0;
+ if (fname || sname) {
+ if (pctx->flags & ASN1_PCTX_FLAGS_SHOW_SEQUENCE) {
+ if (BIO_puts(out, " {\n") <= 0)
+ return 0;
+ } else {
+ if (BIO_puts(out, "\n") <= 0)
+ return 0;
+ }
+ }
+
+ if (asn1_cb) {
+ i = asn1_cb(ASN1_OP_PRINT_PRE, fld, it, &parg);
+ if (i == 0)
+ return 0;
+ if (i == 2)
+ return 1;
+ }
+
+ /* Print each field entry */
+ for (i = 0, tt = it->templates; i < it->tcount; i++, tt++) {
+ const ASN1_TEMPLATE *seqtt;
+ seqtt = asn1_do_adb(fld, tt, 1);
+ if (!seqtt)
+ return 0;
+ tmpfld = asn1_get_field_ptr(fld, seqtt);
+ if (!asn1_template_print_ctx(out, tmpfld,
+ indent + 2, seqtt, pctx))
+ return 0;
+ }
+ if (pctx->flags & ASN1_PCTX_FLAGS_SHOW_SEQUENCE) {
+ if (BIO_printf(out, "%*s}\n", indent, "") < 0)
+ return 0;
+ }
+
+ if (asn1_cb) {
+ i = asn1_cb(ASN1_OP_PRINT_POST, fld, it, &parg);
+ if (i == 0)
+ return 0;
+ }
+ break;
+
+ default:
+ BIO_printf(out, "Unprocessed type %d\n", it->itype);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int asn1_template_print_ctx(BIO *out, ASN1_VALUE **fld, int indent,
+ const ASN1_TEMPLATE *tt, const ASN1_PCTX *pctx)
+{
+ int i, flags;
+ const char *sname, *fname;
+ ASN1_VALUE *tfld;
+ flags = tt->flags;
+ if (pctx->flags & ASN1_PCTX_FLAGS_SHOW_FIELD_STRUCT_NAME)
+ sname = ASN1_ITEM_ptr(tt->item)->sname;
+ else
+ sname = NULL;
+ if (pctx->flags & ASN1_PCTX_FLAGS_NO_FIELD_NAME)
+ fname = NULL;
+ else
+ fname = tt->field_name;
+
+ /*
+ * If field is embedded then fld needs fixing so it is a pointer to
+ * a pointer to a field.
+ */
+ if (flags & ASN1_TFLG_EMBED) {
+ tfld = (ASN1_VALUE *)fld;
+ fld = &tfld;
+ }
+
+ if (flags & ASN1_TFLG_SK_MASK) {
+ char *tname;
+ ASN1_VALUE *skitem;
+ STACK_OF(ASN1_VALUE) *stack;
+
+ /* SET OF, SEQUENCE OF */
+ if (fname) {
+ if (pctx->flags & ASN1_PCTX_FLAGS_SHOW_SSOF) {
+ if (flags & ASN1_TFLG_SET_OF)
+ tname = "SET";
+ else
+ tname = "SEQUENCE";
+ if (BIO_printf(out, "%*s%s OF %s {\n",
+ indent, "", tname, tt->field_name) <= 0)
+ return 0;
+ } else if (BIO_printf(out, "%*s%s:\n", indent, "", fname) <= 0)
+ return 0;
+ }
+ stack = (STACK_OF(ASN1_VALUE) *)*fld;
+ for (i = 0; i < sk_ASN1_VALUE_num(stack); i++) {
+ if ((i > 0) && (BIO_puts(out, "\n") <= 0))
+ return 0;
+
+ skitem = sk_ASN1_VALUE_value(stack, i);
+ if (!asn1_item_print_ctx(out, &skitem, indent + 2,
+ ASN1_ITEM_ptr(tt->item), NULL, NULL, 1,
+ pctx))
+ return 0;
+ }
+ if (!i && BIO_printf(out, "%*s<EMPTY>\n", indent + 2, "") <= 0)
+ return 0;
+ if (pctx->flags & ASN1_PCTX_FLAGS_SHOW_SEQUENCE) {
+ if (BIO_printf(out, "%*s}\n", indent, "") <= 0)
+ return 0;
+ }
+ return 1;
+ }
+ return asn1_item_print_ctx(out, fld, indent, ASN1_ITEM_ptr(tt->item),
+ fname, sname, 0, pctx);
+}
+
+static int asn1_print_fsname(BIO *out, int indent,
+ const char *fname, const char *sname,
+ const ASN1_PCTX *pctx)
+{
+ static const char spaces[] = " ";
+ static const int nspaces = sizeof(spaces) - 1;
+
+ while (indent > nspaces) {
+ if (BIO_write(out, spaces, nspaces) != nspaces)
+ return 0;
+ indent -= nspaces;
+ }
+ if (BIO_write(out, spaces, indent) != indent)
+ return 0;
+ if (pctx->flags & ASN1_PCTX_FLAGS_NO_STRUCT_NAME)
+ sname = NULL;
+ if (pctx->flags & ASN1_PCTX_FLAGS_NO_FIELD_NAME)
+ fname = NULL;
+ if (!sname && !fname)
+ return 1;
+ if (fname) {
+ if (BIO_puts(out, fname) <= 0)
+ return 0;
+ }
+ if (sname) {
+ if (fname) {
+ if (BIO_printf(out, " (%s)", sname) <= 0)
+ return 0;
+ } else {
+ if (BIO_puts(out, sname) <= 0)
+ return 0;
+ }
+ }
+ if (BIO_write(out, ": ", 2) != 2)
+ return 0;
+ return 1;
+}
+
+static int asn1_print_boolean(BIO *out, int boolval)
+{
+ const char *str;
+ switch (boolval) {
+ case -1:
+ str = "BOOL ABSENT";
+ break;
+
+ case 0:
+ str = "FALSE";
+ break;
+
+ default:
+ str = "TRUE";
+ break;
+
+ }
+
+ if (BIO_puts(out, str) <= 0)
+ return 0;
+ return 1;
+
+}
+
+static int asn1_print_integer(BIO *out, const ASN1_INTEGER *str)
+{
+ char *s;
+ int ret = 1;
+ s = i2s_ASN1_INTEGER(NULL, str);
+ if (s == NULL)
+ return 0;
+ if (BIO_puts(out, s) <= 0)
+ ret = 0;
+ OPENSSL_free(s);
+ return ret;
+}
+
+static int asn1_print_oid(BIO *out, const ASN1_OBJECT *oid)
+{
+ char objbuf[80];
+ const char *ln;
+ ln = OBJ_nid2ln(OBJ_obj2nid(oid));
+ if (!ln)
+ ln = "";
+ OBJ_obj2txt(objbuf, sizeof(objbuf), oid, 1);
+ if (BIO_printf(out, "%s (%s)", ln, objbuf) <= 0)
+ return 0;
+ return 1;
+}
+
+static int asn1_print_obstring(BIO *out, const ASN1_STRING *str, int indent)
+{
+ if (str->type == V_ASN1_BIT_STRING) {
+ if (BIO_printf(out, " (%ld unused bits)\n", str->flags & 0x7) <= 0)
+ return 0;
+ } else if (BIO_puts(out, "\n") <= 0)
+ return 0;
+ if ((str->length > 0)
+ && BIO_dump_indent(out, (const char *)str->data, str->length,
+ indent + 2) <= 0)
+ return 0;
+ return 1;
+}
+
+static int asn1_primitive_print(BIO *out, ASN1_VALUE **fld,
+ const ASN1_ITEM *it, int indent,
+ const char *fname, const char *sname,
+ const ASN1_PCTX *pctx)
+{
+ long utype;
+ ASN1_STRING *str;
+ int ret = 1, needlf = 1;
+ const char *pname;
+ const ASN1_PRIMITIVE_FUNCS *pf;
+ pf = it->funcs;
+ if (!asn1_print_fsname(out, indent, fname, sname, pctx))
+ return 0;
+ if (pf && pf->prim_print)
+ return pf->prim_print(out, fld, it, indent, pctx);
+ if (it->itype == ASN1_ITYPE_MSTRING) {
+ str = (ASN1_STRING *)*fld;
+ utype = str->type & ~V_ASN1_NEG;
+ } else {
+ utype = it->utype;
+ if (utype == V_ASN1_BOOLEAN)
+ str = NULL;
+ else
+ str = (ASN1_STRING *)*fld;
+ }
+ if (utype == V_ASN1_ANY) {
+ ASN1_TYPE *atype = (ASN1_TYPE *)*fld;
+ utype = atype->type;
+ fld = &atype->value.asn1_value;
+ str = (ASN1_STRING *)*fld;
+ if (pctx->flags & ASN1_PCTX_FLAGS_NO_ANY_TYPE)
+ pname = NULL;
+ else
+ pname = ASN1_tag2str(utype);
+ } else {
+ if (pctx->flags & ASN1_PCTX_FLAGS_SHOW_TYPE)
+ pname = ASN1_tag2str(utype);
+ else
+ pname = NULL;
+ }
+
+ if (utype == V_ASN1_NULL) {
+ if (BIO_puts(out, "NULL\n") <= 0)
+ return 0;
+ return 1;
+ }
+
+ if (pname) {
+ if (BIO_puts(out, pname) <= 0)
+ return 0;
+ if (BIO_puts(out, ":") <= 0)
+ return 0;
+ }
+
+ switch (utype) {
+ case V_ASN1_BOOLEAN:
+ {
+ int boolval = *(int *)fld;
+ if (boolval == -1)
+ boolval = it->size;
+ ret = asn1_print_boolean(out, boolval);
+ }
+ break;
+
+ case V_ASN1_INTEGER:
+ case V_ASN1_ENUMERATED:
+ ret = asn1_print_integer(out, str);
+ break;
+
+ case V_ASN1_UTCTIME:
+ ret = ASN1_UTCTIME_print(out, str);
+ break;
+
+ case V_ASN1_GENERALIZEDTIME:
+ ret = ASN1_GENERALIZEDTIME_print(out, str);
+ break;
+
+ case V_ASN1_OBJECT:
+ ret = asn1_print_oid(out, (const ASN1_OBJECT *)*fld);
+ break;
+
+ case V_ASN1_OCTET_STRING:
+ case V_ASN1_BIT_STRING:
+ ret = asn1_print_obstring(out, str, indent);
+ needlf = 0;
+ break;
+
+ case V_ASN1_SEQUENCE:
+ case V_ASN1_SET:
+ case V_ASN1_OTHER:
+ if (BIO_puts(out, "\n") <= 0)
+ return 0;
+ if (ASN1_parse_dump(out, str->data, str->length, indent, 0) <= 0)
+ ret = 0;
+ needlf = 0;
+ break;
+
+ default:
+ ret = ASN1_STRING_print_ex(out, str, pctx->str_flags);
+
+ }
+ if (!ret)
+ return 0;
+ if (needlf && BIO_puts(out, "\n") <= 0)
+ return 0;
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/tasn_scn.c b/openssl-1.1.0h/crypto/asn1/tasn_scn.c
new file mode 100644
index 0000000..e1df2cf
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/tasn_scn.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stddef.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+#include <openssl/objects.h>
+#include <openssl/buffer.h>
+#include <openssl/err.h>
+#include <openssl/x509v3.h>
+#include "asn1_locl.h"
+
+/*
+ * General ASN1 structure recursive scanner: iterate through all fields
+ * passing details to a callback.
+ */
+
+ASN1_SCTX *ASN1_SCTX_new(int (*scan_cb) (ASN1_SCTX *ctx))
+{
+ ASN1_SCTX *ret = OPENSSL_zalloc(sizeof(*ret));
+
+ if (ret == NULL) {
+ ASN1err(ASN1_F_ASN1_SCTX_NEW, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+ ret->scan_cb = scan_cb;
+ return ret;
+}
+
+void ASN1_SCTX_free(ASN1_SCTX *p)
+{
+ OPENSSL_free(p);
+}
+
+const ASN1_ITEM *ASN1_SCTX_get_item(ASN1_SCTX *p)
+{
+ return p->it;
+}
+
+const ASN1_TEMPLATE *ASN1_SCTX_get_template(ASN1_SCTX *p)
+{
+ return p->tt;
+}
+
+unsigned long ASN1_SCTX_get_flags(ASN1_SCTX *p)
+{
+ return p->flags;
+}
+
+void ASN1_SCTX_set_app_data(ASN1_SCTX *p, void *data)
+{
+ p->app_data = data;
+}
+
+void *ASN1_SCTX_get_app_data(ASN1_SCTX *p)
+{
+ return p->app_data;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/tasn_typ.c b/openssl-1.1.0h/crypto/asn1/tasn_typ.c
new file mode 100644
index 0000000..98d9879
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/tasn_typ.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+
+/* Declarations for string types */
+
+#define IMPLEMENT_ASN1_STRING_FUNCTIONS(sname) \
+ IMPLEMENT_ASN1_TYPE(sname) \
+ IMPLEMENT_ASN1_ENCODE_FUNCTIONS_fname(sname, sname, sname) \
+sname *sname##_new(void) \
+{ \
+ return ASN1_STRING_type_new(V_##sname); \
+} \
+void sname##_free(sname *x) \
+{ \
+ ASN1_STRING_free(x); \
+}
+
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_OCTET_STRING)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_INTEGER)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_ENUMERATED)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_BIT_STRING)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_UTF8STRING)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_PRINTABLESTRING)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_T61STRING)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_IA5STRING)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_GENERALSTRING)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_UTCTIME)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_GENERALIZEDTIME)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_VISIBLESTRING)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_UNIVERSALSTRING)
+IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_BMPSTRING)
+
+IMPLEMENT_ASN1_TYPE(ASN1_NULL)
+IMPLEMENT_ASN1_FUNCTIONS(ASN1_NULL)
+
+IMPLEMENT_ASN1_TYPE(ASN1_OBJECT)
+
+IMPLEMENT_ASN1_TYPE(ASN1_ANY)
+
+/* Just swallow an ASN1_SEQUENCE in an ASN1_STRING */
+IMPLEMENT_ASN1_TYPE(ASN1_SEQUENCE)
+
+IMPLEMENT_ASN1_FUNCTIONS_fname(ASN1_TYPE, ASN1_ANY, ASN1_TYPE)
+
+/* Multistring types */
+
+IMPLEMENT_ASN1_MSTRING(ASN1_PRINTABLE, B_ASN1_PRINTABLE)
+IMPLEMENT_ASN1_FUNCTIONS_name(ASN1_STRING, ASN1_PRINTABLE)
+
+IMPLEMENT_ASN1_MSTRING(DISPLAYTEXT, B_ASN1_DISPLAYTEXT)
+IMPLEMENT_ASN1_FUNCTIONS_name(ASN1_STRING, DISPLAYTEXT)
+
+IMPLEMENT_ASN1_MSTRING(DIRECTORYSTRING, B_ASN1_DIRECTORYSTRING)
+IMPLEMENT_ASN1_FUNCTIONS_name(ASN1_STRING, DIRECTORYSTRING)
+
+/* Three separate BOOLEAN type: normal, DEFAULT TRUE and DEFAULT FALSE */
+IMPLEMENT_ASN1_TYPE_ex(ASN1_BOOLEAN, ASN1_BOOLEAN, -1)
+IMPLEMENT_ASN1_TYPE_ex(ASN1_TBOOLEAN, ASN1_BOOLEAN, 1)
+IMPLEMENT_ASN1_TYPE_ex(ASN1_FBOOLEAN, ASN1_BOOLEAN, 0)
+
+/* Special, OCTET STRING with indefinite length constructed support */
+
+IMPLEMENT_ASN1_TYPE_ex(ASN1_OCTET_STRING_NDEF, ASN1_OCTET_STRING, ASN1_TFLG_NDEF)
+
+ASN1_ITEM_TEMPLATE(ASN1_SEQUENCE_ANY) =
+ ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SEQUENCE_OF, 0, ASN1_SEQUENCE_ANY, ASN1_ANY)
+ASN1_ITEM_TEMPLATE_END(ASN1_SEQUENCE_ANY)
+
+ASN1_ITEM_TEMPLATE(ASN1_SET_ANY) =
+ ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SET_OF, 0, ASN1_SET_ANY, ASN1_ANY)
+ASN1_ITEM_TEMPLATE_END(ASN1_SET_ANY)
+
+IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(ASN1_SEQUENCE_ANY, ASN1_SEQUENCE_ANY, ASN1_SEQUENCE_ANY)
+IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(ASN1_SEQUENCE_ANY, ASN1_SET_ANY, ASN1_SET_ANY)
diff --git a/openssl-1.1.0h/crypto/asn1/tasn_utl.c b/openssl-1.1.0h/crypto/asn1/tasn_utl.c
new file mode 100644
index 0000000..f79d7d6
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/tasn_utl.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <internal/cryptlib.h>
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+#include <openssl/objects.h>
+#include <openssl/err.h>
+#include "asn1_locl.h"
+
+/* Utility functions for manipulating fields and offsets */
+
+/* Add 'offset' to 'addr' */
+#define offset2ptr(addr, offset) (void *)(((char *) addr) + offset)
+
+/*
+ * Given an ASN1_ITEM CHOICE type return the selector value
+ */
+
+int asn1_get_choice_selector(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ int *sel = offset2ptr(*pval, it->utype);
+ return *sel;
+}
+
+/*
+ * Given an ASN1_ITEM CHOICE type set the selector value, return old value.
+ */
+
+int asn1_set_choice_selector(ASN1_VALUE **pval, int value,
+ const ASN1_ITEM *it)
+{
+ int *sel, ret;
+ sel = offset2ptr(*pval, it->utype);
+ ret = *sel;
+ *sel = value;
+ return ret;
+}
+
+/*
+ * Do atomic reference counting. The value 'op' decides what to do.
+ * If it is +1 then the count is incremented.
+ * If |op| is 0, lock is initialised and count is set to 1.
+ * If |op| is -1, count is decremented and the return value is the current
+ * reference count or 0 if no reference count is active.
+ * It returns -1 on initialisation error.
+ * Used by ASN1_SEQUENCE construct of X509, X509_REQ, X509_CRL objects
+ */
+int asn1_do_lock(ASN1_VALUE **pval, int op, const ASN1_ITEM *it)
+{
+ const ASN1_AUX *aux;
+ int *lck, ret;
+ CRYPTO_RWLOCK **lock;
+ if ((it->itype != ASN1_ITYPE_SEQUENCE)
+ && (it->itype != ASN1_ITYPE_NDEF_SEQUENCE))
+ return 0;
+ aux = it->funcs;
+ if (!aux || !(aux->flags & ASN1_AFLG_REFCOUNT))
+ return 0;
+ lck = offset2ptr(*pval, aux->ref_offset);
+ lock = offset2ptr(*pval, aux->ref_lock);
+ if (op == 0) {
+ *lck = 1;
+ *lock = CRYPTO_THREAD_lock_new();
+ if (*lock == NULL) {
+ ASN1err(ASN1_F_ASN1_DO_LOCK, ERR_R_MALLOC_FAILURE);
+ return -1;
+ }
+ return 1;
+ }
+ if (CRYPTO_atomic_add(lck, op, &ret, *lock) < 0)
+ return -1; /* failed */
+#ifdef REF_PRINT
+ fprintf(stderr, "%p:%4d:%s\n", it, *lck, it->sname);
+#endif
+ REF_ASSERT_ISNT(ret < 0);
+ if (ret == 0) {
+ CRYPTO_THREAD_lock_free(*lock);
+ *lock = NULL;
+ }
+ return ret;
+}
+
+static ASN1_ENCODING *asn1_get_enc_ptr(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ const ASN1_AUX *aux;
+ if (!pval || !*pval)
+ return NULL;
+ aux = it->funcs;
+ if (!aux || !(aux->flags & ASN1_AFLG_ENCODING))
+ return NULL;
+ return offset2ptr(*pval, aux->enc_offset);
+}
+
+void asn1_enc_init(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ ASN1_ENCODING *enc;
+ enc = asn1_get_enc_ptr(pval, it);
+ if (enc) {
+ enc->enc = NULL;
+ enc->len = 0;
+ enc->modified = 1;
+ }
+}
+
+void asn1_enc_free(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ ASN1_ENCODING *enc;
+ enc = asn1_get_enc_ptr(pval, it);
+ if (enc) {
+ OPENSSL_free(enc->enc);
+ enc->enc = NULL;
+ enc->len = 0;
+ enc->modified = 1;
+ }
+}
+
+int asn1_enc_save(ASN1_VALUE **pval, const unsigned char *in, int inlen,
+ const ASN1_ITEM *it)
+{
+ ASN1_ENCODING *enc;
+ enc = asn1_get_enc_ptr(pval, it);
+ if (!enc)
+ return 1;
+
+ OPENSSL_free(enc->enc);
+ enc->enc = OPENSSL_malloc(inlen);
+ if (enc->enc == NULL)
+ return 0;
+ memcpy(enc->enc, in, inlen);
+ enc->len = inlen;
+ enc->modified = 0;
+
+ return 1;
+}
+
+int asn1_enc_restore(int *len, unsigned char **out, ASN1_VALUE **pval,
+ const ASN1_ITEM *it)
+{
+ ASN1_ENCODING *enc;
+ enc = asn1_get_enc_ptr(pval, it);
+ if (!enc || enc->modified)
+ return 0;
+ if (out) {
+ memcpy(*out, enc->enc, enc->len);
+ *out += enc->len;
+ }
+ if (len)
+ *len = enc->len;
+ return 1;
+}
+
+/* Given an ASN1_TEMPLATE get a pointer to a field */
+ASN1_VALUE **asn1_get_field_ptr(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt)
+{
+ ASN1_VALUE **pvaltmp;
+ pvaltmp = offset2ptr(*pval, tt->offset);
+ /*
+ * NOTE for BOOLEAN types the field is just a plain int so we can't
+ * return int **, so settle for (int *).
+ */
+ return pvaltmp;
+}
+
+/*
+ * Handle ANY DEFINED BY template, find the selector, look up the relevant
+ * ASN1_TEMPLATE in the table and return it.
+ */
+
+const ASN1_TEMPLATE *asn1_do_adb(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt,
+ int nullerr)
+{
+ const ASN1_ADB *adb;
+ const ASN1_ADB_TABLE *atbl;
+ long selector;
+ ASN1_VALUE **sfld;
+ int i;
+ if (!(tt->flags & ASN1_TFLG_ADB_MASK))
+ return tt;
+
+ /* Else ANY DEFINED BY ... get the table */
+ adb = ASN1_ADB_ptr(tt->item);
+
+ /* Get the selector field */
+ sfld = offset2ptr(*pval, adb->offset);
+
+ /* Check if NULL */
+ if (*sfld == NULL) {
+ if (!adb->null_tt)
+ goto err;
+ return adb->null_tt;
+ }
+
+ /*
+ * Convert type to a long: NB: don't check for NID_undef here because it
+ * might be a legitimate value in the table
+ */
+ if (tt->flags & ASN1_TFLG_ADB_OID)
+ selector = OBJ_obj2nid((ASN1_OBJECT *)*sfld);
+ else
+ selector = ASN1_INTEGER_get((ASN1_INTEGER *)*sfld);
+
+ /* Let application callback translate value */
+ if (adb->adb_cb != NULL && adb->adb_cb(&selector) == 0) {
+ ASN1err(ASN1_F_ASN1_DO_ADB, ASN1_R_UNSUPPORTED_ANY_DEFINED_BY_TYPE);
+ return NULL;
+ }
+
+ /*
+ * Try to find matching entry in table Maybe should check application
+ * types first to allow application override? Might also be useful to
+ * have a flag which indicates table is sorted and we can do a binary
+ * search. For now stick to a linear search.
+ */
+
+ for (atbl = adb->tbl, i = 0; i < adb->tblcount; i++, atbl++)
+ if (atbl->value == selector)
+ return &atbl->tt;
+
+ /* FIXME: need to search application table too */
+
+ /* No match, return default type */
+ if (!adb->default_tt)
+ goto err;
+ return adb->default_tt;
+
+ err:
+ /* FIXME: should log the value or OID of unsupported type */
+ if (nullerr)
+ ASN1err(ASN1_F_ASN1_DO_ADB, ASN1_R_UNSUPPORTED_ANY_DEFINED_BY_TYPE);
+ return NULL;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/x_algor.c b/openssl-1.1.0h/crypto/asn1/x_algor.c
new file mode 100644
index 0000000..72378db
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/x_algor.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stddef.h>
+#include <openssl/x509.h>
+#include <openssl/asn1.h>
+#include <openssl/asn1t.h>
+#include "internal/evp_int.h"
+
+ASN1_SEQUENCE(X509_ALGOR) = {
+ ASN1_SIMPLE(X509_ALGOR, algorithm, ASN1_OBJECT),
+ ASN1_OPT(X509_ALGOR, parameter, ASN1_ANY)
+} ASN1_SEQUENCE_END(X509_ALGOR)
+
+ASN1_ITEM_TEMPLATE(X509_ALGORS) =
+ ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SEQUENCE_OF, 0, algorithms, X509_ALGOR)
+ASN1_ITEM_TEMPLATE_END(X509_ALGORS)
+
+IMPLEMENT_ASN1_FUNCTIONS(X509_ALGOR)
+IMPLEMENT_ASN1_ENCODE_FUNCTIONS_fname(X509_ALGORS, X509_ALGORS, X509_ALGORS)
+IMPLEMENT_ASN1_DUP_FUNCTION(X509_ALGOR)
+
+int X509_ALGOR_set0(X509_ALGOR *alg, ASN1_OBJECT *aobj, int ptype, void *pval)
+{
+ if (!alg)
+ return 0;
+ if (ptype != V_ASN1_UNDEF) {
+ if (alg->parameter == NULL)
+ alg->parameter = ASN1_TYPE_new();
+ if (alg->parameter == NULL)
+ return 0;
+ }
+ if (alg) {
+ ASN1_OBJECT_free(alg->algorithm);
+ alg->algorithm = aobj;
+ }
+ if (ptype == 0)
+ return 1;
+ if (ptype == V_ASN1_UNDEF) {
+ ASN1_TYPE_free(alg->parameter);
+ alg->parameter = NULL;
+ } else
+ ASN1_TYPE_set(alg->parameter, ptype, pval);
+ return 1;
+}
+
+void X509_ALGOR_get0(const ASN1_OBJECT **paobj, int *pptype,
+ const void **ppval, const X509_ALGOR *algor)
+{
+ if (paobj)
+ *paobj = algor->algorithm;
+ if (pptype) {
+ if (algor->parameter == NULL) {
+ *pptype = V_ASN1_UNDEF;
+ return;
+ } else
+ *pptype = algor->parameter->type;
+ if (ppval)
+ *ppval = algor->parameter->value.ptr;
+ }
+}
+
+/* Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD */
+
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md)
+{
+ int param_type;
+
+ if (md->flags & EVP_MD_FLAG_DIGALGID_ABSENT)
+ param_type = V_ASN1_UNDEF;
+ else
+ param_type = V_ASN1_NULL;
+
+ X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL);
+
+}
+
+int X509_ALGOR_cmp(const X509_ALGOR *a, const X509_ALGOR *b)
+{
+ int rv;
+ rv = OBJ_cmp(a->algorithm, b->algorithm);
+ if (rv)
+ return rv;
+ if (!a->parameter && !b->parameter)
+ return 0;
+ return ASN1_TYPE_cmp(a->parameter, b->parameter);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/x_bignum.c b/openssl-1.1.0h/crypto/asn1/x_bignum.c
new file mode 100644
index 0000000..da57e77
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/x_bignum.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/bn.h>
+
+/*
+ * Custom primitive type for BIGNUM handling. This reads in an ASN1_INTEGER
+ * as a BIGNUM directly. Currently it ignores the sign which isn't a problem
+ * since all BIGNUMs used are non negative and anything that looks negative
+ * is normally due to an encoding error.
+ */
+
+#define BN_SENSITIVE 1
+
+static int bn_new(ASN1_VALUE **pval, const ASN1_ITEM *it);
+static int bn_secure_new(ASN1_VALUE **pval, const ASN1_ITEM *it);
+static void bn_free(ASN1_VALUE **pval, const ASN1_ITEM *it);
+
+static int bn_i2c(ASN1_VALUE **pval, unsigned char *cont, int *putype,
+ const ASN1_ITEM *it);
+static int bn_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
+ int utype, char *free_cont, const ASN1_ITEM *it);
+static int bn_secure_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
+ int utype, char *free_cont, const ASN1_ITEM *it);
+static int bn_print(BIO *out, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ int indent, const ASN1_PCTX *pctx);
+
+static ASN1_PRIMITIVE_FUNCS bignum_pf = {
+ NULL, 0,
+ bn_new,
+ bn_free,
+ 0,
+ bn_c2i,
+ bn_i2c,
+ bn_print
+};
+
+static ASN1_PRIMITIVE_FUNCS cbignum_pf = {
+ NULL, 0,
+ bn_secure_new,
+ bn_free,
+ 0,
+ bn_secure_c2i,
+ bn_i2c,
+ bn_print
+};
+
+ASN1_ITEM_start(BIGNUM)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &bignum_pf, 0, "BIGNUM"
+ASN1_ITEM_end(BIGNUM)
+
+ASN1_ITEM_start(CBIGNUM)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &cbignum_pf, BN_SENSITIVE, "CBIGNUM"
+ASN1_ITEM_end(CBIGNUM)
+
+static int bn_new(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ *pval = (ASN1_VALUE *)BN_new();
+ if (*pval != NULL)
+ return 1;
+ else
+ return 0;
+}
+
+static int bn_secure_new(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ *pval = (ASN1_VALUE *)BN_secure_new();
+ if (*pval != NULL)
+ return 1;
+ else
+ return 0;
+}
+
+static void bn_free(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ if (!*pval)
+ return;
+ if (it->size & BN_SENSITIVE)
+ BN_clear_free((BIGNUM *)*pval);
+ else
+ BN_free((BIGNUM *)*pval);
+ *pval = NULL;
+}
+
+static int bn_i2c(ASN1_VALUE **pval, unsigned char *cont, int *putype,
+ const ASN1_ITEM *it)
+{
+ BIGNUM *bn;
+ int pad;
+ if (!*pval)
+ return -1;
+ bn = (BIGNUM *)*pval;
+ /* If MSB set in an octet we need a padding byte */
+ if (BN_num_bits(bn) & 0x7)
+ pad = 0;
+ else
+ pad = 1;
+ if (cont) {
+ if (pad)
+ *cont++ = 0;
+ BN_bn2bin(bn, cont);
+ }
+ return pad + BN_num_bytes(bn);
+}
+
+static int bn_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
+ int utype, char *free_cont, const ASN1_ITEM *it)
+{
+ BIGNUM *bn;
+
+ if (*pval == NULL && !bn_new(pval, it))
+ return 0;
+ bn = (BIGNUM *)*pval;
+ if (!BN_bin2bn(cont, len, bn)) {
+ bn_free(pval, it);
+ return 0;
+ }
+ return 1;
+}
+
+static int bn_secure_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
+ int utype, char *free_cont, const ASN1_ITEM *it)
+{
+ if (!*pval)
+ bn_secure_new(pval, it);
+ return bn_c2i(pval, cont, len, utype, free_cont, it);
+}
+
+static int bn_print(BIO *out, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ int indent, const ASN1_PCTX *pctx)
+{
+ if (!BN_print(out, *(BIGNUM **)pval))
+ return 0;
+ if (BIO_puts(out, "\n") <= 0)
+ return 0;
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/x_info.c b/openssl-1.1.0h/crypto/asn1/x_info.c
new file mode 100644
index 0000000..8d99f07
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/x_info.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/evp.h>
+#include <openssl/asn1.h>
+#include <openssl/x509.h>
+
+X509_INFO *X509_INFO_new(void)
+{
+ X509_INFO *ret;
+
+ ret = OPENSSL_zalloc(sizeof(*ret));
+ if (ret == NULL) {
+ ASN1err(ASN1_F_X509_INFO_NEW, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ return ret;
+}
+
+void X509_INFO_free(X509_INFO *x)
+{
+ if (x == NULL)
+ return;
+
+ X509_free(x->x509);
+ X509_CRL_free(x->crl);
+ X509_PKEY_free(x->x_pkey);
+ OPENSSL_free(x->enc_data);
+ OPENSSL_free(x);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/x_int64.c b/openssl-1.1.0h/crypto/asn1/x_int64.c
new file mode 100644
index 0000000..4433167
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/x_int64.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include "internal/asn1t.h"
+#include "internal/numbers.h"
+#include <openssl/bn.h>
+#include "asn1_locl.h"
+
+/*
+ * Custom primitive types for handling int32_t, int64_t, uint32_t, uint64_t.
+ * This converts between an ASN1_INTEGER and those types directly.
+ * This is preferred to using the LONG / ZLONG primitives.
+ */
+
+/*
+ * We abuse the ASN1_ITEM fields |size| as a flags field
+ */
+#define INTxx_FLAG_ZERO_DEFAULT (1<<0)
+#define INTxx_FLAG_SIGNED (1<<1)
+
+static int uint64_new(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ *pval = (ASN1_VALUE *)OPENSSL_zalloc(sizeof(uint64_t));
+ if (*pval == NULL)
+ return 0;
+ return 1;
+}
+
+static void uint64_free(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ OPENSSL_free(*pval);
+ *pval = NULL;
+}
+
+static void uint64_clear(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ **(uint64_t **)pval = 0;
+}
+
+static int uint64_i2c(ASN1_VALUE **pval, unsigned char *cont, int *putype,
+ const ASN1_ITEM *it)
+{
+ uint64_t utmp;
+ int neg = 0;
+ /* this exists to bypass broken gcc optimization */
+ char *cp = (char *)*pval;
+
+ /* use memcpy, because we may not be uint64_t aligned */
+ memcpy(&utmp, cp, sizeof(utmp));
+
+ if ((it->size & INTxx_FLAG_ZERO_DEFAULT) == INTxx_FLAG_ZERO_DEFAULT
+ && utmp == 0)
+ return -1;
+ if ((it->size & INTxx_FLAG_SIGNED) == INTxx_FLAG_SIGNED
+ && (int64_t)utmp < 0) {
+ /* i2c_uint64_int() assumes positive values */
+ utmp = 0 - utmp;
+ neg = 1;
+ }
+
+ return i2c_uint64_int(cont, utmp, neg);
+}
+
+static int uint64_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
+ int utype, char *free_cont, const ASN1_ITEM *it)
+{
+ uint64_t utmp = 0;
+ char *cp;
+ int neg = 0;
+
+ if (*pval == NULL && !uint64_new(pval, it))
+ return 0;
+
+ cp = (char *)*pval;
+ if (!c2i_uint64_int(&utmp, &neg, &cont, len))
+ return 0;
+ if ((it->size & INTxx_FLAG_SIGNED) == 0 && neg) {
+ ASN1err(ASN1_F_UINT64_C2I, ASN1_R_ILLEGAL_NEGATIVE_VALUE);
+ return 0;
+ }
+ if ((it->size & INTxx_FLAG_SIGNED) == INTxx_FLAG_SIGNED
+ && !neg && utmp > INT64_MAX) {
+ ASN1err(ASN1_F_UINT64_C2I, ASN1_R_TOO_LARGE);
+ return 0;
+ }
+ if (neg)
+ /* c2i_uint64_int() returns positive values */
+ utmp = 0 - utmp;
+ memcpy(cp, &utmp, sizeof(utmp));
+ return 1;
+}
+
+static int uint64_print(BIO *out, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ int indent, const ASN1_PCTX *pctx)
+{
+ if ((it->size & INTxx_FLAG_SIGNED) == INTxx_FLAG_SIGNED)
+ return BIO_printf(out, "%"BIO_PRI64"d\n", **(int64_t **)pval);
+ return BIO_printf(out, "%"BIO_PRI64"u\n", **(uint64_t **)pval);
+}
+
+/* 32-bit variants */
+
+static int uint32_new(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ *pval = (ASN1_VALUE *)OPENSSL_zalloc(sizeof(uint32_t));
+ if (*pval == NULL)
+ return 0;
+ return 1;
+}
+
+static void uint32_free(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ OPENSSL_free(*pval);
+ *pval = NULL;
+}
+
+static void uint32_clear(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ **(uint32_t **)pval = 0;
+}
+
+static int uint32_i2c(ASN1_VALUE **pval, unsigned char *cont, int *putype,
+ const ASN1_ITEM *it)
+{
+ uint32_t utmp;
+ int neg = 0;
+ /* this exists to bypass broken gcc optimization */
+ char *cp = (char *)*pval;
+
+ /* use memcpy, because we may not be uint32_t aligned */
+ memcpy(&utmp, cp, sizeof(utmp));
+
+ if ((it->size & INTxx_FLAG_ZERO_DEFAULT) == INTxx_FLAG_ZERO_DEFAULT
+ && utmp == 0)
+ return -1;
+ if ((it->size & INTxx_FLAG_SIGNED) == INTxx_FLAG_SIGNED
+ && (int32_t)utmp < 0) {
+ /* i2c_uint64_int() assumes positive values */
+ utmp = 0 - utmp;
+ neg = 1;
+ }
+
+ return i2c_uint64_int(cont, (uint64_t)utmp, neg);
+}
+
+/*
+ * Absolute value of INT32_MIN: we can't just use -INT32_MIN as it produces
+ * overflow warnings.
+ */
+
+#define ABS_INT32_MIN ((uint32_t)INT32_MAX + 1)
+
+static int uint32_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
+ int utype, char *free_cont, const ASN1_ITEM *it)
+{
+ uint64_t utmp = 0;
+ uint32_t utmp2 = 0;
+ char *cp;
+ int neg = 0;
+
+ if (*pval == NULL && !uint64_new(pval, it))
+ return 0;
+
+ cp = (char *)*pval;
+ if (!c2i_uint64_int(&utmp, &neg, &cont, len))
+ return 0;
+ if ((it->size & INTxx_FLAG_SIGNED) == 0 && neg) {
+ ASN1err(ASN1_F_UINT32_C2I, ASN1_R_ILLEGAL_NEGATIVE_VALUE);
+ return 0;
+ }
+ if (neg) {
+ if (utmp > ABS_INT32_MIN) {
+ ASN1err(ASN1_F_UINT32_C2I, ASN1_R_TOO_SMALL);
+ return 0;
+ }
+ utmp = 0 - utmp;
+ } else {
+ if (((it->size & INTxx_FLAG_SIGNED) != 0 && utmp > INT32_MAX)
+ || ((it->size & INTxx_FLAG_SIGNED) == 0 && utmp > UINT32_MAX)) {
+ ASN1err(ASN1_F_UINT32_C2I, ASN1_R_TOO_LARGE);
+ return 0;
+ }
+ }
+ utmp2 = (uint32_t)utmp;
+ memcpy(cp, &utmp2, sizeof(utmp2));
+ return 1;
+}
+
+static int uint32_print(BIO *out, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ int indent, const ASN1_PCTX *pctx)
+{
+ if ((it->size & INTxx_FLAG_SIGNED) == INTxx_FLAG_SIGNED)
+ return BIO_printf(out, "%d\n", **(int32_t **)pval);
+ return BIO_printf(out, "%u\n", **(uint32_t **)pval);
+}
+
+
+/* Define the primitives themselves */
+
+static ASN1_PRIMITIVE_FUNCS uint32_pf = {
+ NULL, 0,
+ uint32_new,
+ uint32_free,
+ uint32_clear,
+ uint32_c2i,
+ uint32_i2c,
+ uint32_print
+};
+
+static ASN1_PRIMITIVE_FUNCS uint64_pf = {
+ NULL, 0,
+ uint64_new,
+ uint64_free,
+ uint64_clear,
+ uint64_c2i,
+ uint64_i2c,
+ uint64_print
+};
+
+ASN1_ITEM_start(INT32)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &uint32_pf,
+ INTxx_FLAG_SIGNED, "INT32"
+ASN1_ITEM_end(INT32)
+
+ASN1_ITEM_start(UINT32)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &uint32_pf, 0, "UINT32"
+ASN1_ITEM_end(UINT32)
+
+ASN1_ITEM_start(INT64)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &uint64_pf,
+ INTxx_FLAG_SIGNED, "INT64"
+ASN1_ITEM_end(INT64)
+
+ASN1_ITEM_start(UINT64)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &uint64_pf, 0, "UINT64"
+ASN1_ITEM_end(UINT64)
+
+ASN1_ITEM_start(ZINT32)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &uint32_pf,
+ INTxx_FLAG_ZERO_DEFAULT|INTxx_FLAG_SIGNED, "ZINT32"
+ASN1_ITEM_end(ZINT32)
+
+ASN1_ITEM_start(ZUINT32)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &uint32_pf,
+ INTxx_FLAG_ZERO_DEFAULT, "ZUINT32"
+ASN1_ITEM_end(ZUINT32)
+
+ASN1_ITEM_start(ZINT64)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &uint64_pf,
+ INTxx_FLAG_ZERO_DEFAULT|INTxx_FLAG_SIGNED, "ZINT64"
+ASN1_ITEM_end(ZINT64)
+
+ASN1_ITEM_start(ZUINT64)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &uint64_pf,
+ INTxx_FLAG_ZERO_DEFAULT, "ZUINT64"
+ASN1_ITEM_end(ZUINT64)
+
diff --git a/openssl-1.1.0h/crypto/asn1/x_long.c b/openssl-1.1.0h/crypto/asn1/x_long.c
new file mode 100644
index 0000000..5895345
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/x_long.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1t.h>
+
+/*
+ * Custom primitive type for long handling. This converts between an
+ * ASN1_INTEGER and a long directly.
+ */
+
+static int long_new(ASN1_VALUE **pval, const ASN1_ITEM *it);
+static void long_free(ASN1_VALUE **pval, const ASN1_ITEM *it);
+
+static int long_i2c(ASN1_VALUE **pval, unsigned char *cont, int *putype,
+ const ASN1_ITEM *it);
+static int long_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
+ int utype, char *free_cont, const ASN1_ITEM *it);
+static int long_print(BIO *out, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ int indent, const ASN1_PCTX *pctx);
+
+static ASN1_PRIMITIVE_FUNCS long_pf = {
+ NULL, 0,
+ long_new,
+ long_free,
+ long_free, /* Clear should set to initial value */
+ long_c2i,
+ long_i2c,
+ long_print
+};
+
+ASN1_ITEM_start(LONG)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &long_pf, ASN1_LONG_UNDEF, "LONG"
+ASN1_ITEM_end(LONG)
+
+ASN1_ITEM_start(ZLONG)
+ ASN1_ITYPE_PRIMITIVE, V_ASN1_INTEGER, NULL, 0, &long_pf, 0, "ZLONG"
+ASN1_ITEM_end(ZLONG)
+
+static int long_new(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ *(long *)pval = it->size;
+ return 1;
+}
+
+static void long_free(ASN1_VALUE **pval, const ASN1_ITEM *it)
+{
+ *(long *)pval = it->size;
+}
+
+/*
+ * Originally BN_num_bits_word was called to perform this operation, but
+ * trouble is that there is no guarantee that sizeof(long) equals to
+ * sizeof(BN_ULONG). BN_ULONG is a configurable type that can be as wide
+ * as long, but also double or half...
+ */
+static int num_bits_ulong(unsigned long value)
+{
+ size_t i;
+ unsigned long ret = 0;
+
+ /*
+ * It is argued that *on average* constant counter loop performs
+ * not worse [if not better] than one with conditional break or
+ * mask-n-table-lookup-style, because of branch misprediction
+ * penalties.
+ */
+ for (i = 0; i < sizeof(value) * 8; i++) {
+ ret += (value != 0);
+ value >>= 1;
+ }
+
+ return (int)ret;
+}
+
+static int long_i2c(ASN1_VALUE **pval, unsigned char *cont, int *putype,
+ const ASN1_ITEM *it)
+{
+ long ltmp;
+ unsigned long utmp, sign;
+ int clen, pad, i;
+ /* this exists to bypass broken gcc optimization */
+ char *cp = (char *)pval;
+
+ /* use memcpy, because we may not be long aligned */
+ memcpy(&ltmp, cp, sizeof(long));
+
+ if (ltmp == it->size)
+ return -1;
+ /*
+ * Convert the long to positive: we subtract one if negative so we can
+ * cleanly handle the padding if only the MSB of the leading octet is
+ * set.
+ */
+ if (ltmp < 0) {
+ sign = 0xff;
+ utmp = 0 - (unsigned long)ltmp - 1;
+ } else {
+ sign = 0;
+ utmp = ltmp;
+ }
+ clen = num_bits_ulong(utmp);
+ /* If MSB of leading octet set we need to pad */
+ if (!(clen & 0x7))
+ pad = 1;
+ else
+ pad = 0;
+
+ /* Convert number of bits to number of octets */
+ clen = (clen + 7) >> 3;
+
+ if (cont != NULL) {
+ if (pad)
+ *cont++ = (unsigned char)sign;
+ for (i = clen - 1; i >= 0; i--) {
+ cont[i] = (unsigned char)(utmp ^ sign);
+ utmp >>= 8;
+ }
+ }
+ return clen + pad;
+}
+
+static int long_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
+ int utype, char *free_cont, const ASN1_ITEM *it)
+{
+ int i;
+ long ltmp;
+ unsigned long utmp = 0, sign = 0x100;
+ char *cp = (char *)pval;
+
+ if (len > 1) {
+ /*
+ * Check possible pad byte. Worst case, we're skipping past actual
+ * content, but since that's only with 0x00 and 0xff and we set neg
+ * accordingly, the result will be correct in the end anyway.
+ */
+ switch (cont[0]) {
+ case 0xff:
+ cont++;
+ len--;
+ sign = 0xff;
+ break;
+ case 0:
+ cont++;
+ len--;
+ sign = 0;
+ break;
+ }
+ }
+ if (len > (int)sizeof(long)) {
+ ASN1err(ASN1_F_LONG_C2I, ASN1_R_INTEGER_TOO_LARGE_FOR_LONG);
+ return 0;
+ }
+
+ if (sign == 0x100) {
+ /* Is it negative? */
+ if (len && (cont[0] & 0x80))
+ sign = 0xff;
+ else
+ sign = 0;
+ } else if (((sign ^ cont[0]) & 0x80) == 0) { /* same sign bit? */
+ ASN1err(ASN1_F_LONG_C2I, ASN1_R_ILLEGAL_PADDING);
+ return 0;
+ }
+ utmp = 0;
+ for (i = 0; i < len; i++) {
+ utmp <<= 8;
+ utmp |= cont[i] ^ sign;
+ }
+ ltmp = (long)utmp;
+ if (ltmp < 0) {
+ ASN1err(ASN1_F_LONG_C2I, ASN1_R_INTEGER_TOO_LARGE_FOR_LONG);
+ return 0;
+ }
+ if (sign)
+ ltmp = -ltmp - 1;
+ if (ltmp == it->size) {
+ ASN1err(ASN1_F_LONG_C2I, ASN1_R_INTEGER_TOO_LARGE_FOR_LONG);
+ return 0;
+ }
+ memcpy(cp, &ltmp, sizeof(long));
+ return 1;
+}
+
+static int long_print(BIO *out, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ int indent, const ASN1_PCTX *pctx)
+{
+ return BIO_printf(out, "%ld\n", *(long *)pval);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/x_pkey.c b/openssl-1.1.0h/crypto/asn1/x_pkey.c
new file mode 100644
index 0000000..593049f
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/x_pkey.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/x509.h>
+
+X509_PKEY *X509_PKEY_new(void)
+{
+ X509_PKEY *ret = NULL;
+
+ ret = OPENSSL_zalloc(sizeof(*ret));
+ if (ret == NULL)
+ goto err;
+
+ ret->enc_algor = X509_ALGOR_new();
+ ret->enc_pkey = ASN1_OCTET_STRING_new();
+ if (ret->enc_algor == NULL || ret->enc_pkey == NULL)
+ goto err;
+
+ return ret;
+err:
+ X509_PKEY_free(ret);
+ ASN1err(ASN1_F_X509_PKEY_NEW, ERR_R_MALLOC_FAILURE);
+ return NULL;
+}
+
+void X509_PKEY_free(X509_PKEY *x)
+{
+ if (x == NULL)
+ return;
+
+ X509_ALGOR_free(x->enc_algor);
+ ASN1_OCTET_STRING_free(x->enc_pkey);
+ EVP_PKEY_free(x->dec_pkey);
+ if (x->key_free)
+ OPENSSL_free(x->key_data);
+ OPENSSL_free(x);
+}
diff --git a/openssl-1.1.0h/crypto/asn1/x_sig.c b/openssl-1.1.0h/crypto/asn1/x_sig.c
new file mode 100644
index 0000000..e465cf2
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/x_sig.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include "internal/x509_int.h"
+
+ASN1_SEQUENCE(X509_SIG) = {
+ ASN1_SIMPLE(X509_SIG, algor, X509_ALGOR),
+ ASN1_SIMPLE(X509_SIG, digest, ASN1_OCTET_STRING)
+} ASN1_SEQUENCE_END(X509_SIG)
+
+IMPLEMENT_ASN1_FUNCTIONS(X509_SIG)
+
+void X509_SIG_get0(const X509_SIG *sig, const X509_ALGOR **palg,
+ const ASN1_OCTET_STRING **pdigest)
+{
+ if (palg)
+ *palg = sig->algor;
+ if (pdigest)
+ *pdigest = sig->digest;
+}
+
+void X509_SIG_getm(X509_SIG *sig, X509_ALGOR **palg,
+ ASN1_OCTET_STRING **pdigest)
+{
+ if (palg)
+ *palg = sig->algor;
+ if (pdigest)
+ *pdigest = sig->digest;
+}
diff --git a/openssl-1.1.0h/crypto/asn1/x_spki.c b/openssl-1.1.0h/crypto/asn1/x_spki.c
new file mode 100644
index 0000000..c45400b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/x_spki.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+ /*
+ * This module was send to me my Pat Richards <patr@x509.com> who wrote it.
+ * It is under my Copyright with his permission
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/x509.h>
+#include <openssl/asn1t.h>
+
+ASN1_SEQUENCE(NETSCAPE_SPKAC) = {
+ ASN1_SIMPLE(NETSCAPE_SPKAC, pubkey, X509_PUBKEY),
+ ASN1_SIMPLE(NETSCAPE_SPKAC, challenge, ASN1_IA5STRING)
+} ASN1_SEQUENCE_END(NETSCAPE_SPKAC)
+
+IMPLEMENT_ASN1_FUNCTIONS(NETSCAPE_SPKAC)
+
+ASN1_SEQUENCE(NETSCAPE_SPKI) = {
+ ASN1_SIMPLE(NETSCAPE_SPKI, spkac, NETSCAPE_SPKAC),
+ ASN1_EMBED(NETSCAPE_SPKI, sig_algor, X509_ALGOR),
+ ASN1_SIMPLE(NETSCAPE_SPKI, signature, ASN1_BIT_STRING)
+} ASN1_SEQUENCE_END(NETSCAPE_SPKI)
+
+IMPLEMENT_ASN1_FUNCTIONS(NETSCAPE_SPKI)
diff --git a/openssl-1.1.0h/crypto/asn1/x_val.c b/openssl-1.1.0h/crypto/asn1/x_val.c
new file mode 100644
index 0000000..d1f1d3b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/asn1/x_val.c
@@ -0,0 +1,20 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "internal/cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+
+ASN1_SEQUENCE(X509_VAL) = {
+ ASN1_SIMPLE(X509_VAL, notBefore, ASN1_TIME),
+ ASN1_SIMPLE(X509_VAL, notAfter, ASN1_TIME)
+} ASN1_SEQUENCE_END(X509_VAL)
+
+IMPLEMENT_ASN1_FUNCTIONS(X509_VAL)
diff --git a/openssl-1.1.0h/crypto/async/arch/async_null.c b/openssl-1.1.0h/crypto/async/arch/async_null.c
new file mode 100644
index 0000000..3eaf170
--- /dev/null
+++ b/openssl-1.1.0h/crypto/async/arch/async_null.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/* This must be the first #include file */
+#include "../async_locl.h"
+
+#ifdef ASYNC_NULL
+int ASYNC_is_capable(void)
+{
+ return 0;
+}
+
+void async_local_cleanup(void)
+{
+}
+#endif
+
diff --git a/openssl-1.1.0h/crypto/async/arch/async_null.h b/openssl-1.1.0h/crypto/async/arch/async_null.h
new file mode 100644
index 0000000..aef40b5
--- /dev/null
+++ b/openssl-1.1.0h/crypto/async/arch/async_null.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/async.h>
+
+/*
+ * If we haven't managed to detect any other async architecture then we default
+ * to NULL.
+ */
+#ifndef ASYNC_ARCH
+# define ASYNC_NULL
+# define ASYNC_ARCH
+
+typedef struct async_fibre_st {
+ int dummy;
+} async_fibre;
+
+
+# define async_fibre_swapcontext(o,n,r) 0
+# define async_fibre_makecontext(c) 0
+# define async_fibre_free(f)
+# define async_fibre_init_dispatcher(f)
+
+#endif
diff --git a/openssl-1.1.0h/crypto/async/arch/async_posix.c b/openssl-1.1.0h/crypto/async/arch/async_posix.c
new file mode 100644
index 0000000..02c342d
--- /dev/null
+++ b/openssl-1.1.0h/crypto/async/arch/async_posix.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/* This must be the first #include file */
+#include "../async_locl.h"
+
+#ifdef ASYNC_POSIX
+
+# include <stddef.h>
+# include <unistd.h>
+
+#define STACKSIZE 32768
+
+int ASYNC_is_capable(void)
+{
+ ucontext_t ctx;
+
+ /*
+ * Some platforms provide getcontext() but it does not work (notably
+ * MacOSX PPC64). Check for a working getcontext();
+ */
+ return getcontext(&ctx) == 0;
+}
+
+void async_local_cleanup(void)
+{
+}
+
+int async_fibre_makecontext(async_fibre *fibre)
+{
+ fibre->env_init = 0;
+ if (getcontext(&fibre->fibre) == 0) {
+ fibre->fibre.uc_stack.ss_sp = OPENSSL_malloc(STACKSIZE);
+ if (fibre->fibre.uc_stack.ss_sp != NULL) {
+ fibre->fibre.uc_stack.ss_size = STACKSIZE;
+ fibre->fibre.uc_link = NULL;
+ makecontext(&fibre->fibre, async_start_func, 0);
+ return 1;
+ }
+ } else {
+ fibre->fibre.uc_stack.ss_sp = NULL;
+ }
+ return 0;
+}
+
+void async_fibre_free(async_fibre *fibre)
+{
+ OPENSSL_free(fibre->fibre.uc_stack.ss_sp);
+ fibre->fibre.uc_stack.ss_sp = NULL;
+}
+
+#endif
diff --git a/openssl-1.1.0h/crypto/async/arch/async_posix.h b/openssl-1.1.0h/crypto/async/arch/async_posix.h
new file mode 100644
index 0000000..76937a9
--- /dev/null
+++ b/openssl-1.1.0h/crypto/async/arch/async_posix.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#ifndef OPENSSL_ASYNC_ARCH_ASYNC_POSIX_H
+#define OPENSSL_ASYNC_ARCH_ASYNC_POSIX_H
+#include <openssl/e_os2.h>
+
+#if defined(OPENSSL_SYS_UNIX) \
+ && defined(OPENSSL_THREADS) && !defined(OPENSSL_NO_ASYNC) \
+ && !defined(__ANDROID__) && !defined(__OpenBSD__)
+
+# include <unistd.h>
+
+# if _POSIX_VERSION >= 200112L
+
+# include <pthread.h>
+
+# define ASYNC_POSIX
+# define ASYNC_ARCH
+
+# include <ucontext.h>
+# include <setjmp.h>
+# include "e_os.h"
+
+typedef struct async_fibre_st {
+ ucontext_t fibre;
+ jmp_buf env;
+ int env_init;
+} async_fibre;
+
+static ossl_inline int async_fibre_swapcontext(async_fibre *o, async_fibre *n, int r)
+{
+ o->env_init = 1;
+
+ if (!r || !_setjmp(o->env)) {
+ if (n->env_init)
+ _longjmp(n->env, 1);
+ else
+ setcontext(&n->fibre);
+ }
+
+ return 1;
+}
+
+# define async_fibre_init_dispatcher(d)
+
+int async_fibre_makecontext(async_fibre *fibre);
+void async_fibre_free(async_fibre *fibre);
+
+# endif
+#endif
+#endif /* OPENSSL_ASYNC_ARCH_ASYNC_POSIX_H */
diff --git a/openssl-1.1.0h/crypto/async/arch/async_win.c b/openssl-1.1.0h/crypto/async/arch/async_win.c
new file mode 100644
index 0000000..077d56c
--- /dev/null
+++ b/openssl-1.1.0h/crypto/async/arch/async_win.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/* This must be the first #include file */
+#include "../async_locl.h"
+
+#ifdef ASYNC_WIN
+
+# include <windows.h>
+# include "internal/cryptlib.h"
+
+int ASYNC_is_capable(void)
+{
+ return 1;
+}
+
+void async_local_cleanup(void)
+{
+ async_ctx *ctx = async_get_ctx();
+ if (ctx != NULL) {
+ async_fibre *fibre = &ctx->dispatcher;
+ if (fibre != NULL && fibre->fibre != NULL && fibre->converted) {
+ ConvertFiberToThread();
+ fibre->fibre = NULL;
+ }
+ }
+}
+
+int async_fibre_init_dispatcher(async_fibre *fibre)
+{
+ fibre->fibre = ConvertThreadToFiber(NULL);
+ if (fibre->fibre == NULL) {
+ fibre->converted = 0;
+ fibre->fibre = GetCurrentFiber();
+ if (fibre->fibre == NULL)
+ return 0;
+ } else {
+ fibre->converted = 1;
+ }
+
+ return 1;
+}
+
+VOID CALLBACK async_start_func_win(PVOID unused)
+{
+ async_start_func();
+}
+
+#endif
diff --git a/openssl-1.1.0h/crypto/async/arch/async_win.h b/openssl-1.1.0h/crypto/async/arch/async_win.h
new file mode 100644
index 0000000..61cfdd7
--- /dev/null
+++ b/openssl-1.1.0h/crypto/async/arch/async_win.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * This is the same detection used in cryptlib to set up the thread local
+ * storage that we depend on, so just copy that
+ */
+#if defined(_WIN32) && !defined(OPENSSL_NO_ASYNC)
+#include <openssl/async.h>
+# define ASYNC_WIN
+# define ASYNC_ARCH
+
+# include <windows.h>
+# include "internal/cryptlib.h"
+
+typedef struct async_fibre_st {
+ LPVOID fibre;
+ int converted;
+} async_fibre;
+
+# define async_fibre_swapcontext(o,n,r) \
+ (SwitchToFiber((n)->fibre), 1)
+# define async_fibre_makecontext(c) \
+ ((c)->fibre = CreateFiber(0, async_start_func_win, 0))
+# define async_fibre_free(f) (DeleteFiber((f)->fibre))
+
+int async_fibre_init_dispatcher(async_fibre *fibre);
+VOID CALLBACK async_start_func_win(PVOID unused);
+
+#endif
diff --git a/openssl-1.1.0h/crypto/async/async.c b/openssl-1.1.0h/crypto/async/async.c
new file mode 100644
index 0000000..9a4e6b2
--- /dev/null
+++ b/openssl-1.1.0h/crypto/async/async.c
@@ -0,0 +1,433 @@
+/*
+ * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Without this we start getting longjmp crashes because it thinks we're jumping
+ * up the stack when in fact we are jumping to an entirely different stack. The
+ * cost of this is not having certain buffer overrun/underrun checks etc for
+ * this source file :-(
+ */
+#undef _FORTIFY_SOURCE
+
+/* This must be the first #include file */
+#include "async_locl.h"
+
+#include <openssl/err.h>
+#include <internal/cryptlib_int.h>
+#include <string.h>
+
+#define ASYNC_JOB_RUNNING 0
+#define ASYNC_JOB_PAUSING 1
+#define ASYNC_JOB_PAUSED 2
+#define ASYNC_JOB_STOPPING 3
+
+static CRYPTO_THREAD_LOCAL ctxkey;
+static CRYPTO_THREAD_LOCAL poolkey;
+
+static void async_free_pool_internal(async_pool *pool);
+
+static async_ctx *async_ctx_new(void)
+{
+ async_ctx *nctx = NULL;
+
+ nctx = OPENSSL_malloc(sizeof(async_ctx));
+ if (nctx == NULL) {
+ ASYNCerr(ASYNC_F_ASYNC_CTX_NEW, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ async_fibre_init_dispatcher(&nctx->dispatcher);
+ nctx->currjob = NULL;
+ nctx->blocked = 0;
+ if (!CRYPTO_THREAD_set_local(&ctxkey, nctx))
+ goto err;
+
+ return nctx;
+err:
+ OPENSSL_free(nctx);
+
+ return NULL;
+}
+
+async_ctx *async_get_ctx(void)
+{
+ if (!OPENSSL_init_crypto(OPENSSL_INIT_ASYNC, NULL))
+ return NULL;
+
+ return (async_ctx *)CRYPTO_THREAD_get_local(&ctxkey);
+}
+
+static int async_ctx_free(void)
+{
+ async_ctx *ctx;
+
+ ctx = async_get_ctx();
+
+ if (!CRYPTO_THREAD_set_local(&ctxkey, NULL))
+ return 0;
+
+ OPENSSL_free(ctx);
+
+ return 1;
+}
+
+static ASYNC_JOB *async_job_new(void)
+{
+ ASYNC_JOB *job = NULL;
+
+ job = OPENSSL_zalloc(sizeof(ASYNC_JOB));
+ if (job == NULL) {
+ ASYNCerr(ASYNC_F_ASYNC_JOB_NEW, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ job->status = ASYNC_JOB_RUNNING;
+
+ return job;
+}
+
+static void async_job_free(ASYNC_JOB *job)
+{
+ if (job != NULL) {
+ OPENSSL_free(job->funcargs);
+ async_fibre_free(&job->fibrectx);
+ OPENSSL_free(job);
+ }
+}
+
+static ASYNC_JOB *async_get_pool_job(void) {
+ ASYNC_JOB *job;
+ async_pool *pool;
+
+ pool = (async_pool *)CRYPTO_THREAD_get_local(&poolkey);
+ if (pool == NULL) {
+ /*
+ * Pool has not been initialised, so init with the defaults, i.e.
+ * no max size and no pre-created jobs
+ */
+ if (ASYNC_init_thread(0, 0) == 0)
+ return NULL;
+ pool = (async_pool *)CRYPTO_THREAD_get_local(&poolkey);
+ }
+
+ job = sk_ASYNC_JOB_pop(pool->jobs);
+ if (job == NULL) {
+ /* Pool is empty */
+ if ((pool->max_size != 0) && (pool->curr_size >= pool->max_size))
+ return NULL;
+
+ job = async_job_new();
+ if (job != NULL) {
+ if (! async_fibre_makecontext(&job->fibrectx)) {
+ async_job_free(job);
+ return NULL;
+ }
+ pool->curr_size++;
+ }
+ }
+ return job;
+}
+
+static void async_release_job(ASYNC_JOB *job) {
+ async_pool *pool;
+
+ pool = (async_pool *)CRYPTO_THREAD_get_local(&poolkey);
+ OPENSSL_free(job->funcargs);
+ job->funcargs = NULL;
+ sk_ASYNC_JOB_push(pool->jobs, job);
+}
+
+void async_start_func(void)
+{
+ ASYNC_JOB *job;
+ async_ctx *ctx = async_get_ctx();
+
+ while (1) {
+ /* Run the job */
+ job = ctx->currjob;
+ job->ret = job->func(job->funcargs);
+
+ /* Stop the job */
+ job->status = ASYNC_JOB_STOPPING;
+ if (!async_fibre_swapcontext(&job->fibrectx,
+ &ctx->dispatcher, 1)) {
+ /*
+ * Should not happen. Getting here will close the thread...can't do
+ * much about it
+ */
+ ASYNCerr(ASYNC_F_ASYNC_START_FUNC, ASYNC_R_FAILED_TO_SWAP_CONTEXT);
+ }
+ }
+}
+
+int ASYNC_start_job(ASYNC_JOB **job, ASYNC_WAIT_CTX *wctx, int *ret,
+ int (*func)(void *), void *args, size_t size)
+{
+ async_ctx *ctx = async_get_ctx();
+ if (ctx == NULL)
+ ctx = async_ctx_new();
+ if (ctx == NULL) {
+ return ASYNC_ERR;
+ }
+
+ if (*job) {
+ ctx->currjob = *job;
+ }
+
+ for (;;) {
+ if (ctx->currjob != NULL) {
+ if (ctx->currjob->status == ASYNC_JOB_STOPPING) {
+ *ret = ctx->currjob->ret;
+ ctx->currjob->waitctx = NULL;
+ async_release_job(ctx->currjob);
+ ctx->currjob = NULL;
+ *job = NULL;
+ return ASYNC_FINISH;
+ }
+
+ if (ctx->currjob->status == ASYNC_JOB_PAUSING) {
+ *job = ctx->currjob;
+ ctx->currjob->status = ASYNC_JOB_PAUSED;
+ ctx->currjob = NULL;
+ return ASYNC_PAUSE;
+ }
+
+ if (ctx->currjob->status == ASYNC_JOB_PAUSED) {
+ ctx->currjob = *job;
+ /* Resume previous job */
+ if (!async_fibre_swapcontext(&ctx->dispatcher,
+ &ctx->currjob->fibrectx, 1)) {
+ ASYNCerr(ASYNC_F_ASYNC_START_JOB,
+ ASYNC_R_FAILED_TO_SWAP_CONTEXT);
+ goto err;
+ }
+ continue;
+ }
+
+ /* Should not happen */
+ ASYNCerr(ASYNC_F_ASYNC_START_JOB, ERR_R_INTERNAL_ERROR);
+ async_release_job(ctx->currjob);
+ ctx->currjob = NULL;
+ *job = NULL;
+ return ASYNC_ERR;
+ }
+
+ /* Start a new job */
+ if ((ctx->currjob = async_get_pool_job()) == NULL) {
+ return ASYNC_NO_JOBS;
+ }
+
+ if (args != NULL) {
+ ctx->currjob->funcargs = OPENSSL_malloc(size);
+ if (ctx->currjob->funcargs == NULL) {
+ ASYNCerr(ASYNC_F_ASYNC_START_JOB, ERR_R_MALLOC_FAILURE);
+ async_release_job(ctx->currjob);
+ ctx->currjob = NULL;
+ return ASYNC_ERR;
+ }
+ memcpy(ctx->currjob->funcargs, args, size);
+ } else {
+ ctx->currjob->funcargs = NULL;
+ }
+
+ ctx->currjob->func = func;
+ ctx->currjob->waitctx = wctx;
+ if (!async_fibre_swapcontext(&ctx->dispatcher,
+ &ctx->currjob->fibrectx, 1)) {
+ ASYNCerr(ASYNC_F_ASYNC_START_JOB, ASYNC_R_FAILED_TO_SWAP_CONTEXT);
+ goto err;
+ }
+ }
+
+err:
+ async_release_job(ctx->currjob);
+ ctx->currjob = NULL;
+ *job = NULL;
+ return ASYNC_ERR;
+}
+
+int ASYNC_pause_job(void)
+{
+ ASYNC_JOB *job;
+ async_ctx *ctx = async_get_ctx();
+
+ if (ctx == NULL
+ || ctx->currjob == NULL
+ || ctx->blocked) {
+ /*
+ * Could be we've deliberately not been started within a job so this is
+ * counted as success.
+ */
+ return 1;
+ }
+
+ job = ctx->currjob;
+ job->status = ASYNC_JOB_PAUSING;
+
+ if (!async_fibre_swapcontext(&job->fibrectx,
+ &ctx->dispatcher, 1)) {
+ ASYNCerr(ASYNC_F_ASYNC_PAUSE_JOB, ASYNC_R_FAILED_TO_SWAP_CONTEXT);
+ return 0;
+ }
+ /* Reset counts of added and deleted fds */
+ async_wait_ctx_reset_counts(job->waitctx);
+
+ return 1;
+}
+
+static void async_empty_pool(async_pool *pool)
+{
+ ASYNC_JOB *job;
+
+ if (!pool || !pool->jobs)
+ return;
+
+ do {
+ job = sk_ASYNC_JOB_pop(pool->jobs);
+ async_job_free(job);
+ } while (job);
+}
+
+int async_init(void)
+{
+ if (!CRYPTO_THREAD_init_local(&ctxkey, NULL))
+ return 0;
+
+ if (!CRYPTO_THREAD_init_local(&poolkey, NULL)) {
+ CRYPTO_THREAD_cleanup_local(&ctxkey);
+ return 0;
+ }
+
+ return 1;
+}
+
+void async_deinit(void)
+{
+ CRYPTO_THREAD_cleanup_local(&ctxkey);
+ CRYPTO_THREAD_cleanup_local(&poolkey);
+}
+
+int ASYNC_init_thread(size_t max_size, size_t init_size)
+{
+ async_pool *pool;
+ size_t curr_size = 0;
+
+ if (init_size > max_size) {
+ ASYNCerr(ASYNC_F_ASYNC_INIT_THREAD, ASYNC_R_INVALID_POOL_SIZE);
+ return 0;
+ }
+
+ if (!OPENSSL_init_crypto(OPENSSL_INIT_ASYNC, NULL)) {
+ return 0;
+ }
+ if (!ossl_init_thread_start(OPENSSL_INIT_THREAD_ASYNC)) {
+ return 0;
+ }
+
+ pool = OPENSSL_zalloc(sizeof(*pool));
+ if (pool == NULL) {
+ ASYNCerr(ASYNC_F_ASYNC_INIT_THREAD, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+
+ pool->jobs = sk_ASYNC_JOB_new_null();
+ if (pool->jobs == NULL) {
+ ASYNCerr(ASYNC_F_ASYNC_INIT_THREAD, ERR_R_MALLOC_FAILURE);
+ OPENSSL_free(pool);
+ return 0;
+ }
+
+ pool->max_size = max_size;
+
+ /* Pre-create jobs as required */
+ while (init_size--) {
+ ASYNC_JOB *job;
+ job = async_job_new();
+ if (job == NULL || !async_fibre_makecontext(&job->fibrectx)) {
+ /*
+ * Not actually fatal because we already created the pool, just
+ * skip creation of any more jobs
+ */
+ async_job_free(job);
+ break;
+ }
+ job->funcargs = NULL;
+ sk_ASYNC_JOB_push(pool->jobs, job);
+ curr_size++;
+ }
+ pool->curr_size = curr_size;
+ if (!CRYPTO_THREAD_set_local(&poolkey, pool)) {
+ ASYNCerr(ASYNC_F_ASYNC_INIT_THREAD, ASYNC_R_FAILED_TO_SET_POOL);
+ goto err;
+ }
+
+ return 1;
+err:
+ async_free_pool_internal(pool);
+ return 0;
+}
+
+static void async_free_pool_internal(async_pool *pool)
+{
+ if (pool == NULL)
+ return;
+
+ async_empty_pool(pool);
+ sk_ASYNC_JOB_free(pool->jobs);
+ OPENSSL_free(pool);
+ CRYPTO_THREAD_set_local(&poolkey, NULL);
+ async_local_cleanup();
+ async_ctx_free();
+}
+
+void ASYNC_cleanup_thread(void)
+{
+ async_free_pool_internal((async_pool *)CRYPTO_THREAD_get_local(&poolkey));
+}
+
+ASYNC_JOB *ASYNC_get_current_job(void)
+{
+ async_ctx *ctx;
+
+ ctx = async_get_ctx();
+ if (ctx == NULL)
+ return NULL;
+
+ return ctx->currjob;
+}
+
+ASYNC_WAIT_CTX *ASYNC_get_wait_ctx(ASYNC_JOB *job)
+{
+ return job->waitctx;
+}
+
+void ASYNC_block_pause(void)
+{
+ async_ctx *ctx = async_get_ctx();
+ if (ctx == NULL || ctx->currjob == NULL) {
+ /*
+ * We're not in a job anyway so ignore this
+ */
+ return;
+ }
+ ctx->blocked++;
+}
+
+void ASYNC_unblock_pause(void)
+{
+ async_ctx *ctx = async_get_ctx();
+ if (ctx == NULL || ctx->currjob == NULL) {
+ /*
+ * We're not in a job anyway so ignore this
+ */
+ return;
+ }
+ if (ctx->blocked > 0)
+ ctx->blocked--;
+}
diff --git a/openssl-1.1.0h/crypto/async/async_err.c b/openssl-1.1.0h/crypto/async/async_err.c
new file mode 100644
index 0000000..ae97e96
--- /dev/null
+++ b/openssl-1.1.0h/crypto/async/async_err.c
@@ -0,0 +1,51 @@
+/*
+ * Generated by util/mkerr.pl DO NOT EDIT
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <openssl/err.h>
+#include <openssl/async.h>
+
+/* BEGIN ERROR CODES */
+#ifndef OPENSSL_NO_ERR
+
+# define ERR_FUNC(func) ERR_PACK(ERR_LIB_ASYNC,func,0)
+# define ERR_REASON(reason) ERR_PACK(ERR_LIB_ASYNC,0,reason)
+
+static ERR_STRING_DATA ASYNC_str_functs[] = {
+ {ERR_FUNC(ASYNC_F_ASYNC_CTX_NEW), "async_ctx_new"},
+ {ERR_FUNC(ASYNC_F_ASYNC_INIT_THREAD), "ASYNC_init_thread"},
+ {ERR_FUNC(ASYNC_F_ASYNC_JOB_NEW), "async_job_new"},
+ {ERR_FUNC(ASYNC_F_ASYNC_PAUSE_JOB), "ASYNC_pause_job"},
+ {ERR_FUNC(ASYNC_F_ASYNC_START_FUNC), "async_start_func"},
+ {ERR_FUNC(ASYNC_F_ASYNC_START_JOB), "ASYNC_start_job"},
+ {0, NULL}
+};
+
+static ERR_STRING_DATA ASYNC_str_reasons[] = {
+ {ERR_REASON(ASYNC_R_FAILED_TO_SET_POOL), "failed to set pool"},
+ {ERR_REASON(ASYNC_R_FAILED_TO_SWAP_CONTEXT), "failed to swap context"},
+ {ERR_REASON(ASYNC_R_INIT_FAILED), "init failed"},
+ {ERR_REASON(ASYNC_R_INVALID_POOL_SIZE), "invalid pool size"},
+ {0, NULL}
+};
+
+#endif
+
+int ERR_load_ASYNC_strings(void)
+{
+#ifndef OPENSSL_NO_ERR
+
+ if (ERR_func_error_string(ASYNC_str_functs[0].error) == NULL) {
+ ERR_load_strings(0, ASYNC_str_functs);
+ ERR_load_strings(0, ASYNC_str_reasons);
+ }
+#endif
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/async/async_locl.h b/openssl-1.1.0h/crypto/async/async_locl.h
new file mode 100644
index 0000000..f0ac05a
--- /dev/null
+++ b/openssl-1.1.0h/crypto/async/async_locl.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Must do this before including any header files, because on MacOS/X <stlib.h>
+ * includes <signal.h> which includes <ucontext.h>
+ */
+#if defined(__APPLE__) && defined(__MACH__) && !defined(_XOPEN_SOURCE)
+# define _XOPEN_SOURCE /* Otherwise incomplete ucontext_t structure */
+# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+#if defined(_WIN32)
+# include <windows.h>
+#endif
+
+#include <internal/async.h>
+#include <openssl/crypto.h>
+
+typedef struct async_ctx_st async_ctx;
+typedef struct async_pool_st async_pool;
+
+#include "arch/async_win.h"
+#include "arch/async_posix.h"
+#include "arch/async_null.h"
+
+struct async_ctx_st {
+ async_fibre dispatcher;
+ ASYNC_JOB *currjob;
+ unsigned int blocked;
+};
+
+struct async_job_st {
+ async_fibre fibrectx;
+ int (*func) (void *);
+ void *funcargs;
+ int ret;
+ int status;
+ ASYNC_WAIT_CTX *waitctx;
+};
+
+struct fd_lookup_st {
+ const void *key;
+ OSSL_ASYNC_FD fd;
+ void *custom_data;
+ void (*cleanup)(ASYNC_WAIT_CTX *, const void *, OSSL_ASYNC_FD, void *);
+ int add;
+ int del;
+ struct fd_lookup_st *next;
+};
+
+struct async_wait_ctx_st {
+ struct fd_lookup_st *fds;
+ size_t numadd;
+ size_t numdel;
+};
+
+DEFINE_STACK_OF(ASYNC_JOB)
+
+struct async_pool_st {
+ STACK_OF(ASYNC_JOB) *jobs;
+ size_t curr_size;
+ size_t max_size;
+};
+
+void async_local_cleanup(void);
+void async_start_func(void);
+async_ctx *async_get_ctx(void);
+
+void async_wait_ctx_reset_counts(ASYNC_WAIT_CTX *ctx);
+
diff --git a/openssl-1.1.0h/crypto/async/async_wait.c b/openssl-1.1.0h/crypto/async/async_wait.c
new file mode 100644
index 0000000..0a0bf87
--- /dev/null
+++ b/openssl-1.1.0h/crypto/async/async_wait.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/* This must be the first #include file */
+#include "async_locl.h"
+
+#include <openssl/err.h>
+
+ASYNC_WAIT_CTX *ASYNC_WAIT_CTX_new(void)
+{
+ return OPENSSL_zalloc(sizeof(ASYNC_WAIT_CTX));
+}
+
+void ASYNC_WAIT_CTX_free(ASYNC_WAIT_CTX *ctx)
+{
+ struct fd_lookup_st *curr;
+ struct fd_lookup_st *next;
+
+ if (ctx == NULL)
+ return;
+
+ curr = ctx->fds;
+ while (curr != NULL) {
+ if (!curr->del) {
+ /* Only try and cleanup if it hasn't been marked deleted */
+ if (curr->cleanup != NULL)
+ curr->cleanup(ctx, curr->key, curr->fd, curr->custom_data);
+ }
+ /* Always free the fd_lookup_st */
+ next = curr->next;
+ OPENSSL_free(curr);
+ curr = next;
+ }
+
+ OPENSSL_free(ctx);
+}
+int ASYNC_WAIT_CTX_set_wait_fd(ASYNC_WAIT_CTX *ctx, const void *key,
+ OSSL_ASYNC_FD fd, void *custom_data,
+ void (*cleanup)(ASYNC_WAIT_CTX *, const void *,
+ OSSL_ASYNC_FD, void *))
+{
+ struct fd_lookup_st *fdlookup;
+
+ fdlookup = OPENSSL_zalloc(sizeof(*fdlookup));
+ if (fdlookup == NULL)
+ return 0;
+
+ fdlookup->key = key;
+ fdlookup->fd = fd;
+ fdlookup->custom_data = custom_data;
+ fdlookup->cleanup = cleanup;
+ fdlookup->add = 1;
+ fdlookup->next = ctx->fds;
+ ctx->fds = fdlookup;
+ ctx->numadd++;
+ return 1;
+}
+
+int ASYNC_WAIT_CTX_get_fd(ASYNC_WAIT_CTX *ctx, const void *key,
+ OSSL_ASYNC_FD *fd, void **custom_data)
+{
+ struct fd_lookup_st *curr;
+
+ curr = ctx->fds;
+ while (curr != NULL) {
+ if (curr->del) {
+ /* This one has been marked deleted so do nothing */
+ curr = curr->next;
+ continue;
+ }
+ if (curr->key == key) {
+ *fd = curr->fd;
+ *custom_data = curr->custom_data;
+ return 1;
+ }
+ curr = curr->next;
+ }
+ return 0;
+}
+
+int ASYNC_WAIT_CTX_get_all_fds(ASYNC_WAIT_CTX *ctx, OSSL_ASYNC_FD *fd,
+ size_t *numfds)
+{
+ struct fd_lookup_st *curr;
+
+ curr = ctx->fds;
+ *numfds = 0;
+ while (curr != NULL) {
+ if (curr->del) {
+ /* This one has been marked deleted so do nothing */
+ curr = curr->next;
+ continue;
+ }
+ if (fd != NULL) {
+ *fd = curr->fd;
+ fd++;
+ }
+ (*numfds)++;
+ curr = curr->next;
+ }
+ return 1;
+}
+
+int ASYNC_WAIT_CTX_get_changed_fds(ASYNC_WAIT_CTX *ctx, OSSL_ASYNC_FD *addfd,
+ size_t *numaddfds, OSSL_ASYNC_FD *delfd,
+ size_t *numdelfds)
+{
+ struct fd_lookup_st *curr;
+
+ *numaddfds = ctx->numadd;
+ *numdelfds = ctx->numdel;
+ if (addfd == NULL && delfd == NULL)
+ return 1;
+
+ curr = ctx->fds;
+
+ while (curr != NULL) {
+ /* We ignore fds that have been marked as both added and deleted */
+ if (curr->del && !curr->add && (delfd != NULL)) {
+ *delfd = curr->fd;
+ delfd++;
+ }
+ if (curr->add && !curr->del && (addfd != NULL)) {
+ *addfd = curr->fd;
+ addfd++;
+ }
+ curr = curr->next;
+ }
+
+ return 1;
+}
+
+int ASYNC_WAIT_CTX_clear_fd(ASYNC_WAIT_CTX *ctx, const void *key)
+{
+ struct fd_lookup_st *curr, *prev;
+
+ curr = ctx->fds;
+ prev = NULL;
+ while (curr != NULL) {
+ if (curr->del == 1) {
+ /* This one has been marked deleted already so do nothing */
+ curr = curr->next;
+ continue;
+ }
+ if (curr->key == key) {
+ /* If fd has just been added, remove it from the list */
+ if (curr->add == 1) {
+ if (ctx->fds == curr) {
+ ctx->fds = curr->next;
+ } else {
+ prev->next = curr->next;
+ }
+
+ /* It is responsibility of the caller to cleanup before calling
+ * ASYNC_WAIT_CTX_clear_fd
+ */
+ OPENSSL_free(curr);
+ ctx->numadd--;
+ return 1;
+ }
+
+ /*
+ * Mark it as deleted. We don't call cleanup if explicitly asked
+ * to clear an fd. We assume the caller is going to do that (if
+ * appropriate).
+ */
+ curr->del = 1;
+ ctx->numdel++;
+ return 1;
+ }
+ prev = curr;
+ curr = curr->next;
+ }
+ return 0;
+}
+
+void async_wait_ctx_reset_counts(ASYNC_WAIT_CTX *ctx)
+{
+ struct fd_lookup_st *curr, *prev = NULL;
+
+ ctx->numadd = 0;
+ ctx->numdel = 0;
+
+ curr = ctx->fds;
+
+ while (curr != NULL) {
+ if (curr->del) {
+ if (prev == NULL)
+ ctx->fds = curr->next;
+ else
+ prev->next = curr->next;
+ OPENSSL_free(curr);
+ if (prev == NULL)
+ curr = ctx->fds;
+ else
+ curr = prev->next;
+ continue;
+ }
+ if (curr->add) {
+ curr->add = 0;
+ }
+ prev = curr;
+ curr = curr->next;
+ }
+}
diff --git a/openssl-1.1.0h/crypto/async/build.info b/openssl-1.1.0h/crypto/async/build.info
new file mode 100644
index 0000000..278e3e9
--- /dev/null
+++ b/openssl-1.1.0h/crypto/async/build.info
@@ -0,0 +1,4 @@
+LIBS=../../libcrypto
+SOURCE[../../libcrypto]=\
+ async.c async_wait.c async_err.c arch/async_posix.c arch/async_win.c \
+ arch/async_null.c
diff --git a/openssl-1.1.0h/crypto/bf/asm/bf-586.pl b/openssl-1.1.0h/crypto/bf/asm/bf-586.pl
new file mode 100644
index 0000000..ebc24f4
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bf/asm/bf-586.pl
@@ -0,0 +1,149 @@
+#! /usr/bin/env perl
+# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+require "cbc.pl";
+
+$output = pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],"bf-586.pl",$ARGV[$#ARGV] eq "386");
+
+$BF_ROUNDS=16;
+$BF_OFF=($BF_ROUNDS+2)*4;
+$L="edi";
+$R="esi";
+$P="ebp";
+$tmp1="eax";
+$tmp2="ebx";
+$tmp3="ecx";
+$tmp4="edx";
+
+&BF_encrypt("BF_encrypt",1);
+&BF_encrypt("BF_decrypt",0);
+&cbc("BF_cbc_encrypt","BF_encrypt","BF_decrypt",1,4,5,3,-1,-1);
+&asm_finish();
+
+close STDOUT;
+
+sub BF_encrypt
+ {
+ local($name,$enc)=@_;
+
+ &function_begin_B($name,"");
+
+ &comment("");
+
+ &push("ebp");
+ &push("ebx");
+ &mov($tmp2,&wparam(0));
+ &mov($P,&wparam(1));
+ &push("esi");
+ &push("edi");
+
+ &comment("Load the 2 words");
+ &mov($L,&DWP(0,$tmp2,"",0));
+ &mov($R,&DWP(4,$tmp2,"",0));
+
+ &xor( $tmp1, $tmp1);
+
+ # encrypting part
+
+ if ($enc)
+ {
+ &mov($tmp2,&DWP(0,$P,"",0));
+ &xor( $tmp3, $tmp3);
+
+ &xor($L,$tmp2);
+ for ($i=0; $i<$BF_ROUNDS; $i+=2)
+ {
+ &comment("");
+ &comment("Round $i");
+ &BF_ENCRYPT($i+1,$R,$L,$P,$tmp1,$tmp2,$tmp3,$tmp4,1);
+
+ &comment("");
+ &comment("Round ".sprintf("%d",$i+1));
+ &BF_ENCRYPT($i+2,$L,$R,$P,$tmp1,$tmp2,$tmp3,$tmp4,1);
+ }
+ # &mov($tmp1,&wparam(0)); In last loop
+ &mov($tmp4,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
+ }
+ else
+ {
+ &mov($tmp2,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
+ &xor( $tmp3, $tmp3);
+
+ &xor($L,$tmp2);
+ for ($i=$BF_ROUNDS; $i>0; $i-=2)
+ {
+ &comment("");
+ &comment("Round $i");
+ &BF_ENCRYPT($i,$R,$L,$P,$tmp1,$tmp2,$tmp3,$tmp4,0);
+ &comment("");
+ &comment("Round ".sprintf("%d",$i-1));
+ &BF_ENCRYPT($i-1,$L,$R,$P,$tmp1,$tmp2,$tmp3,$tmp4,0);
+ }
+ # &mov($tmp1,&wparam(0)); In last loop
+ &mov($tmp4,&DWP(0,$P,"",0));
+ }
+
+ &xor($R,$tmp4);
+ &mov(&DWP(4,$tmp1,"",0),$L);
+
+ &mov(&DWP(0,$tmp1,"",0),$R);
+ &function_end($name);
+ }
+
+sub BF_ENCRYPT
+ {
+ local($i,$L,$R,$P,$tmp1,$tmp2,$tmp3,$tmp4,$enc)=@_;
+
+ &mov( $tmp4, &DWP(&n2a($i*4),$P,"",0)); # for next round
+
+ &mov( $tmp2, $R);
+ &xor( $L, $tmp4);
+
+ &shr( $tmp2, 16);
+ &mov( $tmp4, $R);
+
+ &movb( &LB($tmp1), &HB($tmp2)); # A
+ &and( $tmp2, 0xff); # B
+
+ &movb( &LB($tmp3), &HB($tmp4)); # C
+ &and( $tmp4, 0xff); # D
+
+ &mov( $tmp1, &DWP(&n2a($BF_OFF+0x0000),$P,$tmp1,4));
+ &mov( $tmp2, &DWP(&n2a($BF_OFF+0x0400),$P,$tmp2,4));
+
+ &add( $tmp2, $tmp1);
+ &mov( $tmp1, &DWP(&n2a($BF_OFF+0x0800),$P,$tmp3,4));
+
+ &xor( $tmp2, $tmp1);
+ &mov( $tmp4, &DWP(&n2a($BF_OFF+0x0C00),$P,$tmp4,4));
+
+ &add( $tmp2, $tmp4);
+ if (($enc && ($i != 16)) || ((!$enc) && ($i != 1)))
+ { &xor( $tmp1, $tmp1); }
+ else
+ {
+ &comment("Load parameter 0 ($i) enc=$enc");
+ &mov($tmp1,&wparam(0));
+ } # In last loop
+
+ &xor( $L, $tmp2);
+ # delay
+ }
+
+sub n2a
+ {
+ sprintf("%d",$_[0]);
+ }
+
diff --git a/openssl-1.1.0h/crypto/bf/bf_cbc.c b/openssl-1.1.0h/crypto/bf/bf_cbc.c
new file mode 100644
index 0000000..6ed6257
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bf/bf_cbc.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/blowfish.h>
+#include "bf_locl.h"
+
+void BF_cbc_encrypt(const unsigned char *in, unsigned char *out, long length,
+ const BF_KEY *schedule, unsigned char *ivec, int encrypt)
+{
+ register BF_LONG tin0, tin1;
+ register BF_LONG tout0, tout1, xor0, xor1;
+ register long l = length;
+ BF_LONG tin[2];
+
+ if (encrypt) {
+ n2l(ivec, tout0);
+ n2l(ivec, tout1);
+ ivec -= 8;
+ for (l -= 8; l >= 0; l -= 8) {
+ n2l(in, tin0);
+ n2l(in, tin1);
+ tin0 ^= tout0;
+ tin1 ^= tout1;
+ tin[0] = tin0;
+ tin[1] = tin1;
+ BF_encrypt(tin, schedule);
+ tout0 = tin[0];
+ tout1 = tin[1];
+ l2n(tout0, out);
+ l2n(tout1, out);
+ }
+ if (l != -8) {
+ n2ln(in, tin0, tin1, l + 8);
+ tin0 ^= tout0;
+ tin1 ^= tout1;
+ tin[0] = tin0;
+ tin[1] = tin1;
+ BF_encrypt(tin, schedule);
+ tout0 = tin[0];
+ tout1 = tin[1];
+ l2n(tout0, out);
+ l2n(tout1, out);
+ }
+ l2n(tout0, ivec);
+ l2n(tout1, ivec);
+ } else {
+ n2l(ivec, xor0);
+ n2l(ivec, xor1);
+ ivec -= 8;
+ for (l -= 8; l >= 0; l -= 8) {
+ n2l(in, tin0);
+ n2l(in, tin1);
+ tin[0] = tin0;
+ tin[1] = tin1;
+ BF_decrypt(tin, schedule);
+ tout0 = tin[0] ^ xor0;
+ tout1 = tin[1] ^ xor1;
+ l2n(tout0, out);
+ l2n(tout1, out);
+ xor0 = tin0;
+ xor1 = tin1;
+ }
+ if (l != -8) {
+ n2l(in, tin0);
+ n2l(in, tin1);
+ tin[0] = tin0;
+ tin[1] = tin1;
+ BF_decrypt(tin, schedule);
+ tout0 = tin[0] ^ xor0;
+ tout1 = tin[1] ^ xor1;
+ l2nn(tout0, tout1, out, l + 8);
+ xor0 = tin0;
+ xor1 = tin1;
+ }
+ l2n(xor0, ivec);
+ l2n(xor1, ivec);
+ }
+ tin0 = tin1 = tout0 = tout1 = xor0 = xor1 = 0;
+ tin[0] = tin[1] = 0;
+}
diff --git a/openssl-1.1.0h/crypto/bf/bf_cfb64.c b/openssl-1.1.0h/crypto/bf/bf_cfb64.c
new file mode 100644
index 0000000..ce6e13b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bf/bf_cfb64.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/blowfish.h>
+#include "bf_locl.h"
+
+/*
+ * The input and output encrypted as though 64bit cfb mode is being used.
+ * The extra state information to record how much of the 64bit block we have
+ * used is contained in *num;
+ */
+
+void BF_cfb64_encrypt(const unsigned char *in, unsigned char *out,
+ long length, const BF_KEY *schedule,
+ unsigned char *ivec, int *num, int encrypt)
+{
+ register BF_LONG v0, v1, t;
+ register int n = *num;
+ register long l = length;
+ BF_LONG ti[2];
+ unsigned char *iv, c, cc;
+
+ iv = (unsigned char *)ivec;
+ if (encrypt) {
+ while (l--) {
+ if (n == 0) {
+ n2l(iv, v0);
+ ti[0] = v0;
+ n2l(iv, v1);
+ ti[1] = v1;
+ BF_encrypt((BF_LONG *)ti, schedule);
+ iv = (unsigned char *)ivec;
+ t = ti[0];
+ l2n(t, iv);
+ t = ti[1];
+ l2n(t, iv);
+ iv = (unsigned char *)ivec;
+ }
+ c = *(in++) ^ iv[n];
+ *(out++) = c;
+ iv[n] = c;
+ n = (n + 1) & 0x07;
+ }
+ } else {
+ while (l--) {
+ if (n == 0) {
+ n2l(iv, v0);
+ ti[0] = v0;
+ n2l(iv, v1);
+ ti[1] = v1;
+ BF_encrypt((BF_LONG *)ti, schedule);
+ iv = (unsigned char *)ivec;
+ t = ti[0];
+ l2n(t, iv);
+ t = ti[1];
+ l2n(t, iv);
+ iv = (unsigned char *)ivec;
+ }
+ cc = *(in++);
+ c = iv[n];
+ iv[n] = cc;
+ *(out++) = c ^ cc;
+ n = (n + 1) & 0x07;
+ }
+ }
+ v0 = v1 = ti[0] = ti[1] = t = c = cc = 0;
+ *num = n;
+}
diff --git a/openssl-1.1.0h/crypto/bf/bf_ecb.c b/openssl-1.1.0h/crypto/bf/bf_ecb.c
new file mode 100644
index 0000000..aa73540
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bf/bf_ecb.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/blowfish.h>
+#include "bf_locl.h"
+#include <openssl/opensslv.h>
+
+/*
+ * Blowfish as implemented from 'Blowfish: Springer-Verlag paper' (From
+ * LECTURE NOTES IN COMPUTER SCIENCE 809, FAST SOFTWARE ENCRYPTION, CAMBRIDGE
+ * SECURITY WORKSHOP, CAMBRIDGE, U.K., DECEMBER 9-11, 1993)
+ */
+
+const char *BF_options(void)
+{
+ return ("blowfish(ptr)");
+}
+
+void BF_ecb_encrypt(const unsigned char *in, unsigned char *out,
+ const BF_KEY *key, int encrypt)
+{
+ BF_LONG l, d[2];
+
+ n2l(in, l);
+ d[0] = l;
+ n2l(in, l);
+ d[1] = l;
+ if (encrypt)
+ BF_encrypt(d, key);
+ else
+ BF_decrypt(d, key);
+ l = d[0];
+ l2n(l, out);
+ l = d[1];
+ l2n(l, out);
+ l = d[0] = d[1] = 0;
+}
diff --git a/openssl-1.1.0h/crypto/bf/bf_enc.c b/openssl-1.1.0h/crypto/bf/bf_enc.c
new file mode 100644
index 0000000..9f80c56
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bf/bf_enc.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/blowfish.h>
+#include "bf_locl.h"
+
+/*
+ * Blowfish as implemented from 'Blowfish: Springer-Verlag paper' (From
+ * LECTURE NOTES IN COMPUTER SCIENCE 809, FAST SOFTWARE ENCRYPTION, CAMBRIDGE
+ * SECURITY WORKSHOP, CAMBRIDGE, U.K., DECEMBER 9-11, 1993)
+ */
+
+#if (BF_ROUNDS != 16) && (BF_ROUNDS != 20)
+# error If you set BF_ROUNDS to some value other than 16 or 20, you will have \
+to modify the code.
+#endif
+
+void BF_encrypt(BF_LONG *data, const BF_KEY *key)
+{
+ register BF_LONG l, r;
+ register const BF_LONG *p, *s;
+
+ p = key->P;
+ s = &(key->S[0]);
+ l = data[0];
+ r = data[1];
+
+ l ^= p[0];
+ BF_ENC(r, l, s, p[1]);
+ BF_ENC(l, r, s, p[2]);
+ BF_ENC(r, l, s, p[3]);
+ BF_ENC(l, r, s, p[4]);
+ BF_ENC(r, l, s, p[5]);
+ BF_ENC(l, r, s, p[6]);
+ BF_ENC(r, l, s, p[7]);
+ BF_ENC(l, r, s, p[8]);
+ BF_ENC(r, l, s, p[9]);
+ BF_ENC(l, r, s, p[10]);
+ BF_ENC(r, l, s, p[11]);
+ BF_ENC(l, r, s, p[12]);
+ BF_ENC(r, l, s, p[13]);
+ BF_ENC(l, r, s, p[14]);
+ BF_ENC(r, l, s, p[15]);
+ BF_ENC(l, r, s, p[16]);
+# if BF_ROUNDS == 20
+ BF_ENC(r, l, s, p[17]);
+ BF_ENC(l, r, s, p[18]);
+ BF_ENC(r, l, s, p[19]);
+ BF_ENC(l, r, s, p[20]);
+# endif
+ r ^= p[BF_ROUNDS + 1];
+
+ data[1] = l & 0xffffffffU;
+ data[0] = r & 0xffffffffU;
+}
+
+#ifndef BF_DEFAULT_OPTIONS
+
+void BF_decrypt(BF_LONG *data, const BF_KEY *key)
+{
+ register BF_LONG l, r;
+ register const BF_LONG *p, *s;
+
+ p = key->P;
+ s = &(key->S[0]);
+ l = data[0];
+ r = data[1];
+
+ l ^= p[BF_ROUNDS + 1];
+# if BF_ROUNDS == 20
+ BF_ENC(r, l, s, p[20]);
+ BF_ENC(l, r, s, p[19]);
+ BF_ENC(r, l, s, p[18]);
+ BF_ENC(l, r, s, p[17]);
+# endif
+ BF_ENC(r, l, s, p[16]);
+ BF_ENC(l, r, s, p[15]);
+ BF_ENC(r, l, s, p[14]);
+ BF_ENC(l, r, s, p[13]);
+ BF_ENC(r, l, s, p[12]);
+ BF_ENC(l, r, s, p[11]);
+ BF_ENC(r, l, s, p[10]);
+ BF_ENC(l, r, s, p[9]);
+ BF_ENC(r, l, s, p[8]);
+ BF_ENC(l, r, s, p[7]);
+ BF_ENC(r, l, s, p[6]);
+ BF_ENC(l, r, s, p[5]);
+ BF_ENC(r, l, s, p[4]);
+ BF_ENC(l, r, s, p[3]);
+ BF_ENC(r, l, s, p[2]);
+ BF_ENC(l, r, s, p[1]);
+ r ^= p[0];
+
+ data[1] = l & 0xffffffffU;
+ data[0] = r & 0xffffffffU;
+}
+
+void BF_cbc_encrypt(const unsigned char *in, unsigned char *out, long length,
+ const BF_KEY *schedule, unsigned char *ivec, int encrypt)
+{
+ register BF_LONG tin0, tin1;
+ register BF_LONG tout0, tout1, xor0, xor1;
+ register long l = length;
+ BF_LONG tin[2];
+
+ if (encrypt) {
+ n2l(ivec, tout0);
+ n2l(ivec, tout1);
+ ivec -= 8;
+ for (l -= 8; l >= 0; l -= 8) {
+ n2l(in, tin0);
+ n2l(in, tin1);
+ tin0 ^= tout0;
+ tin1 ^= tout1;
+ tin[0] = tin0;
+ tin[1] = tin1;
+ BF_encrypt(tin, schedule);
+ tout0 = tin[0];
+ tout1 = tin[1];
+ l2n(tout0, out);
+ l2n(tout1, out);
+ }
+ if (l != -8) {
+ n2ln(in, tin0, tin1, l + 8);
+ tin0 ^= tout0;
+ tin1 ^= tout1;
+ tin[0] = tin0;
+ tin[1] = tin1;
+ BF_encrypt(tin, schedule);
+ tout0 = tin[0];
+ tout1 = tin[1];
+ l2n(tout0, out);
+ l2n(tout1, out);
+ }
+ l2n(tout0, ivec);
+ l2n(tout1, ivec);
+ } else {
+ n2l(ivec, xor0);
+ n2l(ivec, xor1);
+ ivec -= 8;
+ for (l -= 8; l >= 0; l -= 8) {
+ n2l(in, tin0);
+ n2l(in, tin1);
+ tin[0] = tin0;
+ tin[1] = tin1;
+ BF_decrypt(tin, schedule);
+ tout0 = tin[0] ^ xor0;
+ tout1 = tin[1] ^ xor1;
+ l2n(tout0, out);
+ l2n(tout1, out);
+ xor0 = tin0;
+ xor1 = tin1;
+ }
+ if (l != -8) {
+ n2l(in, tin0);
+ n2l(in, tin1);
+ tin[0] = tin0;
+ tin[1] = tin1;
+ BF_decrypt(tin, schedule);
+ tout0 = tin[0] ^ xor0;
+ tout1 = tin[1] ^ xor1;
+ l2nn(tout0, tout1, out, l + 8);
+ xor0 = tin0;
+ xor1 = tin1;
+ }
+ l2n(xor0, ivec);
+ l2n(xor1, ivec);
+ }
+ tin0 = tin1 = tout0 = tout1 = xor0 = xor1 = 0;
+ tin[0] = tin[1] = 0;
+}
+
+#endif
diff --git a/openssl-1.1.0h/crypto/bf/bf_locl.h b/openssl-1.1.0h/crypto/bf/bf_locl.h
new file mode 100644
index 0000000..b1a415e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bf/bf_locl.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#ifndef HEADER_BF_LOCL_H
+# define HEADER_BF_LOCL_H
+# include <openssl/opensslconf.h>
+
+/* NOTE - c is not incremented as per n2l */
+# define n2ln(c,l1,l2,n) { \
+ c+=n; \
+ l1=l2=0; \
+ switch (n) { \
+ case 8: l2 =((unsigned long)(*(--(c)))) ; \
+ /* fall thru */ \
+ case 7: l2|=((unsigned long)(*(--(c))))<< 8; \
+ /* fall thru */ \
+ case 6: l2|=((unsigned long)(*(--(c))))<<16; \
+ /* fall thru */ \
+ case 5: l2|=((unsigned long)(*(--(c))))<<24; \
+ /* fall thru */ \
+ case 4: l1 =((unsigned long)(*(--(c)))) ; \
+ /* fall thru */ \
+ case 3: l1|=((unsigned long)(*(--(c))))<< 8; \
+ /* fall thru */ \
+ case 2: l1|=((unsigned long)(*(--(c))))<<16; \
+ /* fall thru */ \
+ case 1: l1|=((unsigned long)(*(--(c))))<<24; \
+ } \
+ }
+
+/* NOTE - c is not incremented as per l2n */
+# define l2nn(l1,l2,c,n) { \
+ c+=n; \
+ switch (n) { \
+ case 8: *(--(c))=(unsigned char)(((l2) )&0xff); \
+ /* fall thru */ \
+ case 7: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \
+ /* fall thru */ \
+ case 6: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \
+ /* fall thru */ \
+ case 5: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \
+ /* fall thru */ \
+ case 4: *(--(c))=(unsigned char)(((l1) )&0xff); \
+ /* fall thru */ \
+ case 3: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \
+ /* fall thru */ \
+ case 2: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \
+ /* fall thru */ \
+ case 1: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \
+ } \
+ }
+
+# undef n2l
+# define n2l(c,l) (l =((unsigned long)(*((c)++)))<<24L, \
+ l|=((unsigned long)(*((c)++)))<<16L, \
+ l|=((unsigned long)(*((c)++)))<< 8L, \
+ l|=((unsigned long)(*((c)++))))
+
+# undef l2n
+# define l2n(l,c) (*((c)++)=(unsigned char)(((l)>>24L)&0xff), \
+ *((c)++)=(unsigned char)(((l)>>16L)&0xff), \
+ *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \
+ *((c)++)=(unsigned char)(((l) )&0xff))
+
+/*
+ * This is actually a big endian algorithm, the most significant byte is used
+ * to lookup array 0
+ */
+
+# define BF_ENC(LL,R,S,P) ( \
+ LL^=P, \
+ LL^=((( S[ ((R>>24)&0xff)] + \
+ S[0x0100+((R>>16)&0xff)])^ \
+ S[0x0200+((R>> 8)&0xff)])+ \
+ S[0x0300+((R )&0xff)])&0xffffffffU \
+ )
+
+#endif
diff --git a/openssl-1.1.0h/crypto/bf/bf_ofb64.c b/openssl-1.1.0h/crypto/bf/bf_ofb64.c
new file mode 100644
index 0000000..6418217
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bf/bf_ofb64.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/blowfish.h>
+#include "bf_locl.h"
+
+/*
+ * The input and output encrypted as though 64bit ofb mode is being used.
+ * The extra state information to record how much of the 64bit block we have
+ * used is contained in *num;
+ */
+void BF_ofb64_encrypt(const unsigned char *in, unsigned char *out,
+ long length, const BF_KEY *schedule,
+ unsigned char *ivec, int *num)
+{
+ register BF_LONG v0, v1, t;
+ register int n = *num;
+ register long l = length;
+ unsigned char d[8];
+ register char *dp;
+ BF_LONG ti[2];
+ unsigned char *iv;
+ int save = 0;
+
+ iv = (unsigned char *)ivec;
+ n2l(iv, v0);
+ n2l(iv, v1);
+ ti[0] = v0;
+ ti[1] = v1;
+ dp = (char *)d;
+ l2n(v0, dp);
+ l2n(v1, dp);
+ while (l--) {
+ if (n == 0) {
+ BF_encrypt((BF_LONG *)ti, schedule);
+ dp = (char *)d;
+ t = ti[0];
+ l2n(t, dp);
+ t = ti[1];
+ l2n(t, dp);
+ save++;
+ }
+ *(out++) = *(in++) ^ d[n];
+ n = (n + 1) & 0x07;
+ }
+ if (save) {
+ v0 = ti[0];
+ v1 = ti[1];
+ iv = (unsigned char *)ivec;
+ l2n(v0, iv);
+ l2n(v1, iv);
+ }
+ t = v0 = v1 = ti[0] = ti[1] = 0;
+ *num = n;
+}
diff --git a/openssl-1.1.0h/crypto/bf/bf_pi.h b/openssl-1.1.0h/crypto/bf/bf_pi.h
new file mode 100644
index 0000000..a054b03
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bf/bf_pi.h
@@ -0,0 +1,530 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+static const BF_KEY bf_init = {
+ {
+ 0x243f6a88L, 0x85a308d3L, 0x13198a2eL, 0x03707344L,
+ 0xa4093822L, 0x299f31d0L, 0x082efa98L, 0xec4e6c89L,
+ 0x452821e6L, 0x38d01377L, 0xbe5466cfL, 0x34e90c6cL,
+ 0xc0ac29b7L, 0xc97c50ddL, 0x3f84d5b5L, 0xb5470917L,
+ 0x9216d5d9L, 0x8979fb1b}, {
+ 0xd1310ba6L, 0x98dfb5acL, 0x2ffd72dbL,
+ 0xd01adfb7L,
+ 0xb8e1afedL, 0x6a267e96L, 0xba7c9045L,
+ 0xf12c7f99L,
+ 0x24a19947L, 0xb3916cf7L, 0x0801f2e2L,
+ 0x858efc16L,
+ 0x636920d8L, 0x71574e69L, 0xa458fea3L,
+ 0xf4933d7eL,
+ 0x0d95748fL, 0x728eb658L, 0x718bcd58L,
+ 0x82154aeeL,
+ 0x7b54a41dL, 0xc25a59b5L, 0x9c30d539L,
+ 0x2af26013L,
+ 0xc5d1b023L, 0x286085f0L, 0xca417918L,
+ 0xb8db38efL,
+ 0x8e79dcb0L, 0x603a180eL, 0x6c9e0e8bL,
+ 0xb01e8a3eL,
+ 0xd71577c1L, 0xbd314b27L, 0x78af2fdaL,
+ 0x55605c60L,
+ 0xe65525f3L, 0xaa55ab94L, 0x57489862L,
+ 0x63e81440L,
+ 0x55ca396aL, 0x2aab10b6L, 0xb4cc5c34L,
+ 0x1141e8ceL,
+ 0xa15486afL, 0x7c72e993L, 0xb3ee1411L,
+ 0x636fbc2aL,
+ 0x2ba9c55dL, 0x741831f6L, 0xce5c3e16L,
+ 0x9b87931eL,
+ 0xafd6ba33L, 0x6c24cf5cL, 0x7a325381L,
+ 0x28958677L,
+ 0x3b8f4898L, 0x6b4bb9afL, 0xc4bfe81bL,
+ 0x66282193L,
+ 0x61d809ccL, 0xfb21a991L, 0x487cac60L,
+ 0x5dec8032L,
+ 0xef845d5dL, 0xe98575b1L, 0xdc262302L,
+ 0xeb651b88L,
+ 0x23893e81L, 0xd396acc5L, 0x0f6d6ff3L,
+ 0x83f44239L,
+ 0x2e0b4482L, 0xa4842004L, 0x69c8f04aL,
+ 0x9e1f9b5eL,
+ 0x21c66842L, 0xf6e96c9aL, 0x670c9c61L,
+ 0xabd388f0L,
+ 0x6a51a0d2L, 0xd8542f68L, 0x960fa728L,
+ 0xab5133a3L,
+ 0x6eef0b6cL, 0x137a3be4L, 0xba3bf050L,
+ 0x7efb2a98L,
+ 0xa1f1651dL, 0x39af0176L, 0x66ca593eL,
+ 0x82430e88L,
+ 0x8cee8619L, 0x456f9fb4L, 0x7d84a5c3L,
+ 0x3b8b5ebeL,
+ 0xe06f75d8L, 0x85c12073L, 0x401a449fL,
+ 0x56c16aa6L,
+ 0x4ed3aa62L, 0x363f7706L, 0x1bfedf72L,
+ 0x429b023dL,
+ 0x37d0d724L, 0xd00a1248L, 0xdb0fead3L,
+ 0x49f1c09bL,
+ 0x075372c9L, 0x80991b7bL, 0x25d479d8L,
+ 0xf6e8def7L,
+ 0xe3fe501aL, 0xb6794c3bL, 0x976ce0bdL,
+ 0x04c006baL,
+ 0xc1a94fb6L, 0x409f60c4L, 0x5e5c9ec2L,
+ 0x196a2463L,
+ 0x68fb6fafL, 0x3e6c53b5L, 0x1339b2ebL,
+ 0x3b52ec6fL,
+ 0x6dfc511fL, 0x9b30952cL, 0xcc814544L,
+ 0xaf5ebd09L,
+ 0xbee3d004L, 0xde334afdL, 0x660f2807L,
+ 0x192e4bb3L,
+ 0xc0cba857L, 0x45c8740fL, 0xd20b5f39L,
+ 0xb9d3fbdbL,
+ 0x5579c0bdL, 0x1a60320aL, 0xd6a100c6L,
+ 0x402c7279L,
+ 0x679f25feL, 0xfb1fa3ccL, 0x8ea5e9f8L,
+ 0xdb3222f8L,
+ 0x3c7516dfL, 0xfd616b15L, 0x2f501ec8L,
+ 0xad0552abL,
+ 0x323db5faL, 0xfd238760L, 0x53317b48L,
+ 0x3e00df82L,
+ 0x9e5c57bbL, 0xca6f8ca0L, 0x1a87562eL,
+ 0xdf1769dbL,
+ 0xd542a8f6L, 0x287effc3L, 0xac6732c6L,
+ 0x8c4f5573L,
+ 0x695b27b0L, 0xbbca58c8L, 0xe1ffa35dL,
+ 0xb8f011a0L,
+ 0x10fa3d98L, 0xfd2183b8L, 0x4afcb56cL,
+ 0x2dd1d35bL,
+ 0x9a53e479L, 0xb6f84565L, 0xd28e49bcL,
+ 0x4bfb9790L,
+ 0xe1ddf2daL, 0xa4cb7e33L, 0x62fb1341L,
+ 0xcee4c6e8L,
+ 0xef20cadaL, 0x36774c01L, 0xd07e9efeL,
+ 0x2bf11fb4L,
+ 0x95dbda4dL, 0xae909198L, 0xeaad8e71L,
+ 0x6b93d5a0L,
+ 0xd08ed1d0L, 0xafc725e0L, 0x8e3c5b2fL,
+ 0x8e7594b7L,
+ 0x8ff6e2fbL, 0xf2122b64L, 0x8888b812L,
+ 0x900df01cL,
+ 0x4fad5ea0L, 0x688fc31cL, 0xd1cff191L,
+ 0xb3a8c1adL,
+ 0x2f2f2218L, 0xbe0e1777L, 0xea752dfeL,
+ 0x8b021fa1L,
+ 0xe5a0cc0fL, 0xb56f74e8L, 0x18acf3d6L,
+ 0xce89e299L,
+ 0xb4a84fe0L, 0xfd13e0b7L, 0x7cc43b81L,
+ 0xd2ada8d9L,
+ 0x165fa266L, 0x80957705L, 0x93cc7314L,
+ 0x211a1477L,
+ 0xe6ad2065L, 0x77b5fa86L, 0xc75442f5L,
+ 0xfb9d35cfL,
+ 0xebcdaf0cL, 0x7b3e89a0L, 0xd6411bd3L,
+ 0xae1e7e49L,
+ 0x00250e2dL, 0x2071b35eL, 0x226800bbL,
+ 0x57b8e0afL,
+ 0x2464369bL, 0xf009b91eL, 0x5563911dL,
+ 0x59dfa6aaL,
+ 0x78c14389L, 0xd95a537fL, 0x207d5ba2L,
+ 0x02e5b9c5L,
+ 0x83260376L, 0x6295cfa9L, 0x11c81968L,
+ 0x4e734a41L,
+ 0xb3472dcaL, 0x7b14a94aL, 0x1b510052L,
+ 0x9a532915L,
+ 0xd60f573fL, 0xbc9bc6e4L, 0x2b60a476L,
+ 0x81e67400L,
+ 0x08ba6fb5L, 0x571be91fL, 0xf296ec6bL,
+ 0x2a0dd915L,
+ 0xb6636521L, 0xe7b9f9b6L, 0xff34052eL,
+ 0xc5855664L,
+ 0x53b02d5dL, 0xa99f8fa1L, 0x08ba4799L,
+ 0x6e85076aL,
+ 0x4b7a70e9L, 0xb5b32944L, 0xdb75092eL,
+ 0xc4192623L,
+ 0xad6ea6b0L, 0x49a7df7dL, 0x9cee60b8L,
+ 0x8fedb266L,
+ 0xecaa8c71L, 0x699a17ffL, 0x5664526cL,
+ 0xc2b19ee1L,
+ 0x193602a5L, 0x75094c29L, 0xa0591340L,
+ 0xe4183a3eL,
+ 0x3f54989aL, 0x5b429d65L, 0x6b8fe4d6L,
+ 0x99f73fd6L,
+ 0xa1d29c07L, 0xefe830f5L, 0x4d2d38e6L,
+ 0xf0255dc1L,
+ 0x4cdd2086L, 0x8470eb26L, 0x6382e9c6L,
+ 0x021ecc5eL,
+ 0x09686b3fL, 0x3ebaefc9L, 0x3c971814L,
+ 0x6b6a70a1L,
+ 0x687f3584L, 0x52a0e286L, 0xb79c5305L,
+ 0xaa500737L,
+ 0x3e07841cL, 0x7fdeae5cL, 0x8e7d44ecL,
+ 0x5716f2b8L,
+ 0xb03ada37L, 0xf0500c0dL, 0xf01c1f04L,
+ 0x0200b3ffL,
+ 0xae0cf51aL, 0x3cb574b2L, 0x25837a58L,
+ 0xdc0921bdL,
+ 0xd19113f9L, 0x7ca92ff6L, 0x94324773L,
+ 0x22f54701L,
+ 0x3ae5e581L, 0x37c2dadcL, 0xc8b57634L,
+ 0x9af3dda7L,
+ 0xa9446146L, 0x0fd0030eL, 0xecc8c73eL,
+ 0xa4751e41L,
+ 0xe238cd99L, 0x3bea0e2fL, 0x3280bba1L,
+ 0x183eb331L,
+ 0x4e548b38L, 0x4f6db908L, 0x6f420d03L,
+ 0xf60a04bfL,
+ 0x2cb81290L, 0x24977c79L, 0x5679b072L,
+ 0xbcaf89afL,
+ 0xde9a771fL, 0xd9930810L, 0xb38bae12L,
+ 0xdccf3f2eL,
+ 0x5512721fL, 0x2e6b7124L, 0x501adde6L,
+ 0x9f84cd87L,
+ 0x7a584718L, 0x7408da17L, 0xbc9f9abcL,
+ 0xe94b7d8cL,
+ 0xec7aec3aL, 0xdb851dfaL, 0x63094366L,
+ 0xc464c3d2L,
+ 0xef1c1847L, 0x3215d908L, 0xdd433b37L,
+ 0x24c2ba16L,
+ 0x12a14d43L, 0x2a65c451L, 0x50940002L,
+ 0x133ae4ddL,
+ 0x71dff89eL, 0x10314e55L, 0x81ac77d6L,
+ 0x5f11199bL,
+ 0x043556f1L, 0xd7a3c76bL, 0x3c11183bL,
+ 0x5924a509L,
+ 0xf28fe6edL, 0x97f1fbfaL, 0x9ebabf2cL,
+ 0x1e153c6eL,
+ 0x86e34570L, 0xeae96fb1L, 0x860e5e0aL,
+ 0x5a3e2ab3L,
+ 0x771fe71cL, 0x4e3d06faL, 0x2965dcb9L,
+ 0x99e71d0fL,
+ 0x803e89d6L, 0x5266c825L, 0x2e4cc978L,
+ 0x9c10b36aL,
+ 0xc6150ebaL, 0x94e2ea78L, 0xa5fc3c53L,
+ 0x1e0a2df4L,
+ 0xf2f74ea7L, 0x361d2b3dL, 0x1939260fL,
+ 0x19c27960L,
+ 0x5223a708L, 0xf71312b6L, 0xebadfe6eL,
+ 0xeac31f66L,
+ 0xe3bc4595L, 0xa67bc883L, 0xb17f37d1L,
+ 0x018cff28L,
+ 0xc332ddefL, 0xbe6c5aa5L, 0x65582185L,
+ 0x68ab9802L,
+ 0xeecea50fL, 0xdb2f953bL, 0x2aef7dadL,
+ 0x5b6e2f84L,
+ 0x1521b628L, 0x29076170L, 0xecdd4775L,
+ 0x619f1510L,
+ 0x13cca830L, 0xeb61bd96L, 0x0334fe1eL,
+ 0xaa0363cfL,
+ 0xb5735c90L, 0x4c70a239L, 0xd59e9e0bL,
+ 0xcbaade14L,
+ 0xeecc86bcL, 0x60622ca7L, 0x9cab5cabL,
+ 0xb2f3846eL,
+ 0x648b1eafL, 0x19bdf0caL, 0xa02369b9L,
+ 0x655abb50L,
+ 0x40685a32L, 0x3c2ab4b3L, 0x319ee9d5L,
+ 0xc021b8f7L,
+ 0x9b540b19L, 0x875fa099L, 0x95f7997eL,
+ 0x623d7da8L,
+ 0xf837889aL, 0x97e32d77L, 0x11ed935fL,
+ 0x16681281L,
+ 0x0e358829L, 0xc7e61fd6L, 0x96dedfa1L,
+ 0x7858ba99L,
+ 0x57f584a5L, 0x1b227263L, 0x9b83c3ffL,
+ 0x1ac24696L,
+ 0xcdb30aebL, 0x532e3054L, 0x8fd948e4L,
+ 0x6dbc3128L,
+ 0x58ebf2efL, 0x34c6ffeaL, 0xfe28ed61L,
+ 0xee7c3c73L,
+ 0x5d4a14d9L, 0xe864b7e3L, 0x42105d14L,
+ 0x203e13e0L,
+ 0x45eee2b6L, 0xa3aaabeaL, 0xdb6c4f15L,
+ 0xfacb4fd0L,
+ 0xc742f442L, 0xef6abbb5L, 0x654f3b1dL,
+ 0x41cd2105L,
+ 0xd81e799eL, 0x86854dc7L, 0xe44b476aL,
+ 0x3d816250L,
+ 0xcf62a1f2L, 0x5b8d2646L, 0xfc8883a0L,
+ 0xc1c7b6a3L,
+ 0x7f1524c3L, 0x69cb7492L, 0x47848a0bL,
+ 0x5692b285L,
+ 0x095bbf00L, 0xad19489dL, 0x1462b174L,
+ 0x23820e00L,
+ 0x58428d2aL, 0x0c55f5eaL, 0x1dadf43eL,
+ 0x233f7061L,
+ 0x3372f092L, 0x8d937e41L, 0xd65fecf1L,
+ 0x6c223bdbL,
+ 0x7cde3759L, 0xcbee7460L, 0x4085f2a7L,
+ 0xce77326eL,
+ 0xa6078084L, 0x19f8509eL, 0xe8efd855L,
+ 0x61d99735L,
+ 0xa969a7aaL, 0xc50c06c2L, 0x5a04abfcL,
+ 0x800bcadcL,
+ 0x9e447a2eL, 0xc3453484L, 0xfdd56705L,
+ 0x0e1e9ec9L,
+ 0xdb73dbd3L, 0x105588cdL, 0x675fda79L,
+ 0xe3674340L,
+ 0xc5c43465L, 0x713e38d8L, 0x3d28f89eL,
+ 0xf16dff20L,
+ 0x153e21e7L, 0x8fb03d4aL, 0xe6e39f2bL,
+ 0xdb83adf7L,
+ 0xe93d5a68L, 0x948140f7L, 0xf64c261cL,
+ 0x94692934L,
+ 0x411520f7L, 0x7602d4f7L, 0xbcf46b2eL,
+ 0xd4a20068L,
+ 0xd4082471L, 0x3320f46aL, 0x43b7d4b7L,
+ 0x500061afL,
+ 0x1e39f62eL, 0x97244546L, 0x14214f74L,
+ 0xbf8b8840L,
+ 0x4d95fc1dL, 0x96b591afL, 0x70f4ddd3L,
+ 0x66a02f45L,
+ 0xbfbc09ecL, 0x03bd9785L, 0x7fac6dd0L,
+ 0x31cb8504L,
+ 0x96eb27b3L, 0x55fd3941L, 0xda2547e6L,
+ 0xabca0a9aL,
+ 0x28507825L, 0x530429f4L, 0x0a2c86daL,
+ 0xe9b66dfbL,
+ 0x68dc1462L, 0xd7486900L, 0x680ec0a4L,
+ 0x27a18deeL,
+ 0x4f3ffea2L, 0xe887ad8cL, 0xb58ce006L,
+ 0x7af4d6b6L,
+ 0xaace1e7cL, 0xd3375fecL, 0xce78a399L,
+ 0x406b2a42L,
+ 0x20fe9e35L, 0xd9f385b9L, 0xee39d7abL,
+ 0x3b124e8bL,
+ 0x1dc9faf7L, 0x4b6d1856L, 0x26a36631L,
+ 0xeae397b2L,
+ 0x3a6efa74L, 0xdd5b4332L, 0x6841e7f7L,
+ 0xca7820fbL,
+ 0xfb0af54eL, 0xd8feb397L, 0x454056acL,
+ 0xba489527L,
+ 0x55533a3aL, 0x20838d87L, 0xfe6ba9b7L,
+ 0xd096954bL,
+ 0x55a867bcL, 0xa1159a58L, 0xcca92963L,
+ 0x99e1db33L,
+ 0xa62a4a56L, 0x3f3125f9L, 0x5ef47e1cL,
+ 0x9029317cL,
+ 0xfdf8e802L, 0x04272f70L, 0x80bb155cL,
+ 0x05282ce3L,
+ 0x95c11548L, 0xe4c66d22L, 0x48c1133fL,
+ 0xc70f86dcL,
+ 0x07f9c9eeL, 0x41041f0fL, 0x404779a4L,
+ 0x5d886e17L,
+ 0x325f51ebL, 0xd59bc0d1L, 0xf2bcc18fL,
+ 0x41113564L,
+ 0x257b7834L, 0x602a9c60L, 0xdff8e8a3L,
+ 0x1f636c1bL,
+ 0x0e12b4c2L, 0x02e1329eL, 0xaf664fd1L,
+ 0xcad18115L,
+ 0x6b2395e0L, 0x333e92e1L, 0x3b240b62L,
+ 0xeebeb922L,
+ 0x85b2a20eL, 0xe6ba0d99L, 0xde720c8cL,
+ 0x2da2f728L,
+ 0xd0127845L, 0x95b794fdL, 0x647d0862L,
+ 0xe7ccf5f0L,
+ 0x5449a36fL, 0x877d48faL, 0xc39dfd27L,
+ 0xf33e8d1eL,
+ 0x0a476341L, 0x992eff74L, 0x3a6f6eabL,
+ 0xf4f8fd37L,
+ 0xa812dc60L, 0xa1ebddf8L, 0x991be14cL,
+ 0xdb6e6b0dL,
+ 0xc67b5510L, 0x6d672c37L, 0x2765d43bL,
+ 0xdcd0e804L,
+ 0xf1290dc7L, 0xcc00ffa3L, 0xb5390f92L,
+ 0x690fed0bL,
+ 0x667b9ffbL, 0xcedb7d9cL, 0xa091cf0bL,
+ 0xd9155ea3L,
+ 0xbb132f88L, 0x515bad24L, 0x7b9479bfL,
+ 0x763bd6ebL,
+ 0x37392eb3L, 0xcc115979L, 0x8026e297L,
+ 0xf42e312dL,
+ 0x6842ada7L, 0xc66a2b3bL, 0x12754cccL,
+ 0x782ef11cL,
+ 0x6a124237L, 0xb79251e7L, 0x06a1bbe6L,
+ 0x4bfb6350L,
+ 0x1a6b1018L, 0x11caedfaL, 0x3d25bdd8L,
+ 0xe2e1c3c9L,
+ 0x44421659L, 0x0a121386L, 0xd90cec6eL,
+ 0xd5abea2aL,
+ 0x64af674eL, 0xda86a85fL, 0xbebfe988L,
+ 0x64e4c3feL,
+ 0x9dbc8057L, 0xf0f7c086L, 0x60787bf8L,
+ 0x6003604dL,
+ 0xd1fd8346L, 0xf6381fb0L, 0x7745ae04L,
+ 0xd736fcccL,
+ 0x83426b33L, 0xf01eab71L, 0xb0804187L,
+ 0x3c005e5fL,
+ 0x77a057beL, 0xbde8ae24L, 0x55464299L,
+ 0xbf582e61L,
+ 0x4e58f48fL, 0xf2ddfda2L, 0xf474ef38L,
+ 0x8789bdc2L,
+ 0x5366f9c3L, 0xc8b38e74L, 0xb475f255L,
+ 0x46fcd9b9L,
+ 0x7aeb2661L, 0x8b1ddf84L, 0x846a0e79L,
+ 0x915f95e2L,
+ 0x466e598eL, 0x20b45770L, 0x8cd55591L,
+ 0xc902de4cL,
+ 0xb90bace1L, 0xbb8205d0L, 0x11a86248L,
+ 0x7574a99eL,
+ 0xb77f19b6L, 0xe0a9dc09L, 0x662d09a1L,
+ 0xc4324633L,
+ 0xe85a1f02L, 0x09f0be8cL, 0x4a99a025L,
+ 0x1d6efe10L,
+ 0x1ab93d1dL, 0x0ba5a4dfL, 0xa186f20fL,
+ 0x2868f169L,
+ 0xdcb7da83L, 0x573906feL, 0xa1e2ce9bL,
+ 0x4fcd7f52L,
+ 0x50115e01L, 0xa70683faL, 0xa002b5c4L,
+ 0x0de6d027L,
+ 0x9af88c27L, 0x773f8641L, 0xc3604c06L,
+ 0x61a806b5L,
+ 0xf0177a28L, 0xc0f586e0L, 0x006058aaL,
+ 0x30dc7d62L,
+ 0x11e69ed7L, 0x2338ea63L, 0x53c2dd94L,
+ 0xc2c21634L,
+ 0xbbcbee56L, 0x90bcb6deL, 0xebfc7da1L,
+ 0xce591d76L,
+ 0x6f05e409L, 0x4b7c0188L, 0x39720a3dL,
+ 0x7c927c24L,
+ 0x86e3725fL, 0x724d9db9L, 0x1ac15bb4L,
+ 0xd39eb8fcL,
+ 0xed545578L, 0x08fca5b5L, 0xd83d7cd3L,
+ 0x4dad0fc4L,
+ 0x1e50ef5eL, 0xb161e6f8L, 0xa28514d9L,
+ 0x6c51133cL,
+ 0x6fd5c7e7L, 0x56e14ec4L, 0x362abfceL,
+ 0xddc6c837L,
+ 0xd79a3234L, 0x92638212L, 0x670efa8eL,
+ 0x406000e0L,
+ 0x3a39ce37L, 0xd3faf5cfL, 0xabc27737L,
+ 0x5ac52d1bL,
+ 0x5cb0679eL, 0x4fa33742L, 0xd3822740L,
+ 0x99bc9bbeL,
+ 0xd5118e9dL, 0xbf0f7315L, 0xd62d1c7eL,
+ 0xc700c47bL,
+ 0xb78c1b6bL, 0x21a19045L, 0xb26eb1beL,
+ 0x6a366eb4L,
+ 0x5748ab2fL, 0xbc946e79L, 0xc6a376d2L,
+ 0x6549c2c8L,
+ 0x530ff8eeL, 0x468dde7dL, 0xd5730a1dL,
+ 0x4cd04dc6L,
+ 0x2939bbdbL, 0xa9ba4650L, 0xac9526e8L,
+ 0xbe5ee304L,
+ 0xa1fad5f0L, 0x6a2d519aL, 0x63ef8ce2L,
+ 0x9a86ee22L,
+ 0xc089c2b8L, 0x43242ef6L, 0xa51e03aaL,
+ 0x9cf2d0a4L,
+ 0x83c061baL, 0x9be96a4dL, 0x8fe51550L,
+ 0xba645bd6L,
+ 0x2826a2f9L, 0xa73a3ae1L, 0x4ba99586L,
+ 0xef5562e9L,
+ 0xc72fefd3L, 0xf752f7daL, 0x3f046f69L,
+ 0x77fa0a59L,
+ 0x80e4a915L, 0x87b08601L, 0x9b09e6adL,
+ 0x3b3ee593L,
+ 0xe990fd5aL, 0x9e34d797L, 0x2cf0b7d9L,
+ 0x022b8b51L,
+ 0x96d5ac3aL, 0x017da67dL, 0xd1cf3ed6L,
+ 0x7c7d2d28L,
+ 0x1f9f25cfL, 0xadf2b89bL, 0x5ad6b472L,
+ 0x5a88f54cL,
+ 0xe029ac71L, 0xe019a5e6L, 0x47b0acfdL,
+ 0xed93fa9bL,
+ 0xe8d3c48dL, 0x283b57ccL, 0xf8d56629L,
+ 0x79132e28L,
+ 0x785f0191L, 0xed756055L, 0xf7960e44L,
+ 0xe3d35e8cL,
+ 0x15056dd4L, 0x88f46dbaL, 0x03a16125L,
+ 0x0564f0bdL,
+ 0xc3eb9e15L, 0x3c9057a2L, 0x97271aecL,
+ 0xa93a072aL,
+ 0x1b3f6d9bL, 0x1e6321f5L, 0xf59c66fbL,
+ 0x26dcf319L,
+ 0x7533d928L, 0xb155fdf5L, 0x03563482L,
+ 0x8aba3cbbL,
+ 0x28517711L, 0xc20ad9f8L, 0xabcc5167L,
+ 0xccad925fL,
+ 0x4de81751L, 0x3830dc8eL, 0x379d5862L,
+ 0x9320f991L,
+ 0xea7a90c2L, 0xfb3e7bceL, 0x5121ce64L,
+ 0x774fbe32L,
+ 0xa8b6e37eL, 0xc3293d46L, 0x48de5369L,
+ 0x6413e680L,
+ 0xa2ae0810L, 0xdd6db224L, 0x69852dfdL,
+ 0x09072166L,
+ 0xb39a460aL, 0x6445c0ddL, 0x586cdecfL,
+ 0x1c20c8aeL,
+ 0x5bbef7ddL, 0x1b588d40L, 0xccd2017fL,
+ 0x6bb4e3bbL,
+ 0xdda26a7eL, 0x3a59ff45L, 0x3e350a44L,
+ 0xbcb4cdd5L,
+ 0x72eacea8L, 0xfa6484bbL, 0x8d6612aeL,
+ 0xbf3c6f47L,
+ 0xd29be463L, 0x542f5d9eL, 0xaec2771bL,
+ 0xf64e6370L,
+ 0x740e0d8dL, 0xe75b1357L, 0xf8721671L,
+ 0xaf537d5dL,
+ 0x4040cb08L, 0x4eb4e2ccL, 0x34d2466aL,
+ 0x0115af84L,
+ 0xe1b00428L, 0x95983a1dL, 0x06b89fb4L,
+ 0xce6ea048L,
+ 0x6f3f3b82L, 0x3520ab82L, 0x011a1d4bL,
+ 0x277227f8L,
+ 0x611560b1L, 0xe7933fdcL, 0xbb3a792bL,
+ 0x344525bdL,
+ 0xa08839e1L, 0x51ce794bL, 0x2f32c9b7L,
+ 0xa01fbac9L,
+ 0xe01cc87eL, 0xbcc7d1f6L, 0xcf0111c3L,
+ 0xa1e8aac7L,
+ 0x1a908749L, 0xd44fbd9aL, 0xd0dadecbL,
+ 0xd50ada38L,
+ 0x0339c32aL, 0xc6913667L, 0x8df9317cL,
+ 0xe0b12b4fL,
+ 0xf79e59b7L, 0x43f5bb3aL, 0xf2d519ffL,
+ 0x27d9459cL,
+ 0xbf97222cL, 0x15e6fc2aL, 0x0f91fc71L,
+ 0x9b941525L,
+ 0xfae59361L, 0xceb69cebL, 0xc2a86459L,
+ 0x12baa8d1L,
+ 0xb6c1075eL, 0xe3056a0cL, 0x10d25065L,
+ 0xcb03a442L,
+ 0xe0ec6e0eL, 0x1698db3bL, 0x4c98a0beL,
+ 0x3278e964L,
+ 0x9f1f9532L, 0xe0d392dfL, 0xd3a0342bL,
+ 0x8971f21eL,
+ 0x1b0a7441L, 0x4ba3348cL, 0xc5be7120L,
+ 0xc37632d8L,
+ 0xdf359f8dL, 0x9b992f2eL, 0xe60b6f47L,
+ 0x0fe3f11dL,
+ 0xe54cda54L, 0x1edad891L, 0xce6279cfL,
+ 0xcd3e7e6fL,
+ 0x1618b166L, 0xfd2c1d05L, 0x848fd2c5L,
+ 0xf6fb2299L,
+ 0xf523f357L, 0xa6327623L, 0x93a83531L,
+ 0x56cccd02L,
+ 0xacf08162L, 0x5a75ebb5L, 0x6e163697L,
+ 0x88d273ccL,
+ 0xde966292L, 0x81b949d0L, 0x4c50901bL,
+ 0x71c65614L,
+ 0xe6c6c7bdL, 0x327a140aL, 0x45e1d006L,
+ 0xc3f27b9aL,
+ 0xc9aa53fdL, 0x62a80f00L, 0xbb25bfe2L,
+ 0x35bdd2f6L,
+ 0x71126905L, 0xb2040222L, 0xb6cbcf7cL,
+ 0xcd769c2bL,
+ 0x53113ec0L, 0x1640e3d3L, 0x38abbd60L,
+ 0x2547adf0L,
+ 0xba38209cL, 0xf746ce76L, 0x77afa1c5L,
+ 0x20756060L,
+ 0x85cbfe4eL, 0x8ae88dd8L, 0x7aaaf9b0L,
+ 0x4cf9aa7eL,
+ 0x1948c25cL, 0x02fb8a8cL, 0x01c36ae4L,
+ 0xd6ebe1f9L,
+ 0x90d4f869L, 0xa65cdea0L, 0x3f09252dL,
+ 0xc208e69fL,
+ 0xb74e6132L, 0xce77e25bL, 0x578fdfe3L,
+ 0x3ac372e6L,
+ }
+};
diff --git a/openssl-1.1.0h/crypto/bf/bf_skey.c b/openssl-1.1.0h/crypto/bf/bf_skey.c
new file mode 100644
index 0000000..a4903a2
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bf/bf_skey.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <openssl/blowfish.h>
+#include "bf_locl.h"
+#include "bf_pi.h"
+
+void BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+{
+ int i;
+ BF_LONG *p, ri, in[2];
+ const unsigned char *d, *end;
+
+ memcpy(key, &bf_init, sizeof(BF_KEY));
+ p = key->P;
+
+ if (len > ((BF_ROUNDS + 2) * 4))
+ len = (BF_ROUNDS + 2) * 4;
+
+ d = data;
+ end = &(data[len]);
+ for (i = 0; i < (BF_ROUNDS + 2); i++) {
+ ri = *(d++);
+ if (d >= end)
+ d = data;
+
+ ri <<= 8;
+ ri |= *(d++);
+ if (d >= end)
+ d = data;
+
+ ri <<= 8;
+ ri |= *(d++);
+ if (d >= end)
+ d = data;
+
+ ri <<= 8;
+ ri |= *(d++);
+ if (d >= end)
+ d = data;
+
+ p[i] ^= ri;
+ }
+
+ in[0] = 0L;
+ in[1] = 0L;
+ for (i = 0; i < (BF_ROUNDS + 2); i += 2) {
+ BF_encrypt(in, key);
+ p[i] = in[0];
+ p[i + 1] = in[1];
+ }
+
+ p = key->S;
+ for (i = 0; i < 4 * 256; i += 2) {
+ BF_encrypt(in, key);
+ p[i] = in[0];
+ p[i + 1] = in[1];
+ }
+}
diff --git a/openssl-1.1.0h/crypto/bf/build.info b/openssl-1.1.0h/crypto/bf/build.info
new file mode 100644
index 0000000..37a004e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bf/build.info
@@ -0,0 +1,6 @@
+LIBS=../../libcrypto
+SOURCE[../../libcrypto]=bf_skey.c bf_ecb.c bf_cfb64.c bf_ofb64.c \
+ {- $target{bf_asm_src} -}
+
+GENERATE[bf-586.s]=asm/bf-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(LIB_CFLAGS) $(PROCESSOR)
+DEPEND[bf-586.s]=../perlasm/x86asm.pl ../perlasm/cbc.pl
diff --git a/openssl-1.1.0h/crypto/bio/b_addr.c b/openssl-1.1.0h/crypto/bio/b_addr.c
new file mode 100644
index 0000000..aea843a
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/b_addr.c
@@ -0,0 +1,883 @@
+/*
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <string.h>
+
+#include "bio_lcl.h"
+#include <openssl/crypto.h>
+
+#ifndef OPENSSL_NO_SOCK
+#include <openssl/err.h>
+#include <openssl/buffer.h>
+#include <internal/thread_once.h>
+#include <ctype.h>
+
+CRYPTO_RWLOCK *bio_lookup_lock;
+static CRYPTO_ONCE bio_lookup_init = CRYPTO_ONCE_STATIC_INIT;
+
+/*
+ * Throughout this file and bio_lcl.h, the existence of the macro
+ * AI_PASSIVE is used to detect the availability of struct addrinfo,
+ * getnameinfo() and getaddrinfo(). If that macro doesn't exist,
+ * we use our own implementation instead, using gethostbyname,
+ * getservbyname and a few other.
+ */
+
+/**********************************************************************
+ *
+ * Address structure
+ *
+ */
+
+BIO_ADDR *BIO_ADDR_new(void)
+{
+ BIO_ADDR *ret = OPENSSL_zalloc(sizeof(*ret));
+
+ if (ret == NULL) {
+ BIOerr(BIO_F_BIO_ADDR_NEW, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ ret->sa.sa_family = AF_UNSPEC;
+ return ret;
+}
+
+void BIO_ADDR_free(BIO_ADDR *ap)
+{
+ OPENSSL_free(ap);
+}
+
+void BIO_ADDR_clear(BIO_ADDR *ap)
+{
+ memset(ap, 0, sizeof(*ap));
+ ap->sa.sa_family = AF_UNSPEC;
+}
+
+/*
+ * BIO_ADDR_make - non-public routine to fill a BIO_ADDR with the contents
+ * of a struct sockaddr.
+ */
+int BIO_ADDR_make(BIO_ADDR *ap, const struct sockaddr *sa)
+{
+ if (sa->sa_family == AF_INET) {
+ ap->s_in = *(const struct sockaddr_in *)sa;
+ return 1;
+ }
+#ifdef AF_INET6
+ if (sa->sa_family == AF_INET6) {
+ ap->s_in6 = *(const struct sockaddr_in6 *)sa;
+ return 1;
+ }
+#endif
+#ifdef AF_UNIX
+ if (sa->sa_family == AF_UNIX) {
+ ap->s_un = *(const struct sockaddr_un *)sa;
+ return 1;
+ }
+#endif
+
+ return 0;
+}
+
+int BIO_ADDR_rawmake(BIO_ADDR *ap, int family,
+ const void *where, size_t wherelen,
+ unsigned short port)
+{
+#ifdef AF_UNIX
+ if (family == AF_UNIX) {
+ if (wherelen + 1 > sizeof(ap->s_un.sun_path))
+ return 0;
+ memset(&ap->s_un, 0, sizeof(ap->s_un));
+ ap->s_un.sun_family = family;
+ strncpy(ap->s_un.sun_path, where, sizeof(ap->s_un.sun_path) - 1);
+ return 1;
+ }
+#endif
+ if (family == AF_INET) {
+ if (wherelen != sizeof(struct in_addr))
+ return 0;
+ memset(&ap->s_in, 0, sizeof(ap->s_in));
+ ap->s_in.sin_family = family;
+ ap->s_in.sin_port = port;
+ ap->s_in.sin_addr = *(struct in_addr *)where;
+ return 1;
+ }
+#ifdef AF_INET6
+ if (family == AF_INET6) {
+ if (wherelen != sizeof(struct in6_addr))
+ return 0;
+ memset(&ap->s_in6, 0, sizeof(ap->s_in6));
+ ap->s_in6.sin6_family = family;
+ ap->s_in6.sin6_port = port;
+ ap->s_in6.sin6_addr = *(struct in6_addr *)where;
+ return 1;
+ }
+#endif
+
+ return 0;
+}
+
+int BIO_ADDR_family(const BIO_ADDR *ap)
+{
+ return ap->sa.sa_family;
+}
+
+int BIO_ADDR_rawaddress(const BIO_ADDR *ap, void *p, size_t *l)
+{
+ size_t len = 0;
+ const void *addrptr = NULL;
+
+ if (ap->sa.sa_family == AF_INET) {
+ len = sizeof(ap->s_in.sin_addr);
+ addrptr = &ap->s_in.sin_addr;
+ }
+#ifdef AF_INET6
+ else if (ap->sa.sa_family == AF_INET6) {
+ len = sizeof(ap->s_in6.sin6_addr);
+ addrptr = &ap->s_in6.sin6_addr;
+ }
+#endif
+#ifdef AF_UNIX
+ else if (ap->sa.sa_family == AF_UNIX) {
+ len = strlen(ap->s_un.sun_path);
+ addrptr = &ap->s_un.sun_path;
+ }
+#endif
+
+ if (addrptr == NULL)
+ return 0;
+
+ if (p != NULL) {
+ memcpy(p, addrptr, len);
+ }
+ if (l != NULL)
+ *l = len;
+
+ return 1;
+}
+
+unsigned short BIO_ADDR_rawport(const BIO_ADDR *ap)
+{
+ if (ap->sa.sa_family == AF_INET)
+ return ap->s_in.sin_port;
+#ifdef AF_INET6
+ if (ap->sa.sa_family == AF_INET6)
+ return ap->s_in6.sin6_port;
+#endif
+ return 0;
+}
+
+/*-
+ * addr_strings - helper function to get host and service names
+ * @ap: the BIO_ADDR that has the input info
+ * @numeric: 0 if actual names should be returned, 1 if the numeric
+ * representation should be returned.
+ * @hostname: a pointer to a pointer to a memory area to store the
+ * host name or numeric representation. Unused if NULL.
+ * @service: a pointer to a pointer to a memory area to store the
+ * service name or numeric representation. Unused if NULL.
+ *
+ * The return value is 0 on failure, with the error code in the error
+ * stack, and 1 on success.
+ */
+static int addr_strings(const BIO_ADDR *ap, int numeric,
+ char **hostname, char **service)
+{
+ if (BIO_sock_init() != 1)
+ return 0;
+
+ if (1) {
+#ifdef AI_PASSIVE
+ int ret = 0;
+ char host[NI_MAXHOST] = "", serv[NI_MAXSERV] = "";
+ int flags = 0;
+
+ if (numeric)
+ flags |= NI_NUMERICHOST | NI_NUMERICSERV;
+
+ if ((ret = getnameinfo(BIO_ADDR_sockaddr(ap),
+ BIO_ADDR_sockaddr_size(ap),
+ host, sizeof(host), serv, sizeof(serv),
+ flags)) != 0) {
+# ifdef EAI_SYSTEM
+ if (ret == EAI_SYSTEM) {
+ SYSerr(SYS_F_GETNAMEINFO, get_last_socket_error());
+ BIOerr(BIO_F_ADDR_STRINGS, ERR_R_SYS_LIB);
+ } else
+# endif
+ {
+ BIOerr(BIO_F_ADDR_STRINGS, ERR_R_SYS_LIB);
+ ERR_add_error_data(1, gai_strerror(ret));
+ }
+ return 0;
+ }
+
+ /* VMS getnameinfo() has a bug, it doesn't fill in serv, which
+ * leaves it with whatever garbage that happens to be there.
+ * However, we initialise serv with the empty string (serv[0]
+ * is therefore NUL), so it gets real easy to detect when things
+ * didn't go the way one might expect.
+ */
+ if (serv[0] == '\0') {
+ BIO_snprintf(serv, sizeof(serv), "%d",
+ ntohs(BIO_ADDR_rawport(ap)));
+ }
+
+ if (hostname != NULL)
+ *hostname = OPENSSL_strdup(host);
+ if (service != NULL)
+ *service = OPENSSL_strdup(serv);
+ } else {
+#endif
+ if (hostname != NULL)
+ *hostname = OPENSSL_strdup(inet_ntoa(ap->s_in.sin_addr));
+ if (service != NULL) {
+ char serv[6]; /* port is 16 bits => max 5 decimal digits */
+ BIO_snprintf(serv, sizeof(serv), "%d", ntohs(ap->s_in.sin_port));
+ *service = OPENSSL_strdup(serv);
+ }
+ }
+
+ if ((hostname != NULL && *hostname == NULL)
+ || (service != NULL && *service == NULL)) {
+ if (hostname != NULL) {
+ OPENSSL_free(*hostname);
+ *hostname = NULL;
+ }
+ if (service != NULL) {
+ OPENSSL_free(*service);
+ *service = NULL;
+ }
+ BIOerr(BIO_F_ADDR_STRINGS, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+
+ return 1;
+}
+
+char *BIO_ADDR_hostname_string(const BIO_ADDR *ap, int numeric)
+{
+ char *hostname = NULL;
+
+ if (addr_strings(ap, numeric, &hostname, NULL))
+ return hostname;
+
+ return NULL;
+}
+
+char *BIO_ADDR_service_string(const BIO_ADDR *ap, int numeric)
+{
+ char *service = NULL;
+
+ if (addr_strings(ap, numeric, NULL, &service))
+ return service;
+
+ return NULL;
+}
+
+char *BIO_ADDR_path_string(const BIO_ADDR *ap)
+{
+#ifdef AF_UNIX
+ if (ap->sa.sa_family == AF_UNIX)
+ return OPENSSL_strdup(ap->s_un.sun_path);
+#endif
+ return NULL;
+}
+
+/*
+ * BIO_ADDR_sockaddr - non-public routine to return the struct sockaddr
+ * for a given BIO_ADDR. In reality, this is simply a type safe cast.
+ * The returned struct sockaddr is const, so it can't be tampered with.
+ */
+const struct sockaddr *BIO_ADDR_sockaddr(const BIO_ADDR *ap)
+{
+ return &(ap->sa);
+}
+
+/*
+ * BIO_ADDR_sockaddr_noconst - non-public function that does the same
+ * as BIO_ADDR_sockaddr, but returns a non-const. USE WITH CARE, as
+ * it allows you to tamper with the data (and thereby the contents
+ * of the input BIO_ADDR).
+ */
+struct sockaddr *BIO_ADDR_sockaddr_noconst(BIO_ADDR *ap)
+{
+ return &(ap->sa);
+}
+
+/*
+ * BIO_ADDR_sockaddr_size - non-public function that returns the size
+ * of the struct sockaddr the BIO_ADDR is using. If the protocol family
+ * isn't set or is something other than AF_INET, AF_INET6 or AF_UNIX,
+ * the size of the BIO_ADDR type is returned.
+ */
+socklen_t BIO_ADDR_sockaddr_size(const BIO_ADDR *ap)
+{
+ if (ap->sa.sa_family == AF_INET)
+ return sizeof(ap->s_in);
+#ifdef AF_INET6
+ if (ap->sa.sa_family == AF_INET6)
+ return sizeof(ap->s_in6);
+#endif
+#ifdef AF_UNIX
+ if (ap->sa.sa_family == AF_UNIX)
+ return sizeof(ap->s_un);
+#endif
+ return sizeof(*ap);
+}
+
+/**********************************************************************
+ *
+ * Address info database
+ *
+ */
+
+const BIO_ADDRINFO *BIO_ADDRINFO_next(const BIO_ADDRINFO *bai)
+{
+ if (bai != NULL)
+ return bai->bai_next;
+ return NULL;
+}
+
+int BIO_ADDRINFO_family(const BIO_ADDRINFO *bai)
+{
+ if (bai != NULL)
+ return bai->bai_family;
+ return 0;
+}
+
+int BIO_ADDRINFO_socktype(const BIO_ADDRINFO *bai)
+{
+ if (bai != NULL)
+ return bai->bai_socktype;
+ return 0;
+}
+
+int BIO_ADDRINFO_protocol(const BIO_ADDRINFO *bai)
+{
+ if (bai != NULL) {
+ if (bai->bai_protocol != 0)
+ return bai->bai_protocol;
+
+#ifdef AF_UNIX
+ if (bai->bai_family == AF_UNIX)
+ return 0;
+#endif
+
+ switch (bai->bai_socktype) {
+ case SOCK_STREAM:
+ return IPPROTO_TCP;
+ case SOCK_DGRAM:
+ return IPPROTO_UDP;
+ default:
+ break;
+ }
+ }
+ return 0;
+}
+
+/*
+ * BIO_ADDRINFO_sockaddr_size - non-public function that returns the size
+ * of the struct sockaddr inside the BIO_ADDRINFO.
+ */
+socklen_t BIO_ADDRINFO_sockaddr_size(const BIO_ADDRINFO *bai)
+{
+ if (bai != NULL)
+ return bai->bai_addrlen;
+ return 0;
+}
+
+/*
+ * BIO_ADDRINFO_sockaddr - non-public function that returns bai_addr
+ * as the struct sockaddr it is.
+ */
+const struct sockaddr *BIO_ADDRINFO_sockaddr(const BIO_ADDRINFO *bai)
+{
+ if (bai != NULL)
+ return bai->bai_addr;
+ return NULL;
+}
+
+const BIO_ADDR *BIO_ADDRINFO_address(const BIO_ADDRINFO *bai)
+{
+ if (bai != NULL)
+ return (BIO_ADDR *)bai->bai_addr;
+ return NULL;
+}
+
+void BIO_ADDRINFO_free(BIO_ADDRINFO *bai)
+{
+ if (bai == NULL)
+ return;
+
+#ifdef AI_PASSIVE
+# ifdef AF_UNIX
+# define _cond bai->bai_family != AF_UNIX
+# else
+# define _cond 1
+# endif
+ if (_cond) {
+ freeaddrinfo(bai);
+ return;
+ }
+#endif
+
+ /* Free manually when we know that addrinfo_wrap() was used.
+ * See further comment above addrinfo_wrap()
+ */
+ while (bai != NULL) {
+ BIO_ADDRINFO *next = bai->bai_next;
+ OPENSSL_free(bai->bai_addr);
+ OPENSSL_free(bai);
+ bai = next;
+ }
+}
+
+/**********************************************************************
+ *
+ * Service functions
+ *
+ */
+
+/*-
+ * The specs in hostserv can take these forms:
+ *
+ * host:service => *host = "host", *service = "service"
+ * host:* => *host = "host", *service = NULL
+ * host: => *host = "host", *service = NULL
+ * :service => *host = NULL, *service = "service"
+ * *:service => *host = NULL, *service = "service"
+ *
+ * in case no : is present in the string, the result depends on
+ * hostserv_prio, as follows:
+ *
+ * when hostserv_prio == BIO_PARSE_PRIO_HOST
+ * host => *host = "host", *service untouched
+ *
+ * when hostserv_prio == BIO_PARSE_PRIO_SERV
+ * service => *host untouched, *service = "service"
+ *
+ */
+int BIO_parse_hostserv(const char *hostserv, char **host, char **service,
+ enum BIO_hostserv_priorities hostserv_prio)
+{
+ const char *h = NULL; size_t hl = 0;
+ const char *p = NULL; size_t pl = 0;
+
+ if (*hostserv == '[') {
+ if ((p = strchr(hostserv, ']')) == NULL)
+ goto spec_err;
+ h = hostserv + 1;
+ hl = p - h;
+ p++;
+ if (*p == '\0')
+ p = NULL;
+ else if (*p != ':')
+ goto spec_err;
+ else {
+ p++;
+ pl = strlen(p);
+ }
+ } else {
+ const char *p2 = strrchr(hostserv, ':');
+ p = strchr(hostserv, ':');
+
+ /*-
+ * Check for more than one colon. There are three possible
+ * interpretations:
+ * 1. IPv6 address with port number, last colon being separator.
+ * 2. IPv6 address only.
+ * 3. IPv6 address only if hostserv_prio == BIO_PARSE_PRIO_HOST,
+ * IPv6 address and port number if hostserv_prio == BIO_PARSE_PRIO_SERV
+ * Because of this ambiguity, we currently choose to make it an
+ * error.
+ */
+ if (p != p2)
+ goto amb_err;
+
+ if (p != NULL) {
+ h = hostserv;
+ hl = p - h;
+ p++;
+ pl = strlen(p);
+ } else if (hostserv_prio == BIO_PARSE_PRIO_HOST) {
+ h = hostserv;
+ hl = strlen(h);
+ } else {
+ p = hostserv;
+ pl = strlen(p);
+ }
+ }
+
+ if (p != NULL && strchr(p, ':'))
+ goto spec_err;
+
+ if (h != NULL && host != NULL) {
+ if (hl == 0
+ || (hl == 1 && h[0] == '*')) {
+ *host = NULL;
+ } else {
+ *host = OPENSSL_strndup(h, hl);
+ if (*host == NULL)
+ goto memerr;
+ }
+ }
+ if (p != NULL && service != NULL) {
+ if (pl == 0
+ || (pl == 1 && p[0] == '*')) {
+ *service = NULL;
+ } else {
+ *service = OPENSSL_strndup(p, pl);
+ if (*service == NULL)
+ goto memerr;
+ }
+ }
+
+ return 1;
+ amb_err:
+ BIOerr(BIO_F_BIO_PARSE_HOSTSERV, BIO_R_AMBIGUOUS_HOST_OR_SERVICE);
+ return 0;
+ spec_err:
+ BIOerr(BIO_F_BIO_PARSE_HOSTSERV, BIO_R_MALFORMED_HOST_OR_SERVICE);
+ return 0;
+ memerr:
+ BIOerr(BIO_F_BIO_PARSE_HOSTSERV, ERR_R_MALLOC_FAILURE);
+ return 0;
+}
+
+/* addrinfo_wrap is used to build our own addrinfo "chain".
+ * (it has only one entry, so calling it a chain may be a stretch)
+ * It should ONLY be called when getaddrinfo() and friends
+ * aren't available, OR when dealing with a non IP protocol
+ * family, such as AF_UNIX
+ *
+ * the return value is 1 on success, or 0 on failure, which
+ * only happens if a memory allocation error occurred.
+ */
+static int addrinfo_wrap(int family, int socktype,
+ const void *where, size_t wherelen,
+ unsigned short port,
+ BIO_ADDRINFO **bai)
+{
+ OPENSSL_assert(bai != NULL);
+
+ *bai = OPENSSL_zalloc(sizeof(**bai));
+ if (*bai == NULL)
+ return 0;
+
+ (*bai)->bai_family = family;
+ (*bai)->bai_socktype = socktype;
+ if (socktype == SOCK_STREAM)
+ (*bai)->bai_protocol = IPPROTO_TCP;
+ if (socktype == SOCK_DGRAM)
+ (*bai)->bai_protocol = IPPROTO_UDP;
+#ifdef AF_UNIX
+ if (family == AF_UNIX)
+ (*bai)->bai_protocol = 0;
+#endif
+ {
+ /* Magic: We know that BIO_ADDR_sockaddr_noconst is really
+ just an advanced cast of BIO_ADDR* to struct sockaddr *
+ by the power of union, so while it may seem that we're
+ creating a memory leak here, we are not. It will be
+ all right. */
+ BIO_ADDR *addr = BIO_ADDR_new();
+ if (addr != NULL) {
+ BIO_ADDR_rawmake(addr, family, where, wherelen, port);
+ (*bai)->bai_addr = BIO_ADDR_sockaddr_noconst(addr);
+ }
+ }
+ (*bai)->bai_next = NULL;
+ if ((*bai)->bai_addr == NULL) {
+ BIO_ADDRINFO_free(*bai);
+ *bai = NULL;
+ return 0;
+ }
+ return 1;
+}
+
+DEFINE_RUN_ONCE_STATIC(do_bio_lookup_init)
+{
+ OPENSSL_init_crypto(0, NULL);
+ bio_lookup_lock = CRYPTO_THREAD_lock_new();
+ return bio_lookup_lock != NULL;
+}
+
+/*-
+ * BIO_lookup - look up the node and service you want to connect to.
+ * @node: the node you want to connect to.
+ * @service: the service you want to connect to.
+ * @lookup_type: declare intent with the result, client or server.
+ * @family: the address family you want to use. Use AF_UNSPEC for any, or
+ * AF_INET, AF_INET6 or AF_UNIX.
+ * @socktype: The socket type you want to use. Can be SOCK_STREAM, SOCK_DGRAM
+ * or 0 for all.
+ * @res: Storage place for the resulting list of returned addresses
+ *
+ * This will do a lookup of the node and service that you want to connect to.
+ * It returns a linked list of different addresses you can try to connect to.
+ *
+ * When no longer needed you should call BIO_ADDRINFO_free() to free the result.
+ *
+ * The return value is 1 on success or 0 in case of error.
+ */
+int BIO_lookup(const char *host, const char *service,
+ enum BIO_lookup_type lookup_type,
+ int family, int socktype, BIO_ADDRINFO **res)
+{
+ int ret = 0; /* Assume failure */
+
+ switch(family) {
+ case AF_INET:
+#ifdef AF_INET6
+ case AF_INET6:
+#endif
+#ifdef AF_UNIX
+ case AF_UNIX:
+#endif
+#ifdef AF_UNSPEC
+ case AF_UNSPEC:
+#endif
+ break;
+ default:
+ BIOerr(BIO_F_BIO_LOOKUP, BIO_R_UNSUPPORTED_PROTOCOL_FAMILY);
+ return 0;
+ }
+
+#ifdef AF_UNIX
+ if (family == AF_UNIX) {
+ if (addrinfo_wrap(family, socktype, host, strlen(host), 0, res))
+ return 1;
+ else
+ BIOerr(BIO_F_BIO_LOOKUP, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+#endif
+
+ if (BIO_sock_init() != 1)
+ return 0;
+
+ if (1) {
+#ifdef AI_PASSIVE
+ int gai_ret = 0;
+ struct addrinfo hints;
+
+ memset(&hints, 0, sizeof(hints));
+
+ hints.ai_family = family;
+ hints.ai_socktype = socktype;
+
+ if (lookup_type == BIO_LOOKUP_SERVER)
+ hints.ai_flags |= AI_PASSIVE;
+
+ /* Note that |res| SHOULD be a 'struct addrinfo **' thanks to
+ * macro magic in bio_lcl.h
+ */
+ switch ((gai_ret = getaddrinfo(host, service, &hints, res))) {
+# ifdef EAI_SYSTEM
+ case EAI_SYSTEM:
+ SYSerr(SYS_F_GETADDRINFO, get_last_socket_error());
+ BIOerr(BIO_F_BIO_LOOKUP, ERR_R_SYS_LIB);
+ break;
+# endif
+ case 0:
+ ret = 1; /* Success */
+ break;
+ default:
+ BIOerr(BIO_F_BIO_LOOKUP, ERR_R_SYS_LIB);
+ ERR_add_error_data(1, gai_strerror(gai_ret));
+ break;
+ }
+ } else {
+#endif
+ const struct hostent *he;
+/*
+ * Because struct hostent is defined for 32-bit pointers only with
+ * VMS C, we need to make sure that '&he_fallback_address' and
+ * '&he_fallback_addresses' are 32-bit pointers
+ */
+#if defined(OPENSSL_SYS_VMS) && defined(__DECC)
+# pragma pointer_size save
+# pragma pointer_size 32
+#endif
+ /* Windows doesn't seem to have in_addr_t */
+#ifdef OPENSSL_SYS_WINDOWS
+ static uint32_t he_fallback_address;
+ static const char *he_fallback_addresses[] =
+ { (char *)&he_fallback_address, NULL };
+#else
+ static in_addr_t he_fallback_address;
+ static const char *he_fallback_addresses[] =
+ { (char *)&he_fallback_address, NULL };
+#endif
+ static const struct hostent he_fallback =
+ { NULL, NULL, AF_INET, sizeof(he_fallback_address),
+ (char **)&he_fallback_addresses };
+#if defined(OPENSSL_SYS_VMS) && defined(__DECC)
+# pragma pointer_size restore
+#endif
+
+ struct servent *se;
+ /* Apparently, on WIN64, s_proto and s_port have traded places... */
+#ifdef _WIN64
+ struct servent se_fallback = { NULL, NULL, NULL, 0 };
+#else
+ struct servent se_fallback = { NULL, NULL, 0, NULL };
+#endif
+
+ if (!RUN_ONCE(&bio_lookup_init, do_bio_lookup_init)) {
+ BIOerr(BIO_F_BIO_LOOKUP, ERR_R_MALLOC_FAILURE);
+ ret = 0;
+ goto err;
+ }
+
+ CRYPTO_THREAD_write_lock(bio_lookup_lock);
+ he_fallback_address = INADDR_ANY;
+ if (host == NULL) {
+ he = &he_fallback;
+ switch(lookup_type) {
+ case BIO_LOOKUP_CLIENT:
+ he_fallback_address = INADDR_LOOPBACK;
+ break;
+ case BIO_LOOKUP_SERVER:
+ he_fallback_address = INADDR_ANY;
+ break;
+ default:
+ OPENSSL_assert(("We forgot to handle a lookup type!" == 0));
+ break;
+ }
+ } else {
+ he = gethostbyname(host);
+
+ if (he == NULL) {
+#ifndef OPENSSL_SYS_WINDOWS
+ /*
+ * This might be misleading, because h_errno is used as if
+ * it was errno. To minimize mixup add 1000. Underlying
+ * reason for this is that hstrerror is declared obsolete,
+ * not to mention that a) h_errno is not always guaranteed
+ * to be meaningless; b) hstrerror can reside in yet another
+ * library, linking for sake of hstrerror is an overkill;
+ * c) this path is not executed on contemporary systems
+ * anyway [above getaddrinfo/gai_strerror is]. We just let
+ * system administrator figure this out...
+ */
+ SYSerr(SYS_F_GETHOSTBYNAME, 1000 + h_errno);
+#else
+ SYSerr(SYS_F_GETHOSTBYNAME, WSAGetLastError());
+#endif
+ ret = 0;
+ goto err;
+ }
+ }
+
+ if (service == NULL) {
+ se_fallback.s_port = 0;
+ se_fallback.s_proto = NULL;
+ se = &se_fallback;
+ } else {
+ char *endp = NULL;
+ long portnum = strtol(service, &endp, 10);
+
+/*
+ * Because struct servent is defined for 32-bit pointers only with
+ * VMS C, we need to make sure that 'proto' is a 32-bit pointer.
+ */
+#if defined(OPENSSL_SYS_VMS) && defined(__DECC)
+# pragma pointer_size save
+# pragma pointer_size 32
+#endif
+ char *proto = NULL;
+#if defined(OPENSSL_SYS_VMS) && defined(__DECC)
+# pragma pointer_size restore
+#endif
+
+ switch (socktype) {
+ case SOCK_STREAM:
+ proto = "tcp";
+ break;
+ case SOCK_DGRAM:
+ proto = "udp";
+ break;
+ }
+
+ if (endp != service && *endp == '\0'
+ && portnum > 0 && portnum < 65536) {
+ se_fallback.s_port = htons(portnum);
+ se_fallback.s_proto = proto;
+ se = &se_fallback;
+ } else if (endp == service) {
+ se = getservbyname(service, proto);
+
+ if (se == NULL) {
+#ifndef OPENSSL_SYS_WINDOWS
+ SYSerr(SYS_F_GETSERVBYNAME, errno);
+#else
+ SYSerr(SYS_F_GETSERVBYNAME, WSAGetLastError());
+#endif
+ goto err;
+ }
+ } else {
+ BIOerr(BIO_F_BIO_LOOKUP, BIO_R_MALFORMED_HOST_OR_SERVICE);
+ goto err;
+ }
+ }
+
+ *res = NULL;
+
+ {
+/*
+ * Because hostent::h_addr_list is an array of 32-bit pointers with VMS C,
+ * we must make sure our iterator designates the same element type, hence
+ * the pointer size dance.
+ */
+#if defined(OPENSSL_SYS_VMS) && defined(__DECC)
+# pragma pointer_size save
+# pragma pointer_size 32
+#endif
+ char **addrlistp;
+#if defined(OPENSSL_SYS_VMS) && defined(__DECC)
+# pragma pointer_size restore
+#endif
+ size_t addresses;
+ BIO_ADDRINFO *tmp_bai = NULL;
+
+ /* The easiest way to create a linked list from an
+ array is to start from the back */
+ for(addrlistp = he->h_addr_list; *addrlistp != NULL;
+ addrlistp++)
+ ;
+
+ for(addresses = addrlistp - he->h_addr_list;
+ addrlistp--, addresses-- > 0; ) {
+ if (!addrinfo_wrap(he->h_addrtype, socktype,
+ *addrlistp, he->h_length,
+ se->s_port, &tmp_bai))
+ goto addrinfo_malloc_err;
+ tmp_bai->bai_next = *res;
+ *res = tmp_bai;
+ continue;
+ addrinfo_malloc_err:
+ BIO_ADDRINFO_free(*res);
+ *res = NULL;
+ BIOerr(BIO_F_BIO_LOOKUP, ERR_R_MALLOC_FAILURE);
+ ret = 0;
+ goto err;
+ }
+
+ ret = 1;
+ }
+ err:
+ CRYPTO_THREAD_unlock(bio_lookup_lock);
+ }
+
+ return ret;
+}
+
+#endif /* OPENSSL_NO_SOCK */
diff --git a/openssl-1.1.0h/crypto/bio/b_dump.c b/openssl-1.1.0h/crypto/bio/b_dump.c
new file mode 100644
index 0000000..424195e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/b_dump.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Stolen from tjh's ssl/ssl_trc.c stuff.
+ */
+
+#include <stdio.h>
+#include "bio_lcl.h"
+
+#define DUMP_WIDTH 16
+#define DUMP_WIDTH_LESS_INDENT(i) (DUMP_WIDTH-((i-(i>6?6:i)+3)/4))
+
+int BIO_dump_cb(int (*cb) (const void *data, size_t len, void *u),
+ void *u, const char *s, int len)
+{
+ return BIO_dump_indent_cb(cb, u, s, len, 0);
+}
+
+int BIO_dump_indent_cb(int (*cb) (const void *data, size_t len, void *u),
+ void *u, const char *s, int len, int indent)
+{
+ int ret = 0;
+ char buf[288 + 1], tmp[20], str[128 + 1];
+ int i, j, rows;
+ unsigned char ch;
+ int dump_width;
+
+ if (indent < 0)
+ indent = 0;
+ if (indent) {
+ if (indent > 128)
+ indent = 128;
+ memset(str, ' ', indent);
+ }
+ str[indent] = '\0';
+
+ dump_width = DUMP_WIDTH_LESS_INDENT(indent);
+ rows = (len / dump_width);
+ if ((rows * dump_width) < len)
+ rows++;
+ for (i = 0; i < rows; i++) {
+ OPENSSL_strlcpy(buf, str, sizeof(buf));
+ BIO_snprintf(tmp, sizeof(tmp), "%04x - ", i * dump_width);
+ OPENSSL_strlcat(buf, tmp, sizeof(buf));
+ for (j = 0; j < dump_width; j++) {
+ if (((i * dump_width) + j) >= len) {
+ OPENSSL_strlcat(buf, " ", sizeof(buf));
+ } else {
+ ch = ((unsigned char)*(s + i * dump_width + j)) & 0xff;
+ BIO_snprintf(tmp, sizeof(tmp), "%02x%c", ch,
+ j == 7 ? '-' : ' ');
+ OPENSSL_strlcat(buf, tmp, sizeof(buf));
+ }
+ }
+ OPENSSL_strlcat(buf, " ", sizeof(buf));
+ for (j = 0; j < dump_width; j++) {
+ if (((i * dump_width) + j) >= len)
+ break;
+ ch = ((unsigned char)*(s + i * dump_width + j)) & 0xff;
+#ifndef CHARSET_EBCDIC
+ BIO_snprintf(tmp, sizeof(tmp), "%c",
+ ((ch >= ' ') && (ch <= '~')) ? ch : '.');
+#else
+ BIO_snprintf(tmp, sizeof(tmp), "%c",
+ ((ch >= os_toascii[' ']) && (ch <= os_toascii['~']))
+ ? os_toebcdic[ch]
+ : '.');
+#endif
+ OPENSSL_strlcat(buf, tmp, sizeof(buf));
+ }
+ OPENSSL_strlcat(buf, "\n", sizeof(buf));
+ /*
+ * if this is the last call then update the ddt_dump thing so that we
+ * will move the selection point in the debug window
+ */
+ ret += cb((void *)buf, strlen(buf), u);
+ }
+ return ret;
+}
+
+#ifndef OPENSSL_NO_STDIO
+static int write_fp(const void *data, size_t len, void *fp)
+{
+ return UP_fwrite(data, len, 1, fp);
+}
+
+int BIO_dump_fp(FILE *fp, const char *s, int len)
+{
+ return BIO_dump_cb(write_fp, fp, s, len);
+}
+
+int BIO_dump_indent_fp(FILE *fp, const char *s, int len, int indent)
+{
+ return BIO_dump_indent_cb(write_fp, fp, s, len, indent);
+}
+#endif
+
+static int write_bio(const void *data, size_t len, void *bp)
+{
+ return BIO_write((BIO *)bp, (const char *)data, len);
+}
+
+int BIO_dump(BIO *bp, const char *s, int len)
+{
+ return BIO_dump_cb(write_bio, bp, s, len);
+}
+
+int BIO_dump_indent(BIO *bp, const char *s, int len, int indent)
+{
+ return BIO_dump_indent_cb(write_bio, bp, s, len, indent);
+}
+
+int BIO_hex_string(BIO *out, int indent, int width, unsigned char *data,
+ int datalen)
+{
+ int i, j = 0;
+
+ if (datalen < 1)
+ return 1;
+
+ for (i = 0; i < datalen - 1; i++) {
+ if (i && !j)
+ BIO_printf(out, "%*s", indent, "");
+
+ BIO_printf(out, "%02X:", data[i]);
+
+ j = (j + 1) % width;
+ if (!j)
+ BIO_printf(out, "\n");
+ }
+
+ if (i && !j)
+ BIO_printf(out, "%*s", indent, "");
+ BIO_printf(out, "%02X", data[datalen - 1]);
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/bio/b_print.c b/openssl-1.1.0h/crypto/bio/b_print.c
new file mode 100644
index 0000000..cdfe05f
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/b_print.c
@@ -0,0 +1,926 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include "internal/numbers.h"
+#include "internal/cryptlib.h"
+#include <openssl/bio.h>
+
+/*
+ * Copyright Patrick Powell 1995
+ * This code is based on code written by Patrick Powell <papowell@astart.com>
+ * It may be used for any purpose as long as this notice remains intact
+ * on all source code distributions.
+ */
+
+#ifdef HAVE_LONG_DOUBLE
+# define LDOUBLE long double
+#else
+# define LDOUBLE double
+#endif
+
+static int fmtstr(char **, char **, size_t *, size_t *,
+ const char *, int, int, int);
+static int fmtint(char **, char **, size_t *, size_t *,
+ int64_t, int, int, int, int);
+static int fmtfp(char **, char **, size_t *, size_t *,
+ LDOUBLE, int, int, int, int);
+static int doapr_outch(char **, char **, size_t *, size_t *, int);
+static int _dopr(char **sbuffer, char **buffer,
+ size_t *maxlen, size_t *retlen, int *truncated,
+ const char *format, va_list args);
+
+/* format read states */
+#define DP_S_DEFAULT 0
+#define DP_S_FLAGS 1
+#define DP_S_MIN 2
+#define DP_S_DOT 3
+#define DP_S_MAX 4
+#define DP_S_MOD 5
+#define DP_S_CONV 6
+#define DP_S_DONE 7
+
+/* format flags - Bits */
+/* left-aligned padding */
+#define DP_F_MINUS (1 << 0)
+/* print an explicit '+' for a value with positive sign */
+#define DP_F_PLUS (1 << 1)
+/* print an explicit ' ' for a value with positive sign */
+#define DP_F_SPACE (1 << 2)
+/* print 0/0x prefix for octal/hex and decimal point for floating point */
+#define DP_F_NUM (1 << 3)
+/* print leading zeroes */
+#define DP_F_ZERO (1 << 4)
+/* print HEX in UPPPERcase */
+#define DP_F_UP (1 << 5)
+/* treat value as unsigned */
+#define DP_F_UNSIGNED (1 << 6)
+
+/* conversion flags */
+#define DP_C_SHORT 1
+#define DP_C_LONG 2
+#define DP_C_LDOUBLE 3
+#define DP_C_LLONG 4
+
+/* Floating point formats */
+#define F_FORMAT 0
+#define E_FORMAT 1
+#define G_FORMAT 2
+
+/* some handy macros */
+#define char_to_int(p) (p - '0')
+#define OSSL_MAX(p,q) ((p >= q) ? p : q)
+
+static int
+_dopr(char **sbuffer,
+ char **buffer,
+ size_t *maxlen,
+ size_t *retlen, int *truncated, const char *format, va_list args)
+{
+ char ch;
+ int64_t value;
+ LDOUBLE fvalue;
+ char *strvalue;
+ int min;
+ int max;
+ int state;
+ int flags;
+ int cflags;
+ size_t currlen;
+
+ state = DP_S_DEFAULT;
+ flags = currlen = cflags = min = 0;
+ max = -1;
+ ch = *format++;
+
+ while (state != DP_S_DONE) {
+ if (ch == '\0' || (buffer == NULL && currlen >= *maxlen))
+ state = DP_S_DONE;
+
+ switch (state) {
+ case DP_S_DEFAULT:
+ if (ch == '%')
+ state = DP_S_FLAGS;
+ else
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch))
+ return 0;
+ ch = *format++;
+ break;
+ case DP_S_FLAGS:
+ switch (ch) {
+ case '-':
+ flags |= DP_F_MINUS;
+ ch = *format++;
+ break;
+ case '+':
+ flags |= DP_F_PLUS;
+ ch = *format++;
+ break;
+ case ' ':
+ flags |= DP_F_SPACE;
+ ch = *format++;
+ break;
+ case '#':
+ flags |= DP_F_NUM;
+ ch = *format++;
+ break;
+ case '0':
+ flags |= DP_F_ZERO;
+ ch = *format++;
+ break;
+ default:
+ state = DP_S_MIN;
+ break;
+ }
+ break;
+ case DP_S_MIN:
+ if (isdigit((unsigned char)ch)) {
+ min = 10 * min + char_to_int(ch);
+ ch = *format++;
+ } else if (ch == '*') {
+ min = va_arg(args, int);
+ ch = *format++;
+ state = DP_S_DOT;
+ } else
+ state = DP_S_DOT;
+ break;
+ case DP_S_DOT:
+ if (ch == '.') {
+ state = DP_S_MAX;
+ ch = *format++;
+ } else
+ state = DP_S_MOD;
+ break;
+ case DP_S_MAX:
+ if (isdigit((unsigned char)ch)) {
+ if (max < 0)
+ max = 0;
+ max = 10 * max + char_to_int(ch);
+ ch = *format++;
+ } else if (ch == '*') {
+ max = va_arg(args, int);
+ ch = *format++;
+ state = DP_S_MOD;
+ } else
+ state = DP_S_MOD;
+ break;
+ case DP_S_MOD:
+ switch (ch) {
+ case 'h':
+ cflags = DP_C_SHORT;
+ ch = *format++;
+ break;
+ case 'l':
+ if (*format == 'l') {
+ cflags = DP_C_LLONG;
+ format++;
+ } else
+ cflags = DP_C_LONG;
+ ch = *format++;
+ break;
+ case 'q':
+ cflags = DP_C_LLONG;
+ ch = *format++;
+ break;
+ case 'L':
+ cflags = DP_C_LDOUBLE;
+ ch = *format++;
+ break;
+ default:
+ break;
+ }
+ state = DP_S_CONV;
+ break;
+ case DP_S_CONV:
+ switch (ch) {
+ case 'd':
+ case 'i':
+ switch (cflags) {
+ case DP_C_SHORT:
+ value = (short int)va_arg(args, int);
+ break;
+ case DP_C_LONG:
+ value = va_arg(args, long int);
+ break;
+ case DP_C_LLONG:
+ value = va_arg(args, int64_t);
+ break;
+ default:
+ value = va_arg(args, int);
+ break;
+ }
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen, value, 10, min,
+ max, flags))
+ return 0;
+ break;
+ case 'X':
+ flags |= DP_F_UP;
+ /* FALLTHROUGH */
+ case 'x':
+ case 'o':
+ case 'u':
+ flags |= DP_F_UNSIGNED;
+ switch (cflags) {
+ case DP_C_SHORT:
+ value = (unsigned short int)va_arg(args, unsigned int);
+ break;
+ case DP_C_LONG:
+ value = va_arg(args, unsigned long int);
+ break;
+ case DP_C_LLONG:
+ value = va_arg(args, uint64_t);
+ break;
+ default:
+ value = va_arg(args, unsigned int);
+ break;
+ }
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen, value,
+ ch == 'o' ? 8 : (ch == 'u' ? 10 : 16),
+ min, max, flags))
+ return 0;
+ break;
+ case 'f':
+ if (cflags == DP_C_LDOUBLE)
+ fvalue = va_arg(args, LDOUBLE);
+ else
+ fvalue = va_arg(args, double);
+ if (!fmtfp(sbuffer, buffer, &currlen, maxlen, fvalue, min, max,
+ flags, F_FORMAT))
+ return 0;
+ break;
+ case 'E':
+ flags |= DP_F_UP;
+ /* fall thru */
+ case 'e':
+ if (cflags == DP_C_LDOUBLE)
+ fvalue = va_arg(args, LDOUBLE);
+ else
+ fvalue = va_arg(args, double);
+ if (!fmtfp(sbuffer, buffer, &currlen, maxlen, fvalue, min, max,
+ flags, E_FORMAT))
+ return 0;
+ break;
+ case 'G':
+ flags |= DP_F_UP;
+ /* fall thru */
+ case 'g':
+ if (cflags == DP_C_LDOUBLE)
+ fvalue = va_arg(args, LDOUBLE);
+ else
+ fvalue = va_arg(args, double);
+ if (!fmtfp(sbuffer, buffer, &currlen, maxlen, fvalue, min, max,
+ flags, G_FORMAT))
+ return 0;
+ break;
+ case 'c':
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen,
+ va_arg(args, int)))
+ return 0;
+ break;
+ case 's':
+ strvalue = va_arg(args, char *);
+ if (max < 0) {
+ if (buffer)
+ max = INT_MAX;
+ else
+ max = *maxlen;
+ }
+ if (!fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue,
+ flags, min, max))
+ return 0;
+ break;
+ case 'p':
+ value = (size_t)va_arg(args, void *);
+ if (!fmtint(sbuffer, buffer, &currlen, maxlen,
+ value, 16, min, max, flags | DP_F_NUM))
+ return 0;
+ break;
+ case 'n': /* XXX */
+ if (cflags == DP_C_SHORT) {
+ short int *num;
+ num = va_arg(args, short int *);
+ *num = currlen;
+ } else if (cflags == DP_C_LONG) { /* XXX */
+ long int *num;
+ num = va_arg(args, long int *);
+ *num = (long int)currlen;
+ } else if (cflags == DP_C_LLONG) { /* XXX */
+ int64_t *num;
+ num = va_arg(args, int64_t *);
+ *num = (int64_t)currlen;
+ } else {
+ int *num;
+ num = va_arg(args, int *);
+ *num = currlen;
+ }
+ break;
+ case '%':
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch))
+ return 0;
+ break;
+ case 'w':
+ /* not supported yet, treat as next char */
+ ch = *format++;
+ break;
+ default:
+ /* unknown, skip */
+ break;
+ }
+ ch = *format++;
+ state = DP_S_DEFAULT;
+ flags = cflags = min = 0;
+ max = -1;
+ break;
+ case DP_S_DONE:
+ break;
+ default:
+ break;
+ }
+ }
+ /*
+ * We have to truncate if there is no dynamic buffer and we have filled the
+ * static buffer.
+ */
+ if (buffer == NULL) {
+ *truncated = (currlen > *maxlen - 1);
+ if (*truncated)
+ currlen = *maxlen - 1;
+ }
+ if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0'))
+ return 0;
+ *retlen = currlen - 1;
+ return 1;
+}
+
+static int
+fmtstr(char **sbuffer,
+ char **buffer,
+ size_t *currlen,
+ size_t *maxlen, const char *value, int flags, int min, int max)
+{
+ int padlen;
+ size_t strln;
+ int cnt = 0;
+
+ if (value == 0)
+ value = "<NULL>";
+
+ strln = OPENSSL_strnlen(value, max < 0 ? SIZE_MAX : (size_t)max);
+
+ padlen = min - strln;
+ if (min < 0 || padlen < 0)
+ padlen = 0;
+ if (max >= 0) {
+ /*
+ * Calculate the maximum output including padding.
+ * Make sure max doesn't overflow into negativity
+ */
+ if (max < INT_MAX - padlen)
+ max += padlen;
+ else
+ max = INT_MAX;
+ }
+ if (flags & DP_F_MINUS)
+ padlen = -padlen;
+
+ while ((padlen > 0) && (max < 0 || cnt < max)) {
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
+ --padlen;
+ ++cnt;
+ }
+ while (strln > 0 && (max < 0 || cnt < max)) {
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *value++))
+ return 0;
+ --strln;
+ ++cnt;
+ }
+ while ((padlen < 0) && (max < 0 || cnt < max)) {
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
+ ++padlen;
+ ++cnt;
+ }
+ return 1;
+}
+
+static int
+fmtint(char **sbuffer,
+ char **buffer,
+ size_t *currlen,
+ size_t *maxlen, int64_t value, int base, int min, int max, int flags)
+{
+ int signvalue = 0;
+ const char *prefix = "";
+ uint64_t uvalue;
+ char convert[DECIMAL_SIZE(value) + 3];
+ int place = 0;
+ int spadlen = 0;
+ int zpadlen = 0;
+ int caps = 0;
+
+ if (max < 0)
+ max = 0;
+ uvalue = value;
+ if (!(flags & DP_F_UNSIGNED)) {
+ if (value < 0) {
+ signvalue = '-';
+ uvalue = 0 - (uint64_t)value;
+ } else if (flags & DP_F_PLUS)
+ signvalue = '+';
+ else if (flags & DP_F_SPACE)
+ signvalue = ' ';
+ }
+ if (flags & DP_F_NUM) {
+ if (base == 8)
+ prefix = "0";
+ if (base == 16)
+ prefix = "0x";
+ }
+ if (flags & DP_F_UP)
+ caps = 1;
+ do {
+ convert[place++] = (caps ? "0123456789ABCDEF" : "0123456789abcdef")
+ [uvalue % (unsigned)base];
+ uvalue = (uvalue / (unsigned)base);
+ } while (uvalue && (place < (int)sizeof(convert)));
+ if (place == sizeof(convert))
+ place--;
+ convert[place] = 0;
+
+ zpadlen = max - place;
+ spadlen =
+ min - OSSL_MAX(max, place) - (signvalue ? 1 : 0) - strlen(prefix);
+ if (zpadlen < 0)
+ zpadlen = 0;
+ if (spadlen < 0)
+ spadlen = 0;
+ if (flags & DP_F_ZERO) {
+ zpadlen = OSSL_MAX(zpadlen, spadlen);
+ spadlen = 0;
+ }
+ if (flags & DP_F_MINUS)
+ spadlen = -spadlen;
+
+ /* spaces */
+ while (spadlen > 0) {
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
+ --spadlen;
+ }
+
+ /* sign */
+ if (signvalue)
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
+
+ /* prefix */
+ while (*prefix) {
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix))
+ return 0;
+ prefix++;
+ }
+
+ /* zeros */
+ if (zpadlen > 0) {
+ while (zpadlen > 0) {
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
+ --zpadlen;
+ }
+ }
+ /* digits */
+ while (place > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place]))
+ return 0;
+ }
+
+ /* left justified spaces */
+ while (spadlen < 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
+ ++spadlen;
+ }
+ return 1;
+}
+
+static LDOUBLE abs_val(LDOUBLE value)
+{
+ LDOUBLE result = value;
+ if (value < 0)
+ result = -value;
+ return result;
+}
+
+static LDOUBLE pow_10(int in_exp)
+{
+ LDOUBLE result = 1;
+ while (in_exp) {
+ result *= 10;
+ in_exp--;
+ }
+ return result;
+}
+
+static long roundv(LDOUBLE value)
+{
+ long intpart;
+ intpart = (long)value;
+ value = value - intpart;
+ if (value >= 0.5)
+ intpart++;
+ return intpart;
+}
+
+static int
+fmtfp(char **sbuffer,
+ char **buffer,
+ size_t *currlen,
+ size_t *maxlen, LDOUBLE fvalue, int min, int max, int flags, int style)
+{
+ int signvalue = 0;
+ LDOUBLE ufvalue;
+ LDOUBLE tmpvalue;
+ char iconvert[20];
+ char fconvert[20];
+ char econvert[20];
+ int iplace = 0;
+ int fplace = 0;
+ int eplace = 0;
+ int padlen = 0;
+ int zpadlen = 0;
+ long exp = 0;
+ unsigned long intpart;
+ unsigned long fracpart;
+ unsigned long max10;
+ int realstyle;
+
+ if (max < 0)
+ max = 6;
+
+ if (fvalue < 0)
+ signvalue = '-';
+ else if (flags & DP_F_PLUS)
+ signvalue = '+';
+ else if (flags & DP_F_SPACE)
+ signvalue = ' ';
+
+ /*
+ * G_FORMAT sometimes prints like E_FORMAT and sometimes like F_FORMAT
+ * depending on the number to be printed. Work out which one it is and use
+ * that from here on.
+ */
+ if (style == G_FORMAT) {
+ if (fvalue == 0.0) {
+ realstyle = F_FORMAT;
+ } else if (fvalue < 0.0001) {
+ realstyle = E_FORMAT;
+ } else if ((max == 0 && fvalue >= 10)
+ || (max > 0 && fvalue >= pow_10(max))) {
+ realstyle = E_FORMAT;
+ } else {
+ realstyle = F_FORMAT;
+ }
+ } else {
+ realstyle = style;
+ }
+
+ if (style != F_FORMAT) {
+ tmpvalue = fvalue;
+ /* Calculate the exponent */
+ if (fvalue != 0.0) {
+ while (tmpvalue < 1) {
+ tmpvalue *= 10;
+ exp--;
+ }
+ while (tmpvalue > 10) {
+ tmpvalue /= 10;
+ exp++;
+ }
+ }
+ if (style == G_FORMAT) {
+ /*
+ * In G_FORMAT the "precision" represents significant digits. We
+ * always have at least 1 significant digit.
+ */
+ if (max == 0)
+ max = 1;
+ /* Now convert significant digits to decimal places */
+ if (realstyle == F_FORMAT) {
+ max -= (exp + 1);
+ if (max < 0) {
+ /*
+ * Should not happen. If we're in F_FORMAT then exp < max?
+ */
+ return 0;
+ }
+ } else {
+ /*
+ * In E_FORMAT there is always one significant digit in front
+ * of the decimal point, so:
+ * significant digits == 1 + decimal places
+ */
+ max--;
+ }
+ }
+ if (realstyle == E_FORMAT)
+ fvalue = tmpvalue;
+ }
+ ufvalue = abs_val(fvalue);
+ if (ufvalue > ULONG_MAX) {
+ /* Number too big */
+ return 0;
+ }
+ intpart = (unsigned long)ufvalue;
+
+ /*
+ * sorry, we only support 9 digits past the decimal because of our
+ * conversion method
+ */
+ if (max > 9)
+ max = 9;
+
+ /*
+ * we "cheat" by converting the fractional part to integer by multiplying
+ * by a factor of 10
+ */
+ max10 = roundv(pow_10(max));
+ fracpart = roundv(pow_10(max) * (ufvalue - intpart));
+
+ if (fracpart >= max10) {
+ intpart++;
+ fracpart -= max10;
+ }
+
+ /* convert integer part */
+ do {
+ iconvert[iplace++] = "0123456789"[intpart % 10];
+ intpart = (intpart / 10);
+ } while (intpart && (iplace < (int)sizeof(iconvert)));
+ if (iplace == sizeof(iconvert))
+ iplace--;
+ iconvert[iplace] = 0;
+
+ /* convert fractional part */
+ while (fplace < max) {
+ if (style == G_FORMAT && fplace == 0 && (fracpart % 10) == 0) {
+ /* We strip trailing zeros in G_FORMAT */
+ max--;
+ fracpart = fracpart / 10;
+ if (fplace < max)
+ continue;
+ break;
+ }
+ fconvert[fplace++] = "0123456789"[fracpart % 10];
+ fracpart = (fracpart / 10);
+ }
+
+ if (fplace == sizeof(fconvert))
+ fplace--;
+ fconvert[fplace] = 0;
+
+ /* convert exponent part */
+ if (realstyle == E_FORMAT) {
+ int tmpexp;
+ if (exp < 0)
+ tmpexp = -exp;
+ else
+ tmpexp = exp;
+
+ do {
+ econvert[eplace++] = "0123456789"[tmpexp % 10];
+ tmpexp = (tmpexp / 10);
+ } while (tmpexp > 0 && eplace < (int)sizeof(econvert));
+ /* Exponent is huge!! Too big to print */
+ if (tmpexp > 0)
+ return 0;
+ /* Add a leading 0 for single digit exponents */
+ if (eplace == 1)
+ econvert[eplace++] = '0';
+ }
+
+ /*
+ * -1 for decimal point (if we have one, i.e. max > 0),
+ * another -1 if we are printing a sign
+ */
+ padlen = min - iplace - max - (max > 0 ? 1 : 0) - ((signvalue) ? 1 : 0);
+ /* Take some off for exponent prefix "+e" and exponent */
+ if (realstyle == E_FORMAT)
+ padlen -= 2 + eplace;
+ zpadlen = max - fplace;
+ if (zpadlen < 0)
+ zpadlen = 0;
+ if (padlen < 0)
+ padlen = 0;
+ if (flags & DP_F_MINUS)
+ padlen = -padlen;
+
+ if ((flags & DP_F_ZERO) && (padlen > 0)) {
+ if (signvalue) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
+ --padlen;
+ signvalue = 0;
+ }
+ while (padlen > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
+ --padlen;
+ }
+ }
+ while (padlen > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
+ --padlen;
+ }
+ if (signvalue && !doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+ return 0;
+
+ while (iplace > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace]))
+ return 0;
+ }
+
+ /*
+ * Decimal point. This should probably use locale to find the correct
+ * char to print out.
+ */
+ if (max > 0 || (flags & DP_F_NUM)) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '.'))
+ return 0;
+
+ while (fplace > 0) {
+ if(!doapr_outch(sbuffer, buffer, currlen, maxlen,
+ fconvert[--fplace]))
+ return 0;
+ }
+ }
+ while (zpadlen > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+ return 0;
+ --zpadlen;
+ }
+ if (realstyle == E_FORMAT) {
+ char ech;
+
+ if ((flags & DP_F_UP) == 0)
+ ech = 'e';
+ else
+ ech = 'E';
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ech))
+ return 0;
+ if (exp < 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '-'))
+ return 0;
+ } else {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '+'))
+ return 0;
+ }
+ while (eplace > 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen,
+ econvert[--eplace]))
+ return 0;
+ }
+ }
+
+ while (padlen < 0) {
+ if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+ return 0;
+ ++padlen;
+ }
+ return 1;
+}
+
+#define BUFFER_INC 1024
+
+static int
+doapr_outch(char **sbuffer,
+ char **buffer, size_t *currlen, size_t *maxlen, int c)
+{
+ /* If we haven't at least one buffer, someone has doe a big booboo */
+ OPENSSL_assert(*sbuffer != NULL || buffer != NULL);
+
+ /* |currlen| must always be <= |*maxlen| */
+ OPENSSL_assert(*currlen <= *maxlen);
+
+ if (buffer && *currlen == *maxlen) {
+ if (*maxlen > INT_MAX - BUFFER_INC)
+ return 0;
+
+ *maxlen += BUFFER_INC;
+ if (*buffer == NULL) {
+ *buffer = OPENSSL_malloc(*maxlen);
+ if (*buffer == NULL)
+ return 0;
+ if (*currlen > 0) {
+ OPENSSL_assert(*sbuffer != NULL);
+ memcpy(*buffer, *sbuffer, *currlen);
+ }
+ *sbuffer = NULL;
+ } else {
+ char *tmpbuf;
+ tmpbuf = OPENSSL_realloc(*buffer, *maxlen);
+ if (tmpbuf == NULL)
+ return 0;
+ *buffer = tmpbuf;
+ }
+ }
+
+ if (*currlen < *maxlen) {
+ if (*sbuffer)
+ (*sbuffer)[(*currlen)++] = (char)c;
+ else
+ (*buffer)[(*currlen)++] = (char)c;
+ }
+
+ return 1;
+}
+
+/***************************************************************************/
+
+int BIO_printf(BIO *bio, const char *format, ...)
+{
+ va_list args;
+ int ret;
+
+ va_start(args, format);
+
+ ret = BIO_vprintf(bio, format, args);
+
+ va_end(args);
+ return (ret);
+}
+
+int BIO_vprintf(BIO *bio, const char *format, va_list args)
+{
+ int ret;
+ size_t retlen;
+ char hugebuf[1024 * 2]; /* Was previously 10k, which is unreasonable
+ * in small-stack environments, like threads
+ * or DOS programs. */
+ char *hugebufp = hugebuf;
+ size_t hugebufsize = sizeof(hugebuf);
+ char *dynbuf = NULL;
+ int ignored;
+
+ dynbuf = NULL;
+ if (!_dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format,
+ args)) {
+ OPENSSL_free(dynbuf);
+ return -1;
+ }
+ if (dynbuf) {
+ ret = BIO_write(bio, dynbuf, (int)retlen);
+ OPENSSL_free(dynbuf);
+ } else {
+ ret = BIO_write(bio, hugebuf, (int)retlen);
+ }
+ return (ret);
+}
+
+/*
+ * As snprintf is not available everywhere, we provide our own
+ * implementation. This function has nothing to do with BIOs, but it's
+ * closely related to BIO_printf, and we need *some* name prefix ... (XXX the
+ * function should be renamed, but to what?)
+ */
+int BIO_snprintf(char *buf, size_t n, const char *format, ...)
+{
+ va_list args;
+ int ret;
+
+ va_start(args, format);
+
+ ret = BIO_vsnprintf(buf, n, format, args);
+
+ va_end(args);
+ return (ret);
+}
+
+int BIO_vsnprintf(char *buf, size_t n, const char *format, va_list args)
+{
+ size_t retlen;
+ int truncated;
+
+ if(!_dopr(&buf, NULL, &n, &retlen, &truncated, format, args))
+ return -1;
+
+ if (truncated)
+ /*
+ * In case of truncation, return -1 like traditional snprintf.
+ * (Current drafts for ISO/IEC 9899 say snprintf should return the
+ * number of characters that would have been written, had the buffer
+ * been large enough.)
+ */
+ return -1;
+ else
+ return (retlen <= INT_MAX) ? (int)retlen : -1;
+}
diff --git a/openssl-1.1.0h/crypto/bio/b_sock.c b/openssl-1.1.0h/crypto/bio/b_sock.c
new file mode 100644
index 0000000..97dcc70
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/b_sock.c
@@ -0,0 +1,378 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include "bio_lcl.h"
+#if defined(NETWARE_CLIB)
+# include <sys/ioctl.h>
+NETDB_DEFINE_CONTEXT
+#endif
+#ifndef OPENSSL_NO_SOCK
+# define SOCKET_PROTOCOL IPPROTO_TCP
+# ifdef SO_MAXCONN
+# define MAX_LISTEN SO_MAXCONN
+# elif defined(SOMAXCONN)
+# define MAX_LISTEN SOMAXCONN
+# else
+# define MAX_LISTEN 32
+# endif
+# if defined(OPENSSL_SYS_WINDOWS)
+static int wsa_init_done = 0;
+# endif
+
+# if OPENSSL_API_COMPAT < 0x10100000L
+int BIO_get_host_ip(const char *str, unsigned char *ip)
+{
+ BIO_ADDRINFO *res = NULL;
+ int ret = 0;
+
+ if (BIO_sock_init() != 1)
+ return 0; /* don't generate another error code here */
+
+ if (BIO_lookup(str, NULL, BIO_LOOKUP_CLIENT, AF_INET, SOCK_STREAM, &res)) {
+ size_t l;
+
+ if (BIO_ADDRINFO_family(res) != AF_INET) {
+ BIOerr(BIO_F_BIO_GET_HOST_IP,
+ BIO_R_GETHOSTBYNAME_ADDR_IS_NOT_AF_INET);
+ } else {
+ BIO_ADDR_rawaddress(BIO_ADDRINFO_address(res), NULL, &l);
+ /* Because only AF_INET addresses will reach this far,
+ we can assert that l should be 4 */
+ OPENSSL_assert(l == 4);
+
+ BIO_ADDR_rawaddress(BIO_ADDRINFO_address(res), ip, &l);
+ ret = 1;
+ }
+ BIO_ADDRINFO_free(res);
+ } else {
+ ERR_add_error_data(2, "host=", str);
+ }
+
+ return ret;
+}
+
+int BIO_get_port(const char *str, unsigned short *port_ptr)
+{
+ BIO_ADDRINFO *res = NULL;
+ int ret = 0;
+
+ if (str == NULL) {
+ BIOerr(BIO_F_BIO_GET_PORT, BIO_R_NO_PORT_DEFINED);
+ return (0);
+ }
+
+ if (BIO_sock_init() != 1)
+ return 0; /* don't generate another error code here */
+
+ if (BIO_lookup(NULL, str, BIO_LOOKUP_CLIENT, AF_INET, SOCK_STREAM, &res)) {
+ if (BIO_ADDRINFO_family(res) != AF_INET) {
+ BIOerr(BIO_F_BIO_GET_PORT,
+ BIO_R_ADDRINFO_ADDR_IS_NOT_AF_INET);
+ } else {
+ *port_ptr = ntohs(BIO_ADDR_rawport(BIO_ADDRINFO_address(res)));
+ ret = 1;
+ }
+ BIO_ADDRINFO_free(res);
+ } else {
+ ERR_add_error_data(2, "host=", str);
+ }
+
+ return ret;
+}
+# endif
+
+int BIO_sock_error(int sock)
+{
+ int j = 0, i;
+ socklen_t size = sizeof(j);
+
+ /*
+ * Note: under Windows the third parameter is of type (char *) whereas
+ * under other systems it is (void *) if you don't have a cast it will
+ * choke the compiler: if you do have a cast then you can either go for
+ * (char *) or (void *).
+ */
+ i = getsockopt(sock, SOL_SOCKET, SO_ERROR, (void *)&j, &size);
+ if (i < 0)
+ return (get_last_socket_error());
+ else
+ return (j);
+}
+
+# if OPENSSL_API_COMPAT < 0x10100000L
+struct hostent *BIO_gethostbyname(const char *name)
+{
+ /*
+ * Caching gethostbyname() results forever is wrong, so we have to let
+ * the true gethostbyname() worry about this
+ */
+# if (defined(NETWARE_BSDSOCK) && !defined(__NOVELL_LIBC__))
+ return gethostbyname((char *)name);
+# else
+ return gethostbyname(name);
+# endif
+}
+# endif
+
+int BIO_sock_init(void)
+{
+# ifdef OPENSSL_SYS_WINDOWS
+ static struct WSAData wsa_state;
+
+ if (!wsa_init_done) {
+ int err;
+
+ wsa_init_done = 1;
+ memset(&wsa_state, 0, sizeof(wsa_state));
+ /*
+ * Not making wsa_state available to the rest of the code is formally
+ * wrong. But the structures we use are [believed to be] invariable
+ * among Winsock DLLs, while API availability is [expected to be]
+ * probed at run-time with DSO_global_lookup.
+ */
+ if (WSAStartup(0x0202, &wsa_state) != 0) {
+ err = WSAGetLastError();
+ SYSerr(SYS_F_WSASTARTUP, err);
+ BIOerr(BIO_F_BIO_SOCK_INIT, BIO_R_WSASTARTUP);
+ return (-1);
+ }
+ }
+# endif /* OPENSSL_SYS_WINDOWS */
+# ifdef WATT32
+ extern int _watt_do_exit;
+ _watt_do_exit = 0; /* don't make sock_init() call exit() */
+ if (sock_init())
+ return (-1);
+# endif
+
+ return (1);
+}
+
+void bio_sock_cleanup_int(void)
+{
+# ifdef OPENSSL_SYS_WINDOWS
+ if (wsa_init_done) {
+ wsa_init_done = 0;
+ WSACleanup();
+ }
+# endif
+}
+
+int BIO_socket_ioctl(int fd, long type, void *arg)
+{
+ int i;
+
+# ifdef __DJGPP__
+ i = ioctlsocket(fd, type, (char *)arg);
+# else
+# if defined(OPENSSL_SYS_VMS)
+ /*-
+ * 2011-02-18 SMS.
+ * VMS ioctl() can't tolerate a 64-bit "void *arg", but we
+ * observe that all the consumers pass in an "unsigned long *",
+ * so we arrange a local copy with a short pointer, and use
+ * that, instead.
+ */
+# if __INITIAL_POINTER_SIZE == 64
+# define ARG arg_32p
+# pragma pointer_size save
+# pragma pointer_size 32
+ unsigned long arg_32;
+ unsigned long *arg_32p;
+# pragma pointer_size restore
+ arg_32p = &arg_32;
+ arg_32 = *((unsigned long *)arg);
+# else /* __INITIAL_POINTER_SIZE == 64 */
+# define ARG arg
+# endif /* __INITIAL_POINTER_SIZE == 64 [else] */
+# else /* defined(OPENSSL_SYS_VMS) */
+# define ARG arg
+# endif /* defined(OPENSSL_SYS_VMS) [else] */
+
+ i = ioctlsocket(fd, type, ARG);
+# endif /* __DJGPP__ */
+ if (i < 0)
+ SYSerr(SYS_F_IOCTLSOCKET, get_last_socket_error());
+ return (i);
+}
+
+# if OPENSSL_API_COMPAT < 0x10100000L
+int BIO_get_accept_socket(char *host, int bind_mode)
+{
+ int s = INVALID_SOCKET;
+ char *h = NULL, *p = NULL;
+ BIO_ADDRINFO *res = NULL;
+
+ if (!BIO_parse_hostserv(host, &h, &p, BIO_PARSE_PRIO_SERV))
+ return INVALID_SOCKET;
+
+ if (BIO_sock_init() != 1)
+ return INVALID_SOCKET;
+
+ if (BIO_lookup(h, p, BIO_LOOKUP_SERVER, AF_UNSPEC, SOCK_STREAM, &res) != 0)
+ goto err;
+
+ if ((s = BIO_socket(BIO_ADDRINFO_family(res), BIO_ADDRINFO_socktype(res),
+ BIO_ADDRINFO_protocol(res), 0)) == INVALID_SOCKET) {
+ s = INVALID_SOCKET;
+ goto err;
+ }
+
+ if (!BIO_listen(s, BIO_ADDRINFO_address(res),
+ bind_mode ? BIO_SOCK_REUSEADDR : 0)) {
+ BIO_closesocket(s);
+ s = INVALID_SOCKET;
+ }
+
+ err:
+ BIO_ADDRINFO_free(res);
+ OPENSSL_free(h);
+ OPENSSL_free(p);
+
+ return s;
+}
+
+int BIO_accept(int sock, char **ip_port)
+{
+ BIO_ADDR res;
+ int ret = -1;
+
+ ret = BIO_accept_ex(sock, &res, 0);
+ if (ret == (int)INVALID_SOCKET) {
+ if (BIO_sock_should_retry(ret)) {
+ ret = -2;
+ goto end;
+ }
+ SYSerr(SYS_F_ACCEPT, get_last_socket_error());
+ BIOerr(BIO_F_BIO_ACCEPT, BIO_R_ACCEPT_ERROR);
+ goto end;
+ }
+
+ if (ip_port != NULL) {
+ char *host = BIO_ADDR_hostname_string(&res, 1);
+ char *port = BIO_ADDR_service_string(&res, 1);
+ if (host != NULL && port != NULL)
+ *ip_port = OPENSSL_zalloc(strlen(host) + strlen(port) + 2);
+ else
+ *ip_port = NULL;
+
+ if (*ip_port == NULL) {
+ BIOerr(BIO_F_BIO_ACCEPT, ERR_R_MALLOC_FAILURE);
+ BIO_closesocket(ret);
+ ret = (int)INVALID_SOCKET;
+ } else {
+ strcpy(*ip_port, host);
+ strcat(*ip_port, ":");
+ strcat(*ip_port, port);
+ }
+ OPENSSL_free(host);
+ OPENSSL_free(port);
+ }
+
+ end:
+ return ret;
+}
+# endif
+
+int BIO_set_tcp_ndelay(int s, int on)
+{
+ int ret = 0;
+# if defined(TCP_NODELAY) && (defined(IPPROTO_TCP) || defined(SOL_TCP))
+ int opt;
+
+# ifdef SOL_TCP
+ opt = SOL_TCP;
+# else
+# ifdef IPPROTO_TCP
+ opt = IPPROTO_TCP;
+# endif
+# endif
+
+ ret = setsockopt(s, opt, TCP_NODELAY, (char *)&on, sizeof(on));
+# endif
+ return (ret == 0);
+}
+
+int BIO_socket_nbio(int s, int mode)
+{
+ int ret = -1;
+ int l;
+
+ l = mode;
+# ifdef FIONBIO
+ l = mode;
+
+ ret = BIO_socket_ioctl(s, FIONBIO, &l);
+# elif defined(F_GETFL) && defined(F_SETFL) && (defined(O_NONBLOCK) || defined(FNDELAY))
+ /* make sure this call always pushes an error level; BIO_socket_ioctl() does so, so we do too. */
+
+ l = fcntl(s, F_GETFL, 0);
+ if (l == -1) {
+ SYSerr(SYS_F_FCNTL, get_last_rtl_error());
+ ret = -1;
+ } else {
+# if defined(O_NONBLOCK)
+ l &= ~O_NONBLOCK;
+# else
+ l &= ~FNDELAY; /* BSD4.x */
+# endif
+ if (mode) {
+# if defined(O_NONBLOCK)
+ l |= O_NONBLOCK;
+# else
+ l |= FNDELAY; /* BSD4.x */
+# endif
+ }
+ ret = fcntl(s, F_SETFL, l);
+
+ if (ret < 0) {
+ SYSerr(SYS_F_FCNTL, get_last_rtl_error());
+ }
+ }
+# else
+ /* make sure this call always pushes an error level; BIO_socket_ioctl() does so, so we do too. */
+ BIOerr(BIO_F_BIO_SOCKET_NBIO, ERR_R_PASSED_INVALID_ARGUMENT);
+# endif
+
+ return (ret == 0);
+}
+
+int BIO_sock_info(int sock,
+ enum BIO_sock_info_type type, union BIO_sock_info_u *info)
+{
+ switch (type) {
+ case BIO_SOCK_INFO_ADDRESS:
+ {
+ socklen_t addr_len;
+ int ret = 0;
+ addr_len = sizeof(*info->addr);
+ ret = getsockname(sock, BIO_ADDR_sockaddr_noconst(info->addr),
+ &addr_len);
+ if (ret == -1) {
+ SYSerr(SYS_F_GETSOCKNAME, get_last_socket_error());
+ BIOerr(BIO_F_BIO_SOCK_INFO, BIO_R_GETSOCKNAME_ERROR);
+ return 0;
+ }
+ if ((size_t)addr_len > sizeof(*info->addr)) {
+ BIOerr(BIO_F_BIO_SOCK_INFO, BIO_R_GETSOCKNAME_TRUNCATED_ADDRESS);
+ return 0;
+ }
+ }
+ break;
+ default:
+ BIOerr(BIO_F_BIO_SOCK_INFO, BIO_R_UNKNOWN_INFO_TYPE);
+ return 0;
+ }
+ return 1;
+}
+
+#endif
diff --git a/openssl-1.1.0h/crypto/bio/b_sock2.c b/openssl-1.1.0h/crypto/bio/b_sock2.c
new file mode 100644
index 0000000..d8b49d0
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/b_sock2.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "bio_lcl.h"
+
+#include <openssl/err.h>
+
+#ifndef OPENSSL_NO_SOCK
+# ifdef SO_MAXCONN
+# define MAX_LISTEN SO_MAXCONN
+# elif defined(SOMAXCONN)
+# define MAX_LISTEN SOMAXCONN
+# else
+# define MAX_LISTEN 32
+# endif
+
+/*-
+ * BIO_socket - create a socket
+ * @domain: the socket domain (AF_INET, AF_INET6, AF_UNIX, ...)
+ * @socktype: the socket type (SOCK_STEAM, SOCK_DGRAM)
+ * @protocol: the protocol to use (IPPROTO_TCP, IPPROTO_UDP)
+ * @options: BIO socket options (currently unused)
+ *
+ * Creates a socket. This should be called before calling any
+ * of BIO_connect and BIO_listen.
+ *
+ * Returns the file descriptor on success or INVALID_SOCKET on failure. On
+ * failure errno is set, and a status is added to the OpenSSL error stack.
+ */
+int BIO_socket(int domain, int socktype, int protocol, int options)
+{
+ int sock = -1;
+
+ if (BIO_sock_init() != 1)
+ return INVALID_SOCKET;
+
+ sock = socket(domain, socktype, protocol);
+ if (sock == -1) {
+ SYSerr(SYS_F_SOCKET, get_last_socket_error());
+ BIOerr(BIO_F_BIO_SOCKET, BIO_R_UNABLE_TO_CREATE_SOCKET);
+ return INVALID_SOCKET;
+ }
+
+ return sock;
+}
+
+/*-
+ * BIO_connect - connect to an address
+ * @sock: the socket to connect with
+ * @addr: the address to connect to
+ * @options: BIO socket options
+ *
+ * Connects to the address using the given socket and options.
+ *
+ * Options can be a combination of the following:
+ * - BIO_SOCK_KEEPALIVE: enable regularly sending keep-alive messages.
+ * - BIO_SOCK_NONBLOCK: Make the socket non-blocking.
+ * - BIO_SOCK_NODELAY: don't delay small messages.
+ *
+ * options holds BIO socket options that can be used
+ * You should call this for every address returned by BIO_lookup
+ * until the connection is successful.
+ *
+ * Returns 1 on success or 0 on failure. On failure errno is set
+ * and an error status is added to the OpenSSL error stack.
+ */
+int BIO_connect(int sock, const BIO_ADDR *addr, int options)
+{
+ int on = 1;
+
+ if (sock == -1) {
+ BIOerr(BIO_F_BIO_CONNECT, BIO_R_INVALID_SOCKET);
+ return 0;
+ }
+
+ if (!BIO_socket_nbio(sock, (options & BIO_SOCK_NONBLOCK) != 0))
+ return 0;
+
+ if (options & BIO_SOCK_KEEPALIVE) {
+ if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) != 0) {
+ SYSerr(SYS_F_SETSOCKOPT, get_last_socket_error());
+ BIOerr(BIO_F_BIO_CONNECT, BIO_R_UNABLE_TO_KEEPALIVE);
+ return 0;
+ }
+ }
+
+ if (options & BIO_SOCK_NODELAY) {
+ if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)) != 0) {
+ SYSerr(SYS_F_SETSOCKOPT, get_last_socket_error());
+ BIOerr(BIO_F_BIO_CONNECT, BIO_R_UNABLE_TO_NODELAY);
+ return 0;
+ }
+ }
+
+ if (connect(sock, BIO_ADDR_sockaddr(addr),
+ BIO_ADDR_sockaddr_size(addr)) == -1) {
+ if (!BIO_sock_should_retry(-1)) {
+ SYSerr(SYS_F_CONNECT, get_last_socket_error());
+ BIOerr(BIO_F_BIO_CONNECT, BIO_R_CONNECT_ERROR);
+ }
+ return 0;
+ }
+ return 1;
+}
+
+/*-
+ * BIO_listen - Creates a listen socket
+ * @sock: the socket to listen with
+ * @addr: local address to bind to
+ * @options: BIO socket options
+ *
+ * Binds to the address using the given socket and options, then
+ * starts listening for incoming connections.
+ *
+ * Options can be a combination of the following:
+ * - BIO_SOCK_KEEPALIVE: enable regularly sending keep-alive messages.
+ * - BIO_SOCK_NONBLOCK: Make the socket non-blocking.
+ * - BIO_SOCK_NODELAY: don't delay small messages.
+ * - BIO_SOCK_REUSEADDR: Try to reuse the address and port combination
+ * for a recently closed port.
+ * - BIO_SOCK_V6_ONLY: When creating an IPv6 socket, make it listen only
+ * for IPv6 addresses and not IPv4 addresses mapped to IPv6.
+ *
+ * It's recommended that you set up both an IPv6 and IPv4 listen socket, and
+ * then check both for new clients that connect to it. You want to set up
+ * the socket as non-blocking in that case since else it could hang.
+ *
+ * Not all operating systems support IPv4 addresses on an IPv6 socket, and for
+ * others it's an option. If you pass the BIO_LISTEN_V6_ONLY it will try to
+ * create the IPv6 sockets to only listen for IPv6 connection.
+ *
+ * It could be that the first BIO_listen() call will listen to all the IPv6
+ * and IPv4 addresses and that then trying to bind to the IPv4 address will
+ * fail. We can't tell the difference between already listening ourself to
+ * it and someone else listening to it when failing and errno is EADDRINUSE, so
+ * it's recommended to not give an error in that case if the first call was
+ * successful.
+ *
+ * When restarting the program it could be that the port is still in use. If
+ * you set to BIO_SOCK_REUSEADDR option it will try to reuse the port anyway.
+ * It's recommended that you use this.
+ */
+int BIO_listen(int sock, const BIO_ADDR *addr, int options)
+{
+ int on = 1;
+ int socktype;
+ socklen_t socktype_len = sizeof(socktype);
+
+ if (sock == -1) {
+ BIOerr(BIO_F_BIO_LISTEN, BIO_R_INVALID_SOCKET);
+ return 0;
+ }
+
+ if (getsockopt(sock, SOL_SOCKET, SO_TYPE, &socktype, &socktype_len) != 0
+ || socktype_len != sizeof(socktype)) {
+ SYSerr(SYS_F_GETSOCKOPT, get_last_socket_error());
+ BIOerr(BIO_F_BIO_LISTEN, BIO_R_GETTING_SOCKTYPE);
+ return 0;
+ }
+
+ if (!BIO_socket_nbio(sock, (options & BIO_SOCK_NONBLOCK) != 0))
+ return 0;
+
+# ifndef OPENSSL_SYS_WINDOWS
+ /*
+ * SO_REUSEADDR has different behavior on Windows than on
+ * other operating systems, don't set it there.
+ */
+ if (options & BIO_SOCK_REUSEADDR) {
+ if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) != 0) {
+ SYSerr(SYS_F_SETSOCKOPT, get_last_socket_error());
+ BIOerr(BIO_F_BIO_LISTEN, BIO_R_UNABLE_TO_REUSEADDR);
+ return 0;
+ }
+ }
+# endif
+
+ if (options & BIO_SOCK_KEEPALIVE) {
+ if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) != 0) {
+ SYSerr(SYS_F_SETSOCKOPT, get_last_socket_error());
+ BIOerr(BIO_F_BIO_LISTEN, BIO_R_UNABLE_TO_KEEPALIVE);
+ return 0;
+ }
+ }
+
+ if (options & BIO_SOCK_NODELAY) {
+ if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)) != 0) {
+ SYSerr(SYS_F_SETSOCKOPT, get_last_socket_error());
+ BIOerr(BIO_F_BIO_LISTEN, BIO_R_UNABLE_TO_NODELAY);
+ return 0;
+ }
+ }
+
+# ifdef IPV6_V6ONLY
+ if (BIO_ADDR_family(addr) == AF_INET6) {
+ /*
+ * Note: Windows default of IPV6_V6ONLY is ON, and Linux is OFF.
+ * Therefore we always have to use setsockopt here.
+ */
+ on = options & BIO_SOCK_V6_ONLY ? 1 : 0;
+ if (setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)) != 0) {
+ SYSerr(SYS_F_SETSOCKOPT, get_last_socket_error());
+ BIOerr(BIO_F_BIO_LISTEN, BIO_R_LISTEN_V6_ONLY);
+ return 0;
+ }
+ }
+# endif
+
+ if (bind(sock, BIO_ADDR_sockaddr(addr), BIO_ADDR_sockaddr_size(addr)) != 0) {
+ SYSerr(SYS_F_BIND, get_last_socket_error());
+ BIOerr(BIO_F_BIO_LISTEN, BIO_R_UNABLE_TO_BIND_SOCKET);
+ return 0;
+ }
+
+ if (socktype != SOCK_DGRAM && listen(sock, MAX_LISTEN) == -1) {
+ SYSerr(SYS_F_LISTEN, get_last_socket_error());
+ BIOerr(BIO_F_BIO_LISTEN, BIO_R_UNABLE_TO_LISTEN_SOCKET);
+ return 0;
+ }
+
+ return 1;
+}
+
+/*-
+ * BIO_accept_ex - Accept new incoming connections
+ * @sock: the listening socket
+ * @addr: the BIO_ADDR to store the peer address in
+ * @options: BIO socket options, applied on the accepted socket.
+ *
+ */
+int BIO_accept_ex(int accept_sock, BIO_ADDR *addr_, int options)
+{
+ socklen_t len;
+ int accepted_sock;
+ BIO_ADDR locaddr;
+ BIO_ADDR *addr = addr_ == NULL ? &locaddr : addr_;
+
+ len = sizeof(*addr);
+ accepted_sock = accept(accept_sock,
+ BIO_ADDR_sockaddr_noconst(addr), &len);
+ if (accepted_sock == -1) {
+ if (!BIO_sock_should_retry(accepted_sock)) {
+ SYSerr(SYS_F_ACCEPT, get_last_socket_error());
+ BIOerr(BIO_F_BIO_ACCEPT_EX, BIO_R_ACCEPT_ERROR);
+ }
+ return INVALID_SOCKET;
+ }
+
+ if (!BIO_socket_nbio(accepted_sock, (options & BIO_SOCK_NONBLOCK) != 0)) {
+ closesocket(accepted_sock);
+ return INVALID_SOCKET;
+ }
+
+ return accepted_sock;
+}
+
+/*-
+ * BIO_closesocket - Close a socket
+ * @sock: the socket to close
+ */
+int BIO_closesocket(int sock)
+{
+ if (closesocket(sock) < 0)
+ return 0;
+ return 1;
+}
+#endif
diff --git a/openssl-1.1.0h/crypto/bio/bf_buff.c b/openssl-1.1.0h/crypto/bio/bf_buff.c
new file mode 100644
index 0000000..8509956
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bf_buff.c
@@ -0,0 +1,455 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "bio_lcl.h"
+#include "internal/cryptlib.h"
+
+static int buffer_write(BIO *h, const char *buf, int num);
+static int buffer_read(BIO *h, char *buf, int size);
+static int buffer_puts(BIO *h, const char *str);
+static int buffer_gets(BIO *h, char *str, int size);
+static long buffer_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int buffer_new(BIO *h);
+static int buffer_free(BIO *data);
+static long buffer_callback_ctrl(BIO *h, int cmd, BIO_info_cb *fp);
+#define DEFAULT_BUFFER_SIZE 4096
+
+static const BIO_METHOD methods_buffer = {
+ BIO_TYPE_BUFFER,
+ "buffer",
+ buffer_write,
+ buffer_read,
+ buffer_puts,
+ buffer_gets,
+ buffer_ctrl,
+ buffer_new,
+ buffer_free,
+ buffer_callback_ctrl,
+};
+
+const BIO_METHOD *BIO_f_buffer(void)
+{
+ return (&methods_buffer);
+}
+
+static int buffer_new(BIO *bi)
+{
+ BIO_F_BUFFER_CTX *ctx = OPENSSL_zalloc(sizeof(*ctx));
+
+ if (ctx == NULL)
+ return (0);
+ ctx->ibuf_size = DEFAULT_BUFFER_SIZE;
+ ctx->ibuf = OPENSSL_malloc(DEFAULT_BUFFER_SIZE);
+ if (ctx->ibuf == NULL) {
+ OPENSSL_free(ctx);
+ return (0);
+ }
+ ctx->obuf_size = DEFAULT_BUFFER_SIZE;
+ ctx->obuf = OPENSSL_malloc(DEFAULT_BUFFER_SIZE);
+ if (ctx->obuf == NULL) {
+ OPENSSL_free(ctx->ibuf);
+ OPENSSL_free(ctx);
+ return (0);
+ }
+
+ bi->init = 1;
+ bi->ptr = (char *)ctx;
+ bi->flags = 0;
+ return (1);
+}
+
+static int buffer_free(BIO *a)
+{
+ BIO_F_BUFFER_CTX *b;
+
+ if (a == NULL)
+ return (0);
+ b = (BIO_F_BUFFER_CTX *)a->ptr;
+ OPENSSL_free(b->ibuf);
+ OPENSSL_free(b->obuf);
+ OPENSSL_free(a->ptr);
+ a->ptr = NULL;
+ a->init = 0;
+ a->flags = 0;
+ return (1);
+}
+
+static int buffer_read(BIO *b, char *out, int outl)
+{
+ int i, num = 0;
+ BIO_F_BUFFER_CTX *ctx;
+
+ if (out == NULL)
+ return (0);
+ ctx = (BIO_F_BUFFER_CTX *)b->ptr;
+
+ if ((ctx == NULL) || (b->next_bio == NULL))
+ return (0);
+ num = 0;
+ BIO_clear_retry_flags(b);
+
+ start:
+ i = ctx->ibuf_len;
+ /* If there is stuff left over, grab it */
+ if (i != 0) {
+ if (i > outl)
+ i = outl;
+ memcpy(out, &(ctx->ibuf[ctx->ibuf_off]), i);
+ ctx->ibuf_off += i;
+ ctx->ibuf_len -= i;
+ num += i;
+ if (outl == i)
+ return (num);
+ outl -= i;
+ out += i;
+ }
+
+ /*
+ * We may have done a partial read. try to do more. We have nothing in
+ * the buffer. If we get an error and have read some data, just return it
+ * and let them retry to get the error again. copy direct to parent
+ * address space
+ */
+ if (outl > ctx->ibuf_size) {
+ for (;;) {
+ i = BIO_read(b->next_bio, out, outl);
+ if (i <= 0) {
+ BIO_copy_next_retry(b);
+ if (i < 0)
+ return ((num > 0) ? num : i);
+ if (i == 0)
+ return (num);
+ }
+ num += i;
+ if (outl == i)
+ return (num);
+ out += i;
+ outl -= i;
+ }
+ }
+ /* else */
+
+ /* we are going to be doing some buffering */
+ i = BIO_read(b->next_bio, ctx->ibuf, ctx->ibuf_size);
+ if (i <= 0) {
+ BIO_copy_next_retry(b);
+ if (i < 0)
+ return ((num > 0) ? num : i);
+ if (i == 0)
+ return (num);
+ }
+ ctx->ibuf_off = 0;
+ ctx->ibuf_len = i;
+
+ /* Lets re-read using ourselves :-) */
+ goto start;
+}
+
+static int buffer_write(BIO *b, const char *in, int inl)
+{
+ int i, num = 0;
+ BIO_F_BUFFER_CTX *ctx;
+
+ if ((in == NULL) || (inl <= 0))
+ return (0);
+ ctx = (BIO_F_BUFFER_CTX *)b->ptr;
+ if ((ctx == NULL) || (b->next_bio == NULL))
+ return (0);
+
+ BIO_clear_retry_flags(b);
+ start:
+ i = ctx->obuf_size - (ctx->obuf_len + ctx->obuf_off);
+ /* add to buffer and return */
+ if (i >= inl) {
+ memcpy(&(ctx->obuf[ctx->obuf_off + ctx->obuf_len]), in, inl);
+ ctx->obuf_len += inl;
+ return (num + inl);
+ }
+ /* else */
+ /* stuff already in buffer, so add to it first, then flush */
+ if (ctx->obuf_len != 0) {
+ if (i > 0) { /* lets fill it up if we can */
+ memcpy(&(ctx->obuf[ctx->obuf_off + ctx->obuf_len]), in, i);
+ in += i;
+ inl -= i;
+ num += i;
+ ctx->obuf_len += i;
+ }
+ /* we now have a full buffer needing flushing */
+ for (;;) {
+ i = BIO_write(b->next_bio, &(ctx->obuf[ctx->obuf_off]),
+ ctx->obuf_len);
+ if (i <= 0) {
+ BIO_copy_next_retry(b);
+
+ if (i < 0)
+ return ((num > 0) ? num : i);
+ if (i == 0)
+ return (num);
+ }
+ ctx->obuf_off += i;
+ ctx->obuf_len -= i;
+ if (ctx->obuf_len == 0)
+ break;
+ }
+ }
+ /*
+ * we only get here if the buffer has been flushed and we still have
+ * stuff to write
+ */
+ ctx->obuf_off = 0;
+
+ /* we now have inl bytes to write */
+ while (inl >= ctx->obuf_size) {
+ i = BIO_write(b->next_bio, in, inl);
+ if (i <= 0) {
+ BIO_copy_next_retry(b);
+ if (i < 0)
+ return ((num > 0) ? num : i);
+ if (i == 0)
+ return (num);
+ }
+ num += i;
+ in += i;
+ inl -= i;
+ if (inl == 0)
+ return (num);
+ }
+
+ /*
+ * copy the rest into the buffer since we have only a small amount left
+ */
+ goto start;
+}
+
+static long buffer_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ BIO *dbio;
+ BIO_F_BUFFER_CTX *ctx;
+ long ret = 1;
+ char *p1, *p2;
+ int r, i, *ip;
+ int ibs, obs;
+
+ ctx = (BIO_F_BUFFER_CTX *)b->ptr;
+
+ switch (cmd) {
+ case BIO_CTRL_RESET:
+ ctx->ibuf_off = 0;
+ ctx->ibuf_len = 0;
+ ctx->obuf_off = 0;
+ ctx->obuf_len = 0;
+ if (b->next_bio == NULL)
+ return (0);
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ break;
+ case BIO_CTRL_INFO:
+ ret = (long)ctx->obuf_len;
+ break;
+ case BIO_C_GET_BUFF_NUM_LINES:
+ ret = 0;
+ p1 = ctx->ibuf;
+ for (i = 0; i < ctx->ibuf_len; i++) {
+ if (p1[ctx->ibuf_off + i] == '\n')
+ ret++;
+ }
+ break;
+ case BIO_CTRL_WPENDING:
+ ret = (long)ctx->obuf_len;
+ if (ret == 0) {
+ if (b->next_bio == NULL)
+ return (0);
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ }
+ break;
+ case BIO_CTRL_PENDING:
+ ret = (long)ctx->ibuf_len;
+ if (ret == 0) {
+ if (b->next_bio == NULL)
+ return (0);
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ }
+ break;
+ case BIO_C_SET_BUFF_READ_DATA:
+ if (num > ctx->ibuf_size) {
+ p1 = OPENSSL_malloc((int)num);
+ if (p1 == NULL)
+ goto malloc_error;
+ OPENSSL_free(ctx->ibuf);
+ ctx->ibuf = p1;
+ }
+ ctx->ibuf_off = 0;
+ ctx->ibuf_len = (int)num;
+ memcpy(ctx->ibuf, ptr, (int)num);
+ ret = 1;
+ break;
+ case BIO_C_SET_BUFF_SIZE:
+ if (ptr != NULL) {
+ ip = (int *)ptr;
+ if (*ip == 0) {
+ ibs = (int)num;
+ obs = ctx->obuf_size;
+ } else { /* if (*ip == 1) */
+
+ ibs = ctx->ibuf_size;
+ obs = (int)num;
+ }
+ } else {
+ ibs = (int)num;
+ obs = (int)num;
+ }
+ p1 = ctx->ibuf;
+ p2 = ctx->obuf;
+ if ((ibs > DEFAULT_BUFFER_SIZE) && (ibs != ctx->ibuf_size)) {
+ p1 = OPENSSL_malloc((int)num);
+ if (p1 == NULL)
+ goto malloc_error;
+ }
+ if ((obs > DEFAULT_BUFFER_SIZE) && (obs != ctx->obuf_size)) {
+ p2 = OPENSSL_malloc((int)num);
+ if (p2 == NULL) {
+ if (p1 != ctx->ibuf)
+ OPENSSL_free(p1);
+ goto malloc_error;
+ }
+ }
+ if (ctx->ibuf != p1) {
+ OPENSSL_free(ctx->ibuf);
+ ctx->ibuf = p1;
+ ctx->ibuf_off = 0;
+ ctx->ibuf_len = 0;
+ ctx->ibuf_size = ibs;
+ }
+ if (ctx->obuf != p2) {
+ OPENSSL_free(ctx->obuf);
+ ctx->obuf = p2;
+ ctx->obuf_off = 0;
+ ctx->obuf_len = 0;
+ ctx->obuf_size = obs;
+ }
+ break;
+ case BIO_C_DO_STATE_MACHINE:
+ if (b->next_bio == NULL)
+ return (0);
+ BIO_clear_retry_flags(b);
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ BIO_copy_next_retry(b);
+ break;
+
+ case BIO_CTRL_FLUSH:
+ if (b->next_bio == NULL)
+ return (0);
+ if (ctx->obuf_len <= 0) {
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ break;
+ }
+
+ for (;;) {
+ BIO_clear_retry_flags(b);
+ if (ctx->obuf_len > 0) {
+ r = BIO_write(b->next_bio,
+ &(ctx->obuf[ctx->obuf_off]), ctx->obuf_len);
+ BIO_copy_next_retry(b);
+ if (r <= 0)
+ return ((long)r);
+ ctx->obuf_off += r;
+ ctx->obuf_len -= r;
+ } else {
+ ctx->obuf_len = 0;
+ ctx->obuf_off = 0;
+ break;
+ }
+ }
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ break;
+ case BIO_CTRL_DUP:
+ dbio = (BIO *)ptr;
+ if (!BIO_set_read_buffer_size(dbio, ctx->ibuf_size) ||
+ !BIO_set_write_buffer_size(dbio, ctx->obuf_size))
+ ret = 0;
+ break;
+ default:
+ if (b->next_bio == NULL)
+ return (0);
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ break;
+ }
+ return (ret);
+ malloc_error:
+ BIOerr(BIO_F_BUFFER_CTRL, ERR_R_MALLOC_FAILURE);
+ return (0);
+}
+
+static long buffer_callback_ctrl(BIO *b, int cmd, BIO_info_cb *fp)
+{
+ long ret = 1;
+
+ if (b->next_bio == NULL)
+ return (0);
+ switch (cmd) {
+ default:
+ ret = BIO_callback_ctrl(b->next_bio, cmd, fp);
+ break;
+ }
+ return (ret);
+}
+
+static int buffer_gets(BIO *b, char *buf, int size)
+{
+ BIO_F_BUFFER_CTX *ctx;
+ int num = 0, i, flag;
+ char *p;
+
+ ctx = (BIO_F_BUFFER_CTX *)b->ptr;
+ size--; /* reserve space for a '\0' */
+ BIO_clear_retry_flags(b);
+
+ for (;;) {
+ if (ctx->ibuf_len > 0) {
+ p = &(ctx->ibuf[ctx->ibuf_off]);
+ flag = 0;
+ for (i = 0; (i < ctx->ibuf_len) && (i < size); i++) {
+ *(buf++) = p[i];
+ if (p[i] == '\n') {
+ flag = 1;
+ i++;
+ break;
+ }
+ }
+ num += i;
+ size -= i;
+ ctx->ibuf_len -= i;
+ ctx->ibuf_off += i;
+ if (flag || size == 0) {
+ *buf = '\0';
+ return (num);
+ }
+ } else { /* read another chunk */
+
+ i = BIO_read(b->next_bio, ctx->ibuf, ctx->ibuf_size);
+ if (i <= 0) {
+ BIO_copy_next_retry(b);
+ *buf = '\0';
+ if (i < 0)
+ return ((num > 0) ? num : i);
+ if (i == 0)
+ return (num);
+ }
+ ctx->ibuf_len = i;
+ ctx->ibuf_off = 0;
+ }
+ }
+}
+
+static int buffer_puts(BIO *b, const char *str)
+{
+ return (buffer_write(b, str, strlen(str)));
+}
diff --git a/openssl-1.1.0h/crypto/bio/bf_lbuf.c b/openssl-1.1.0h/crypto/bio/bf_lbuf.c
new file mode 100644
index 0000000..a80f899
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bf_lbuf.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "bio_lcl.h"
+#include "internal/cryptlib.h"
+#include <openssl/evp.h>
+
+static int linebuffer_write(BIO *h, const char *buf, int num);
+static int linebuffer_read(BIO *h, char *buf, int size);
+static int linebuffer_puts(BIO *h, const char *str);
+static int linebuffer_gets(BIO *h, char *str, int size);
+static long linebuffer_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int linebuffer_new(BIO *h);
+static int linebuffer_free(BIO *data);
+static long linebuffer_callback_ctrl(BIO *h, int cmd, BIO_info_cb *fp);
+
+/* A 10k maximum should be enough for most purposes */
+#define DEFAULT_LINEBUFFER_SIZE 1024*10
+
+/* #define DEBUG */
+
+static const BIO_METHOD methods_linebuffer = {
+ BIO_TYPE_LINEBUFFER,
+ "linebuffer",
+ linebuffer_write,
+ linebuffer_read,
+ linebuffer_puts,
+ linebuffer_gets,
+ linebuffer_ctrl,
+ linebuffer_new,
+ linebuffer_free,
+ linebuffer_callback_ctrl,
+};
+
+const BIO_METHOD *BIO_f_linebuffer(void)
+{
+ return (&methods_linebuffer);
+}
+
+typedef struct bio_linebuffer_ctx_struct {
+ char *obuf; /* the output char array */
+ int obuf_size; /* how big is the output buffer */
+ int obuf_len; /* how many bytes are in it */
+} BIO_LINEBUFFER_CTX;
+
+static int linebuffer_new(BIO *bi)
+{
+ BIO_LINEBUFFER_CTX *ctx;
+
+ ctx = OPENSSL_malloc(sizeof(*ctx));
+ if (ctx == NULL)
+ return (0);
+ ctx->obuf = OPENSSL_malloc(DEFAULT_LINEBUFFER_SIZE);
+ if (ctx->obuf == NULL) {
+ OPENSSL_free(ctx);
+ return (0);
+ }
+ ctx->obuf_size = DEFAULT_LINEBUFFER_SIZE;
+ ctx->obuf_len = 0;
+
+ bi->init = 1;
+ bi->ptr = (char *)ctx;
+ bi->flags = 0;
+ return (1);
+}
+
+static int linebuffer_free(BIO *a)
+{
+ BIO_LINEBUFFER_CTX *b;
+
+ if (a == NULL)
+ return (0);
+ b = (BIO_LINEBUFFER_CTX *)a->ptr;
+ OPENSSL_free(b->obuf);
+ OPENSSL_free(a->ptr);
+ a->ptr = NULL;
+ a->init = 0;
+ a->flags = 0;
+ return (1);
+}
+
+static int linebuffer_read(BIO *b, char *out, int outl)
+{
+ int ret = 0;
+
+ if (out == NULL)
+ return (0);
+ if (b->next_bio == NULL)
+ return (0);
+ ret = BIO_read(b->next_bio, out, outl);
+ BIO_clear_retry_flags(b);
+ BIO_copy_next_retry(b);
+ return (ret);
+}
+
+static int linebuffer_write(BIO *b, const char *in, int inl)
+{
+ int i, num = 0, foundnl;
+ BIO_LINEBUFFER_CTX *ctx;
+
+ if ((in == NULL) || (inl <= 0))
+ return (0);
+ ctx = (BIO_LINEBUFFER_CTX *)b->ptr;
+ if ((ctx == NULL) || (b->next_bio == NULL))
+ return (0);
+
+ BIO_clear_retry_flags(b);
+
+ do {
+ const char *p;
+ char c;
+
+ for (p = in, c = '\0'; p < in + inl && (c = *p) != '\n'; p++) ;
+ if (c == '\n') {
+ p++;
+ foundnl = 1;
+ } else
+ foundnl = 0;
+
+ /*
+ * If a NL was found and we already have text in the save buffer,
+ * concatenate them and write
+ */
+ while ((foundnl || p - in > ctx->obuf_size - ctx->obuf_len)
+ && ctx->obuf_len > 0) {
+ int orig_olen = ctx->obuf_len;
+
+ i = ctx->obuf_size - ctx->obuf_len;
+ if (p - in > 0) {
+ if (i >= p - in) {
+ memcpy(&(ctx->obuf[ctx->obuf_len]), in, p - in);
+ ctx->obuf_len += p - in;
+ inl -= p - in;
+ num += p - in;
+ in = p;
+ } else {
+ memcpy(&(ctx->obuf[ctx->obuf_len]), in, i);
+ ctx->obuf_len += i;
+ inl -= i;
+ in += i;
+ num += i;
+ }
+ }
+ i = BIO_write(b->next_bio, ctx->obuf, ctx->obuf_len);
+ if (i <= 0) {
+ ctx->obuf_len = orig_olen;
+ BIO_copy_next_retry(b);
+
+ if (i < 0)
+ return ((num > 0) ? num : i);
+ if (i == 0)
+ return (num);
+ }
+ if (i < ctx->obuf_len)
+ memmove(ctx->obuf, ctx->obuf + i, ctx->obuf_len - i);
+ ctx->obuf_len -= i;
+ }
+
+ /*
+ * Now that the save buffer is emptied, let's write the input buffer
+ * if a NL was found and there is anything to write.
+ */
+ if ((foundnl || p - in > ctx->obuf_size) && p - in > 0) {
+ i = BIO_write(b->next_bio, in, p - in);
+ if (i <= 0) {
+ BIO_copy_next_retry(b);
+ if (i < 0)
+ return ((num > 0) ? num : i);
+ if (i == 0)
+ return (num);
+ }
+ num += i;
+ in += i;
+ inl -= i;
+ }
+ }
+ while (foundnl && inl > 0);
+ /*
+ * We've written as much as we can. The rest of the input buffer, if
+ * any, is text that doesn't and with a NL and therefore needs to be
+ * saved for the next trip.
+ */
+ if (inl > 0) {
+ memcpy(&(ctx->obuf[ctx->obuf_len]), in, inl);
+ ctx->obuf_len += inl;
+ num += inl;
+ }
+ return num;
+}
+
+static long linebuffer_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ BIO *dbio;
+ BIO_LINEBUFFER_CTX *ctx;
+ long ret = 1;
+ char *p;
+ int r;
+ int obs;
+
+ ctx = (BIO_LINEBUFFER_CTX *)b->ptr;
+
+ switch (cmd) {
+ case BIO_CTRL_RESET:
+ ctx->obuf_len = 0;
+ if (b->next_bio == NULL)
+ return (0);
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ break;
+ case BIO_CTRL_INFO:
+ ret = (long)ctx->obuf_len;
+ break;
+ case BIO_CTRL_WPENDING:
+ ret = (long)ctx->obuf_len;
+ if (ret == 0) {
+ if (b->next_bio == NULL)
+ return (0);
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ }
+ break;
+ case BIO_C_SET_BUFF_SIZE:
+ obs = (int)num;
+ p = ctx->obuf;
+ if ((obs > DEFAULT_LINEBUFFER_SIZE) && (obs != ctx->obuf_size)) {
+ p = OPENSSL_malloc((int)num);
+ if (p == NULL)
+ goto malloc_error;
+ }
+ if (ctx->obuf != p) {
+ if (ctx->obuf_len > obs) {
+ ctx->obuf_len = obs;
+ }
+ memcpy(p, ctx->obuf, ctx->obuf_len);
+ OPENSSL_free(ctx->obuf);
+ ctx->obuf = p;
+ ctx->obuf_size = obs;
+ }
+ break;
+ case BIO_C_DO_STATE_MACHINE:
+ if (b->next_bio == NULL)
+ return (0);
+ BIO_clear_retry_flags(b);
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ BIO_copy_next_retry(b);
+ break;
+
+ case BIO_CTRL_FLUSH:
+ if (b->next_bio == NULL)
+ return (0);
+ if (ctx->obuf_len <= 0) {
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ break;
+ }
+
+ for (;;) {
+ BIO_clear_retry_flags(b);
+ if (ctx->obuf_len > 0) {
+ r = BIO_write(b->next_bio, ctx->obuf, ctx->obuf_len);
+ BIO_copy_next_retry(b);
+ if (r <= 0)
+ return ((long)r);
+ if (r < ctx->obuf_len)
+ memmove(ctx->obuf, ctx->obuf + r, ctx->obuf_len - r);
+ ctx->obuf_len -= r;
+ } else {
+ ctx->obuf_len = 0;
+ break;
+ }
+ }
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ break;
+ case BIO_CTRL_DUP:
+ dbio = (BIO *)ptr;
+ if (!BIO_set_write_buffer_size(dbio, ctx->obuf_size))
+ ret = 0;
+ break;
+ default:
+ if (b->next_bio == NULL)
+ return (0);
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ break;
+ }
+ return (ret);
+ malloc_error:
+ BIOerr(BIO_F_LINEBUFFER_CTRL, ERR_R_MALLOC_FAILURE);
+ return (0);
+}
+
+static long linebuffer_callback_ctrl(BIO *b, int cmd, BIO_info_cb *fp)
+{
+ long ret = 1;
+
+ if (b->next_bio == NULL)
+ return (0);
+ switch (cmd) {
+ default:
+ ret = BIO_callback_ctrl(b->next_bio, cmd, fp);
+ break;
+ }
+ return (ret);
+}
+
+static int linebuffer_gets(BIO *b, char *buf, int size)
+{
+ if (b->next_bio == NULL)
+ return (0);
+ return (BIO_gets(b->next_bio, buf, size));
+}
+
+static int linebuffer_puts(BIO *b, const char *str)
+{
+ return (linebuffer_write(b, str, strlen(str)));
+}
diff --git a/openssl-1.1.0h/crypto/bio/bf_nbio.c b/openssl-1.1.0h/crypto/bio/bf_nbio.c
new file mode 100644
index 0000000..3328506
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bf_nbio.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "bio_lcl.h"
+#include "internal/cryptlib.h"
+#include <openssl/rand.h>
+
+/*
+ * BIO_put and BIO_get both add to the digest, BIO_gets returns the digest
+ */
+
+static int nbiof_write(BIO *h, const char *buf, int num);
+static int nbiof_read(BIO *h, char *buf, int size);
+static int nbiof_puts(BIO *h, const char *str);
+static int nbiof_gets(BIO *h, char *str, int size);
+static long nbiof_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int nbiof_new(BIO *h);
+static int nbiof_free(BIO *data);
+static long nbiof_callback_ctrl(BIO *h, int cmd, BIO_info_cb *fp);
+typedef struct nbio_test_st {
+ /* only set if we sent a 'should retry' error */
+ int lrn;
+ int lwn;
+} NBIO_TEST;
+
+static const BIO_METHOD methods_nbiof = {
+ BIO_TYPE_NBIO_TEST,
+ "non-blocking IO test filter",
+ nbiof_write,
+ nbiof_read,
+ nbiof_puts,
+ nbiof_gets,
+ nbiof_ctrl,
+ nbiof_new,
+ nbiof_free,
+ nbiof_callback_ctrl,
+};
+
+const BIO_METHOD *BIO_f_nbio_test(void)
+{
+ return (&methods_nbiof);
+}
+
+static int nbiof_new(BIO *bi)
+{
+ NBIO_TEST *nt;
+
+ if ((nt = OPENSSL_zalloc(sizeof(*nt))) == NULL)
+ return (0);
+ nt->lrn = -1;
+ nt->lwn = -1;
+ bi->ptr = (char *)nt;
+ bi->init = 1;
+ return (1);
+}
+
+static int nbiof_free(BIO *a)
+{
+ if (a == NULL)
+ return (0);
+ OPENSSL_free(a->ptr);
+ a->ptr = NULL;
+ a->init = 0;
+ a->flags = 0;
+ return (1);
+}
+
+static int nbiof_read(BIO *b, char *out, int outl)
+{
+ int ret = 0;
+ int num;
+ unsigned char n;
+
+ if (out == NULL)
+ return (0);
+ if (b->next_bio == NULL)
+ return (0);
+
+ BIO_clear_retry_flags(b);
+ if (RAND_bytes(&n, 1) <= 0)
+ return -1;
+ num = (n & 0x07);
+
+ if (outl > num)
+ outl = num;
+
+ if (num == 0) {
+ ret = -1;
+ BIO_set_retry_read(b);
+ } else {
+ ret = BIO_read(b->next_bio, out, outl);
+ if (ret < 0)
+ BIO_copy_next_retry(b);
+ }
+ return (ret);
+}
+
+static int nbiof_write(BIO *b, const char *in, int inl)
+{
+ NBIO_TEST *nt;
+ int ret = 0;
+ int num;
+ unsigned char n;
+
+ if ((in == NULL) || (inl <= 0))
+ return (0);
+ if (b->next_bio == NULL)
+ return (0);
+ nt = (NBIO_TEST *)b->ptr;
+
+ BIO_clear_retry_flags(b);
+
+ if (nt->lwn > 0) {
+ num = nt->lwn;
+ nt->lwn = 0;
+ } else {
+ if (RAND_bytes(&n, 1) <= 0)
+ return -1;
+ num = (n & 7);
+ }
+
+ if (inl > num)
+ inl = num;
+
+ if (num == 0) {
+ ret = -1;
+ BIO_set_retry_write(b);
+ } else {
+ ret = BIO_write(b->next_bio, in, inl);
+ if (ret < 0) {
+ BIO_copy_next_retry(b);
+ nt->lwn = inl;
+ }
+ }
+ return (ret);
+}
+
+static long nbiof_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ long ret;
+
+ if (b->next_bio == NULL)
+ return (0);
+ switch (cmd) {
+ case BIO_C_DO_STATE_MACHINE:
+ BIO_clear_retry_flags(b);
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ BIO_copy_next_retry(b);
+ break;
+ case BIO_CTRL_DUP:
+ ret = 0L;
+ break;
+ default:
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ break;
+ }
+ return (ret);
+}
+
+static long nbiof_callback_ctrl(BIO *b, int cmd, BIO_info_cb *fp)
+{
+ long ret = 1;
+
+ if (b->next_bio == NULL)
+ return (0);
+ switch (cmd) {
+ default:
+ ret = BIO_callback_ctrl(b->next_bio, cmd, fp);
+ break;
+ }
+ return (ret);
+}
+
+static int nbiof_gets(BIO *bp, char *buf, int size)
+{
+ if (bp->next_bio == NULL)
+ return (0);
+ return (BIO_gets(bp->next_bio, buf, size));
+}
+
+static int nbiof_puts(BIO *bp, const char *str)
+{
+ if (bp->next_bio == NULL)
+ return (0);
+ return (BIO_puts(bp->next_bio, str));
+}
diff --git a/openssl-1.1.0h/crypto/bio/bf_null.c b/openssl-1.1.0h/crypto/bio/bf_null.c
new file mode 100644
index 0000000..6b86aa5
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bf_null.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "bio_lcl.h"
+#include "internal/cryptlib.h"
+
+/*
+ * BIO_put and BIO_get both add to the digest, BIO_gets returns the digest
+ */
+
+static int nullf_write(BIO *h, const char *buf, int num);
+static int nullf_read(BIO *h, char *buf, int size);
+static int nullf_puts(BIO *h, const char *str);
+static int nullf_gets(BIO *h, char *str, int size);
+static long nullf_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static long nullf_callback_ctrl(BIO *h, int cmd, BIO_info_cb *fp);
+static const BIO_METHOD methods_nullf = {
+ BIO_TYPE_NULL_FILTER,
+ "NULL filter",
+ nullf_write,
+ nullf_read,
+ nullf_puts,
+ nullf_gets,
+ nullf_ctrl,
+ NULL,
+ NULL,
+ nullf_callback_ctrl,
+};
+
+const BIO_METHOD *BIO_f_null(void)
+{
+ return (&methods_nullf);
+}
+
+static int nullf_read(BIO *b, char *out, int outl)
+{
+ int ret = 0;
+
+ if (out == NULL)
+ return (0);
+ if (b->next_bio == NULL)
+ return (0);
+ ret = BIO_read(b->next_bio, out, outl);
+ BIO_clear_retry_flags(b);
+ BIO_copy_next_retry(b);
+ return (ret);
+}
+
+static int nullf_write(BIO *b, const char *in, int inl)
+{
+ int ret = 0;
+
+ if ((in == NULL) || (inl <= 0))
+ return (0);
+ if (b->next_bio == NULL)
+ return (0);
+ ret = BIO_write(b->next_bio, in, inl);
+ BIO_clear_retry_flags(b);
+ BIO_copy_next_retry(b);
+ return (ret);
+}
+
+static long nullf_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ long ret;
+
+ if (b->next_bio == NULL)
+ return (0);
+ switch (cmd) {
+ case BIO_C_DO_STATE_MACHINE:
+ BIO_clear_retry_flags(b);
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ BIO_copy_next_retry(b);
+ break;
+ case BIO_CTRL_DUP:
+ ret = 0L;
+ break;
+ default:
+ ret = BIO_ctrl(b->next_bio, cmd, num, ptr);
+ }
+ return (ret);
+}
+
+static long nullf_callback_ctrl(BIO *b, int cmd, BIO_info_cb *fp)
+{
+ long ret = 1;
+
+ if (b->next_bio == NULL)
+ return (0);
+ switch (cmd) {
+ default:
+ ret = BIO_callback_ctrl(b->next_bio, cmd, fp);
+ break;
+ }
+ return (ret);
+}
+
+static int nullf_gets(BIO *bp, char *buf, int size)
+{
+ if (bp->next_bio == NULL)
+ return (0);
+ return (BIO_gets(bp->next_bio, buf, size));
+}
+
+static int nullf_puts(BIO *bp, const char *str)
+{
+ if (bp->next_bio == NULL)
+ return (0);
+ return (BIO_puts(bp->next_bio, str));
+}
diff --git a/openssl-1.1.0h/crypto/bio/bio_cb.c b/openssl-1.1.0h/crypto/bio/bio_cb.c
new file mode 100644
index 0000000..412387b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bio_cb.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "bio_lcl.h"
+#include "internal/cryptlib.h"
+#include <openssl/err.h>
+
+long BIO_debug_callback(BIO *bio, int cmd, const char *argp,
+ int argi, long argl, long ret)
+{
+ BIO *b;
+ char buf[256];
+ char *p;
+ long r = 1;
+ int len;
+ size_t p_maxlen;
+
+ if (BIO_CB_RETURN & cmd)
+ r = ret;
+
+ len = BIO_snprintf(buf, sizeof(buf), "BIO[%p]: ", (void *)bio);
+
+ /* Ignore errors and continue printing the other information. */
+ if (len < 0)
+ len = 0;
+ p = buf + len;
+ p_maxlen = sizeof(buf) - len;
+
+ switch (cmd) {
+ case BIO_CB_FREE:
+ BIO_snprintf(p, p_maxlen, "Free - %s\n", bio->method->name);
+ break;
+ case BIO_CB_READ:
+ if (bio->method->type & BIO_TYPE_DESCRIPTOR)
+ BIO_snprintf(p, p_maxlen, "read(%d,%lu) - %s fd=%d\n",
+ bio->num, (unsigned long)argi,
+ bio->method->name, bio->num);
+ else
+ BIO_snprintf(p, p_maxlen, "read(%d,%lu) - %s\n",
+ bio->num, (unsigned long)argi, bio->method->name);
+ break;
+ case BIO_CB_WRITE:
+ if (bio->method->type & BIO_TYPE_DESCRIPTOR)
+ BIO_snprintf(p, p_maxlen, "write(%d,%lu) - %s fd=%d\n",
+ bio->num, (unsigned long)argi,
+ bio->method->name, bio->num);
+ else
+ BIO_snprintf(p, p_maxlen, "write(%d,%lu) - %s\n",
+ bio->num, (unsigned long)argi, bio->method->name);
+ break;
+ case BIO_CB_PUTS:
+ BIO_snprintf(p, p_maxlen, "puts() - %s\n", bio->method->name);
+ break;
+ case BIO_CB_GETS:
+ BIO_snprintf(p, p_maxlen, "gets(%lu) - %s\n", (unsigned long)argi,
+ bio->method->name);
+ break;
+ case BIO_CB_CTRL:
+ BIO_snprintf(p, p_maxlen, "ctrl(%lu) - %s\n", (unsigned long)argi,
+ bio->method->name);
+ break;
+ case BIO_CB_RETURN | BIO_CB_READ:
+ BIO_snprintf(p, p_maxlen, "read return %ld\n", ret);
+ break;
+ case BIO_CB_RETURN | BIO_CB_WRITE:
+ BIO_snprintf(p, p_maxlen, "write return %ld\n", ret);
+ break;
+ case BIO_CB_RETURN | BIO_CB_GETS:
+ BIO_snprintf(p, p_maxlen, "gets return %ld\n", ret);
+ break;
+ case BIO_CB_RETURN | BIO_CB_PUTS:
+ BIO_snprintf(p, p_maxlen, "puts return %ld\n", ret);
+ break;
+ case BIO_CB_RETURN | BIO_CB_CTRL:
+ BIO_snprintf(p, p_maxlen, "ctrl return %ld\n", ret);
+ break;
+ default:
+ BIO_snprintf(p, p_maxlen, "bio callback - unknown type (%d)\n", cmd);
+ break;
+ }
+
+ b = (BIO *)bio->cb_arg;
+ if (b != NULL)
+ BIO_write(b, buf, strlen(buf));
+#if !defined(OPENSSL_NO_STDIO)
+ else
+ fputs(buf, stderr);
+#endif
+ return (r);
+}
diff --git a/openssl-1.1.0h/crypto/bio/bio_err.c b/openssl-1.1.0h/crypto/bio/bio_err.c
new file mode 100644
index 0000000..c914dcf
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bio_err.c
@@ -0,0 +1,126 @@
+/*
+ * Generated by util/mkerr.pl DO NOT EDIT
+ * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <openssl/err.h>
+#include <openssl/bio.h>
+
+/* BEGIN ERROR CODES */
+#ifndef OPENSSL_NO_ERR
+
+# define ERR_FUNC(func) ERR_PACK(ERR_LIB_BIO,func,0)
+# define ERR_REASON(reason) ERR_PACK(ERR_LIB_BIO,0,reason)
+
+static ERR_STRING_DATA BIO_str_functs[] = {
+ {ERR_FUNC(BIO_F_ACPT_STATE), "acpt_state"},
+ {ERR_FUNC(BIO_F_ADDR_STRINGS), "addr_strings"},
+ {ERR_FUNC(BIO_F_BIO_ACCEPT), "BIO_accept"},
+ {ERR_FUNC(BIO_F_BIO_ACCEPT_EX), "BIO_accept_ex"},
+ {ERR_FUNC(BIO_F_BIO_ADDR_NEW), "BIO_ADDR_new"},
+ {ERR_FUNC(BIO_F_BIO_CALLBACK_CTRL), "BIO_callback_ctrl"},
+ {ERR_FUNC(BIO_F_BIO_CONNECT), "BIO_connect"},
+ {ERR_FUNC(BIO_F_BIO_CTRL), "BIO_ctrl"},
+ {ERR_FUNC(BIO_F_BIO_GETS), "BIO_gets"},
+ {ERR_FUNC(BIO_F_BIO_GET_HOST_IP), "BIO_get_host_ip"},
+ {ERR_FUNC(BIO_F_BIO_GET_NEW_INDEX), "BIO_get_new_index"},
+ {ERR_FUNC(BIO_F_BIO_GET_PORT), "BIO_get_port"},
+ {ERR_FUNC(BIO_F_BIO_LISTEN), "BIO_listen"},
+ {ERR_FUNC(BIO_F_BIO_LOOKUP), "BIO_lookup"},
+ {ERR_FUNC(BIO_F_BIO_MAKE_PAIR), "bio_make_pair"},
+ {ERR_FUNC(BIO_F_BIO_METH_NEW), "BIO_meth_new"},
+ {ERR_FUNC(BIO_F_BIO_NEW), "BIO_new"},
+ {ERR_FUNC(BIO_F_BIO_NEW_FILE), "BIO_new_file"},
+ {ERR_FUNC(BIO_F_BIO_NEW_MEM_BUF), "BIO_new_mem_buf"},
+ {ERR_FUNC(BIO_F_BIO_NREAD), "BIO_nread"},
+ {ERR_FUNC(BIO_F_BIO_NREAD0), "BIO_nread0"},
+ {ERR_FUNC(BIO_F_BIO_NWRITE), "BIO_nwrite"},
+ {ERR_FUNC(BIO_F_BIO_NWRITE0), "BIO_nwrite0"},
+ {ERR_FUNC(BIO_F_BIO_PARSE_HOSTSERV), "BIO_parse_hostserv"},
+ {ERR_FUNC(BIO_F_BIO_PUTS), "BIO_puts"},
+ {ERR_FUNC(BIO_F_BIO_READ), "BIO_read"},
+ {ERR_FUNC(BIO_F_BIO_SOCKET), "BIO_socket"},
+ {ERR_FUNC(BIO_F_BIO_SOCKET_NBIO), "BIO_socket_nbio"},
+ {ERR_FUNC(BIO_F_BIO_SOCK_INFO), "BIO_sock_info"},
+ {ERR_FUNC(BIO_F_BIO_SOCK_INIT), "BIO_sock_init"},
+ {ERR_FUNC(BIO_F_BIO_WRITE), "BIO_write"},
+ {ERR_FUNC(BIO_F_BUFFER_CTRL), "buffer_ctrl"},
+ {ERR_FUNC(BIO_F_CONN_CTRL), "conn_ctrl"},
+ {ERR_FUNC(BIO_F_CONN_STATE), "conn_state"},
+ {ERR_FUNC(BIO_F_DGRAM_SCTP_READ), "dgram_sctp_read"},
+ {ERR_FUNC(BIO_F_DGRAM_SCTP_WRITE), "dgram_sctp_write"},
+ {ERR_FUNC(BIO_F_FILE_CTRL), "file_ctrl"},
+ {ERR_FUNC(BIO_F_FILE_READ), "file_read"},
+ {ERR_FUNC(BIO_F_LINEBUFFER_CTRL), "linebuffer_ctrl"},
+ {ERR_FUNC(BIO_F_MEM_WRITE), "mem_write"},
+ {ERR_FUNC(BIO_F_SSL_NEW), "SSL_new"},
+ {0, NULL}
+};
+
+static ERR_STRING_DATA BIO_str_reasons[] = {
+ {ERR_REASON(BIO_R_ACCEPT_ERROR), "accept error"},
+ {ERR_REASON(BIO_R_ADDRINFO_ADDR_IS_NOT_AF_INET),
+ "addrinfo addr is not af inet"},
+ {ERR_REASON(BIO_R_AMBIGUOUS_HOST_OR_SERVICE),
+ "ambiguous host or service"},
+ {ERR_REASON(BIO_R_BAD_FOPEN_MODE), "bad fopen mode"},
+ {ERR_REASON(BIO_R_BROKEN_PIPE), "broken pipe"},
+ {ERR_REASON(BIO_R_CONNECT_ERROR), "connect error"},
+ {ERR_REASON(BIO_R_GETHOSTBYNAME_ADDR_IS_NOT_AF_INET),
+ "gethostbyname addr is not af inet"},
+ {ERR_REASON(BIO_R_GETSOCKNAME_ERROR), "getsockname error"},
+ {ERR_REASON(BIO_R_GETSOCKNAME_TRUNCATED_ADDRESS),
+ "getsockname truncated address"},
+ {ERR_REASON(BIO_R_GETTING_SOCKTYPE), "getting socktype"},
+ {ERR_REASON(BIO_R_INVALID_ARGUMENT), "invalid argument"},
+ {ERR_REASON(BIO_R_INVALID_SOCKET), "invalid socket"},
+ {ERR_REASON(BIO_R_IN_USE), "in use"},
+ {ERR_REASON(BIO_R_LISTEN_V6_ONLY), "listen v6 only"},
+ {ERR_REASON(BIO_R_LOOKUP_RETURNED_NOTHING), "lookup returned nothing"},
+ {ERR_REASON(BIO_R_MALFORMED_HOST_OR_SERVICE),
+ "malformed host or service"},
+ {ERR_REASON(BIO_R_NBIO_CONNECT_ERROR), "nbio connect error"},
+ {ERR_REASON(BIO_R_NO_ACCEPT_ADDR_OR_SERVICE_SPECIFIED),
+ "no accept addr or service specified"},
+ {ERR_REASON(BIO_R_NO_HOSTNAME_OR_SERVICE_SPECIFIED),
+ "no hostname or service specified"},
+ {ERR_REASON(BIO_R_NO_PORT_DEFINED), "no port defined"},
+ {ERR_REASON(BIO_R_NO_SUCH_FILE), "no such file"},
+ {ERR_REASON(BIO_R_NULL_PARAMETER), "null parameter"},
+ {ERR_REASON(BIO_R_UNABLE_TO_BIND_SOCKET), "unable to bind socket"},
+ {ERR_REASON(BIO_R_UNABLE_TO_CREATE_SOCKET), "unable to create socket"},
+ {ERR_REASON(BIO_R_UNABLE_TO_KEEPALIVE), "unable to keepalive"},
+ {ERR_REASON(BIO_R_UNABLE_TO_LISTEN_SOCKET), "unable to listen socket"},
+ {ERR_REASON(BIO_R_UNABLE_TO_NODELAY), "unable to nodelay"},
+ {ERR_REASON(BIO_R_UNABLE_TO_REUSEADDR), "unable to reuseaddr"},
+ {ERR_REASON(BIO_R_UNAVAILABLE_IP_FAMILY), "unavailable ip family"},
+ {ERR_REASON(BIO_R_UNINITIALIZED), "uninitialized"},
+ {ERR_REASON(BIO_R_UNKNOWN_INFO_TYPE), "unknown info type"},
+ {ERR_REASON(BIO_R_UNSUPPORTED_IP_FAMILY), "unsupported ip family"},
+ {ERR_REASON(BIO_R_UNSUPPORTED_METHOD), "unsupported method"},
+ {ERR_REASON(BIO_R_UNSUPPORTED_PROTOCOL_FAMILY),
+ "unsupported protocol family"},
+ {ERR_REASON(BIO_R_WRITE_TO_READ_ONLY_BIO), "write to read only BIO"},
+ {ERR_REASON(BIO_R_WSASTARTUP), "WSAStartup"},
+ {0, NULL}
+};
+
+#endif
+
+int ERR_load_BIO_strings(void)
+{
+#ifndef OPENSSL_NO_ERR
+
+ if (ERR_func_error_string(BIO_str_functs[0].error) == NULL) {
+ ERR_load_strings(0, BIO_str_functs);
+ ERR_load_strings(0, BIO_str_reasons);
+ }
+#endif
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/bio/bio_lcl.h b/openssl-1.1.0h/crypto/bio/bio_lcl.h
new file mode 100644
index 0000000..39178cf
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bio_lcl.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#define USE_SOCKETS
+#include "e_os.h"
+
+/* BEGIN BIO_ADDRINFO/BIO_ADDR stuff. */
+
+#ifndef OPENSSL_NO_SOCK
+/*
+ * Throughout this file and b_addr.c, the existence of the macro
+ * AI_PASSIVE is used to detect the availability of struct addrinfo,
+ * getnameinfo() and getaddrinfo(). If that macro doesn't exist,
+ * we use our own implementation instead.
+ */
+
+/*
+ * It's imperative that these macros get defined before openssl/bio.h gets
+ * included. Otherwise, the AI_PASSIVE hack will not work properly.
+ * For clarity, we check for internal/cryptlib.h since it's a common header
+ * that also includes bio.h.
+ */
+# ifdef HEADER_CRYPTLIB_H
+# error internal/cryptlib.h included before bio_lcl.h
+# endif
+# ifdef HEADER_BIO_H
+# error openssl/bio.h included before bio_lcl.h
+# endif
+
+/*
+ * Undefine AF_UNIX on systems that define it but don't support it.
+ */
+# if defined(OPENSSL_SYS_WINDOWS) || defined(OPENSSL_SYS_VMS)
+# undef AF_UNIX
+# endif
+
+# ifdef AI_PASSIVE
+
+/*
+ * There's a bug in VMS C header file netdb.h, where struct addrinfo
+ * always is the P32 variant, but the functions that handle that structure,
+ * such as getaddrinfo() and freeaddrinfo() adapt to the initial pointer
+ * size. The easiest workaround is to force struct addrinfo to be the
+ * 64-bit variant when compiling in P64 mode.
+ */
+# if defined(OPENSSL_SYS_VMS) && __INITIAL_POINTER_SIZE == 64
+# define addrinfo __addrinfo64
+# endif
+
+# define bio_addrinfo_st addrinfo
+# define bai_family ai_family
+# define bai_socktype ai_socktype
+# define bai_protocol ai_protocol
+# define bai_addrlen ai_addrlen
+# define bai_addr ai_addr
+# define bai_next ai_next
+# else
+struct bio_addrinfo_st {
+ int bai_family;
+ int bai_socktype;
+ int bai_protocol;
+ size_t bai_addrlen;
+ struct sockaddr *bai_addr;
+ struct bio_addrinfo_st *bai_next;
+};
+# endif
+
+union bio_addr_st {
+ struct sockaddr sa;
+# ifdef AF_INET6
+ struct sockaddr_in6 s_in6;
+# endif
+ struct sockaddr_in s_in;
+# ifdef AF_UNIX
+ struct sockaddr_un s_un;
+# endif
+};
+#endif
+
+/* END BIO_ADDRINFO/BIO_ADDR stuff. */
+
+#include "internal/cryptlib.h"
+#include <internal/bio.h>
+
+typedef struct bio_f_buffer_ctx_struct {
+ /*-
+ * Buffers are setup like this:
+ *
+ * <---------------------- size ----------------------->
+ * +---------------------------------------------------+
+ * | consumed | remaining | free space |
+ * +---------------------------------------------------+
+ * <-- off --><------- len ------->
+ */
+ /*- BIO *bio; *//*
+ * this is now in the BIO struct
+ */
+ int ibuf_size; /* how big is the input buffer */
+ int obuf_size; /* how big is the output buffer */
+ char *ibuf; /* the char array */
+ int ibuf_len; /* how many bytes are in it */
+ int ibuf_off; /* write/read offset */
+ char *obuf; /* the char array */
+ int obuf_len; /* how many bytes are in it */
+ int obuf_off; /* write/read offset */
+} BIO_F_BUFFER_CTX;
+
+struct bio_st {
+ const BIO_METHOD *method;
+ /* bio, mode, argp, argi, argl, ret */
+ long (*callback) (struct bio_st *, int, const char *, int, long, long);
+ char *cb_arg; /* first argument for the callback */
+ int init;
+ int shutdown;
+ int flags; /* extra storage */
+ int retry_reason;
+ int num;
+ void *ptr;
+ struct bio_st *next_bio; /* used by filter BIOs */
+ struct bio_st *prev_bio; /* used by filter BIOs */
+ int references;
+ uint64_t num_read;
+ uint64_t num_write;
+ CRYPTO_EX_DATA ex_data;
+ CRYPTO_RWLOCK *lock;
+};
+
+#ifndef OPENSSL_NO_SOCK
+# ifdef OPENSSL_SYS_VMS
+typedef unsigned int socklen_t;
+# endif
+
+extern CRYPTO_RWLOCK *bio_lookup_lock;
+
+int BIO_ADDR_make(BIO_ADDR *ap, const struct sockaddr *sa);
+const struct sockaddr *BIO_ADDR_sockaddr(const BIO_ADDR *ap);
+struct sockaddr *BIO_ADDR_sockaddr_noconst(BIO_ADDR *ap);
+socklen_t BIO_ADDR_sockaddr_size(const BIO_ADDR *ap);
+socklen_t BIO_ADDRINFO_sockaddr_size(const BIO_ADDRINFO *bai);
+const struct sockaddr *BIO_ADDRINFO_sockaddr(const BIO_ADDRINFO *bai);
+#endif
+
+extern CRYPTO_RWLOCK *bio_type_lock;
+
+void bio_sock_cleanup_int(void);
+
+#if BIO_FLAGS_UPLINK==0
+/* Shortcut UPLINK calls on most platforms... */
+# define UP_stdin stdin
+# define UP_stdout stdout
+# define UP_stderr stderr
+# define UP_fprintf fprintf
+# define UP_fgets fgets
+# define UP_fread fread
+# define UP_fwrite fwrite
+# undef UP_fsetmod
+# define UP_feof feof
+# define UP_fclose fclose
+
+# define UP_fopen fopen
+# define UP_fseek fseek
+# define UP_ftell ftell
+# define UP_fflush fflush
+# define UP_ferror ferror
+# ifdef _WIN32
+# define UP_fileno _fileno
+# define UP_open _open
+# define UP_read _read
+# define UP_write _write
+# define UP_lseek _lseek
+# define UP_close _close
+# else
+# define UP_fileno fileno
+# define UP_open open
+# define UP_read read
+# define UP_write write
+# define UP_lseek lseek
+# define UP_close close
+# endif
+
+#endif
+
diff --git a/openssl-1.1.0h/crypto/bio/bio_lib.c b/openssl-1.1.0h/crypto/bio/bio_lib.c
new file mode 100644
index 0000000..7b98dc9
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bio_lib.c
@@ -0,0 +1,600 @@
+/*
+ * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <openssl/crypto.h>
+#include "bio_lcl.h"
+#include "internal/cryptlib.h"
+
+BIO *BIO_new(const BIO_METHOD *method)
+{
+ BIO *bio = OPENSSL_zalloc(sizeof(*bio));
+
+ if (bio == NULL) {
+ BIOerr(BIO_F_BIO_NEW, ERR_R_MALLOC_FAILURE);
+ return (NULL);
+ }
+
+ bio->method = method;
+ bio->shutdown = 1;
+ bio->references = 1;
+
+ if (!CRYPTO_new_ex_data(CRYPTO_EX_INDEX_BIO, bio, &bio->ex_data))
+ goto err;
+
+ bio->lock = CRYPTO_THREAD_lock_new();
+ if (bio->lock == NULL) {
+ BIOerr(BIO_F_BIO_NEW, ERR_R_MALLOC_FAILURE);
+ CRYPTO_free_ex_data(CRYPTO_EX_INDEX_BIO, bio, &bio->ex_data);
+ goto err;
+ }
+
+ if (method->create != NULL && !method->create(bio)) {
+ BIOerr(BIO_F_BIO_NEW, ERR_R_INIT_FAIL);
+ CRYPTO_free_ex_data(CRYPTO_EX_INDEX_BIO, bio, &bio->ex_data);
+ CRYPTO_THREAD_lock_free(bio->lock);
+ goto err;
+ }
+ if (method->create == NULL)
+ bio->init = 1;
+
+ return bio;
+
+err:
+ OPENSSL_free(bio);
+ return NULL;
+}
+
+int BIO_free(BIO *a)
+{
+ int i;
+
+ if (a == NULL)
+ return 0;
+
+ if (CRYPTO_atomic_add(&a->references, -1, &i, a->lock) <= 0)
+ return 0;
+
+ REF_PRINT_COUNT("BIO", a);
+ if (i > 0)
+ return 1;
+ REF_ASSERT_ISNT(i < 0);
+ if ((a->callback != NULL) &&
+ ((i = (int)a->callback(a, BIO_CB_FREE, NULL, 0, 0L, 1L)) <= 0))
+ return i;
+
+ if ((a->method != NULL) && (a->method->destroy != NULL))
+ a->method->destroy(a);
+
+ CRYPTO_free_ex_data(CRYPTO_EX_INDEX_BIO, a, &a->ex_data);
+
+ CRYPTO_THREAD_lock_free(a->lock);
+
+ OPENSSL_free(a);
+
+ return 1;
+}
+
+void BIO_set_data(BIO *a, void *ptr)
+{
+ a->ptr = ptr;
+}
+
+void *BIO_get_data(BIO *a)
+{
+ return a->ptr;
+}
+
+void BIO_set_init(BIO *a, int init)
+{
+ a->init = init;
+}
+
+int BIO_get_init(BIO *a)
+{
+ return a->init;
+}
+
+void BIO_set_shutdown(BIO *a, int shut)
+{
+ a->shutdown = shut;
+}
+
+int BIO_get_shutdown(BIO *a)
+{
+ return a->shutdown;
+}
+
+void BIO_vfree(BIO *a)
+{
+ BIO_free(a);
+}
+
+int BIO_up_ref(BIO *a)
+{
+ int i;
+
+ if (CRYPTO_atomic_add(&a->references, 1, &i, a->lock) <= 0)
+ return 0;
+
+ REF_PRINT_COUNT("BIO", a);
+ REF_ASSERT_ISNT(i < 2);
+ return ((i > 1) ? 1 : 0);
+}
+
+void BIO_clear_flags(BIO *b, int flags)
+{
+ b->flags &= ~flags;
+}
+
+int BIO_test_flags(const BIO *b, int flags)
+{
+ return (b->flags & flags);
+}
+
+void BIO_set_flags(BIO *b, int flags)
+{
+ b->flags |= flags;
+}
+
+long (*BIO_get_callback(const BIO *b)) (struct bio_st *, int, const char *,
+ int, long, long) {
+ return b->callback;
+}
+
+void BIO_set_callback(BIO *b,
+ long (*cb) (struct bio_st *, int, const char *, int,
+ long, long))
+{
+ b->callback = cb;
+}
+
+void BIO_set_callback_arg(BIO *b, char *arg)
+{
+ b->cb_arg = arg;
+}
+
+char *BIO_get_callback_arg(const BIO *b)
+{
+ return b->cb_arg;
+}
+
+const char *BIO_method_name(const BIO *b)
+{
+ return b->method->name;
+}
+
+int BIO_method_type(const BIO *b)
+{
+ return b->method->type;
+}
+
+int BIO_read(BIO *b, void *out, int outl)
+{
+ int i;
+ long (*cb) (BIO *, int, const char *, int, long, long);
+
+ if ((b == NULL) || (b->method == NULL) || (b->method->bread == NULL)) {
+ BIOerr(BIO_F_BIO_READ, BIO_R_UNSUPPORTED_METHOD);
+ return (-2);
+ }
+
+ cb = b->callback;
+ if ((cb != NULL) &&
+ ((i = (int)cb(b, BIO_CB_READ, out, outl, 0L, 1L)) <= 0))
+ return (i);
+
+ if (!b->init) {
+ BIOerr(BIO_F_BIO_READ, BIO_R_UNINITIALIZED);
+ return (-2);
+ }
+
+ i = b->method->bread(b, out, outl);
+
+ if (i > 0)
+ b->num_read += (uint64_t)i;
+
+ if (cb != NULL)
+ i = (int)cb(b, BIO_CB_READ | BIO_CB_RETURN, out, outl, 0L, (long)i);
+ return (i);
+}
+
+int BIO_write(BIO *b, const void *in, int inl)
+{
+ int i;
+ long (*cb) (BIO *, int, const char *, int, long, long);
+
+ if (b == NULL)
+ return (0);
+
+ cb = b->callback;
+ if ((b->method == NULL) || (b->method->bwrite == NULL)) {
+ BIOerr(BIO_F_BIO_WRITE, BIO_R_UNSUPPORTED_METHOD);
+ return (-2);
+ }
+
+ if ((cb != NULL) &&
+ ((i = (int)cb(b, BIO_CB_WRITE, in, inl, 0L, 1L)) <= 0))
+ return (i);
+
+ if (!b->init) {
+ BIOerr(BIO_F_BIO_WRITE, BIO_R_UNINITIALIZED);
+ return (-2);
+ }
+
+ i = b->method->bwrite(b, in, inl);
+
+ if (i > 0)
+ b->num_write += (uint64_t)i;
+
+ if (cb != NULL)
+ i = (int)cb(b, BIO_CB_WRITE | BIO_CB_RETURN, in, inl, 0L, (long)i);
+ return (i);
+}
+
+int BIO_puts(BIO *b, const char *in)
+{
+ int i;
+ long (*cb) (BIO *, int, const char *, int, long, long);
+
+ if ((b == NULL) || (b->method == NULL) || (b->method->bputs == NULL)) {
+ BIOerr(BIO_F_BIO_PUTS, BIO_R_UNSUPPORTED_METHOD);
+ return (-2);
+ }
+
+ cb = b->callback;
+
+ if ((cb != NULL) && ((i = (int)cb(b, BIO_CB_PUTS, in, 0, 0L, 1L)) <= 0))
+ return (i);
+
+ if (!b->init) {
+ BIOerr(BIO_F_BIO_PUTS, BIO_R_UNINITIALIZED);
+ return (-2);
+ }
+
+ i = b->method->bputs(b, in);
+
+ if (i > 0)
+ b->num_write += (uint64_t)i;
+
+ if (cb != NULL)
+ i = (int)cb(b, BIO_CB_PUTS | BIO_CB_RETURN, in, 0, 0L, (long)i);
+ return (i);
+}
+
+int BIO_gets(BIO *b, char *in, int inl)
+{
+ int i;
+ long (*cb) (BIO *, int, const char *, int, long, long);
+
+ if ((b == NULL) || (b->method == NULL) || (b->method->bgets == NULL)) {
+ BIOerr(BIO_F_BIO_GETS, BIO_R_UNSUPPORTED_METHOD);
+ return (-2);
+ }
+
+ cb = b->callback;
+
+ if ((cb != NULL) && ((i = (int)cb(b, BIO_CB_GETS, in, inl, 0L, 1L)) <= 0))
+ return (i);
+
+ if (!b->init) {
+ BIOerr(BIO_F_BIO_GETS, BIO_R_UNINITIALIZED);
+ return (-2);
+ }
+
+ i = b->method->bgets(b, in, inl);
+
+ if (cb != NULL)
+ i = (int)cb(b, BIO_CB_GETS | BIO_CB_RETURN, in, inl, 0L, (long)i);
+ return (i);
+}
+
+int BIO_indent(BIO *b, int indent, int max)
+{
+ if (indent < 0)
+ indent = 0;
+ if (indent > max)
+ indent = max;
+ while (indent--)
+ if (BIO_puts(b, " ") != 1)
+ return 0;
+ return 1;
+}
+
+long BIO_int_ctrl(BIO *b, int cmd, long larg, int iarg)
+{
+ int i;
+
+ i = iarg;
+ return (BIO_ctrl(b, cmd, larg, (char *)&i));
+}
+
+void *BIO_ptr_ctrl(BIO *b, int cmd, long larg)
+{
+ void *p = NULL;
+
+ if (BIO_ctrl(b, cmd, larg, (char *)&p) <= 0)
+ return (NULL);
+ else
+ return (p);
+}
+
+long BIO_ctrl(BIO *b, int cmd, long larg, void *parg)
+{
+ long ret;
+ long (*cb) (BIO *, int, const char *, int, long, long);
+
+ if (b == NULL)
+ return (0);
+
+ if ((b->method == NULL) || (b->method->ctrl == NULL)) {
+ BIOerr(BIO_F_BIO_CTRL, BIO_R_UNSUPPORTED_METHOD);
+ return (-2);
+ }
+
+ cb = b->callback;
+
+ if ((cb != NULL) &&
+ ((ret = cb(b, BIO_CB_CTRL, parg, cmd, larg, 1L)) <= 0))
+ return (ret);
+
+ ret = b->method->ctrl(b, cmd, larg, parg);
+
+ if (cb != NULL)
+ ret = cb(b, BIO_CB_CTRL | BIO_CB_RETURN, parg, cmd, larg, ret);
+ return (ret);
+}
+
+long BIO_callback_ctrl(BIO *b, int cmd, BIO_info_cb *fp)
+{
+ long ret;
+ long (*cb) (BIO *, int, const char *, int, long, long);
+
+ if (b == NULL)
+ return (0);
+
+ if ((b->method == NULL) || (b->method->callback_ctrl == NULL)) {
+ BIOerr(BIO_F_BIO_CALLBACK_CTRL, BIO_R_UNSUPPORTED_METHOD);
+ return (-2);
+ }
+
+ cb = b->callback;
+
+ if ((cb != NULL) &&
+ ((ret = cb(b, BIO_CB_CTRL, (void *)&fp, cmd, 0, 1L)) <= 0))
+ return (ret);
+
+ ret = b->method->callback_ctrl(b, cmd, fp);
+
+ if (cb != NULL)
+ ret = cb(b, BIO_CB_CTRL | BIO_CB_RETURN, (void *)&fp, cmd, 0, ret);
+ return (ret);
+}
+
+/*
+ * It is unfortunate to duplicate in functions what the BIO_(w)pending macros
+ * do; but those macros have inappropriate return type, and for interfacing
+ * from other programming languages, C macros aren't much of a help anyway.
+ */
+size_t BIO_ctrl_pending(BIO *bio)
+{
+ return BIO_ctrl(bio, BIO_CTRL_PENDING, 0, NULL);
+}
+
+size_t BIO_ctrl_wpending(BIO *bio)
+{
+ return BIO_ctrl(bio, BIO_CTRL_WPENDING, 0, NULL);
+}
+
+/* put the 'bio' on the end of b's list of operators */
+BIO *BIO_push(BIO *b, BIO *bio)
+{
+ BIO *lb;
+
+ if (b == NULL)
+ return (bio);
+ lb = b;
+ while (lb->next_bio != NULL)
+ lb = lb->next_bio;
+ lb->next_bio = bio;
+ if (bio != NULL)
+ bio->prev_bio = lb;
+ /* called to do internal processing */
+ BIO_ctrl(b, BIO_CTRL_PUSH, 0, lb);
+ return (b);
+}
+
+/* Remove the first and return the rest */
+BIO *BIO_pop(BIO *b)
+{
+ BIO *ret;
+
+ if (b == NULL)
+ return (NULL);
+ ret = b->next_bio;
+
+ BIO_ctrl(b, BIO_CTRL_POP, 0, b);
+
+ if (b->prev_bio != NULL)
+ b->prev_bio->next_bio = b->next_bio;
+ if (b->next_bio != NULL)
+ b->next_bio->prev_bio = b->prev_bio;
+
+ b->next_bio = NULL;
+ b->prev_bio = NULL;
+ return (ret);
+}
+
+BIO *BIO_get_retry_BIO(BIO *bio, int *reason)
+{
+ BIO *b, *last;
+
+ b = last = bio;
+ for (;;) {
+ if (!BIO_should_retry(b))
+ break;
+ last = b;
+ b = b->next_bio;
+ if (b == NULL)
+ break;
+ }
+ if (reason != NULL)
+ *reason = last->retry_reason;
+ return (last);
+}
+
+int BIO_get_retry_reason(BIO *bio)
+{
+ return (bio->retry_reason);
+}
+
+void BIO_set_retry_reason(BIO *bio, int reason)
+{
+ bio->retry_reason = reason;
+}
+
+BIO *BIO_find_type(BIO *bio, int type)
+{
+ int mt, mask;
+
+ if (bio == NULL)
+ return NULL;
+ mask = type & 0xff;
+ do {
+ if (bio->method != NULL) {
+ mt = bio->method->type;
+
+ if (!mask) {
+ if (mt & type)
+ return (bio);
+ } else if (mt == type)
+ return (bio);
+ }
+ bio = bio->next_bio;
+ } while (bio != NULL);
+ return (NULL);
+}
+
+BIO *BIO_next(BIO *b)
+{
+ if (b == NULL)
+ return NULL;
+ return b->next_bio;
+}
+
+void BIO_set_next(BIO *b, BIO *next)
+{
+ b->next_bio = next;
+}
+
+void BIO_free_all(BIO *bio)
+{
+ BIO *b;
+ int ref;
+
+ while (bio != NULL) {
+ b = bio;
+ ref = b->references;
+ bio = bio->next_bio;
+ BIO_free(b);
+ /* Since ref count > 1, don't free anyone else. */
+ if (ref > 1)
+ break;
+ }
+}
+
+BIO *BIO_dup_chain(BIO *in)
+{
+ BIO *ret = NULL, *eoc = NULL, *bio, *new_bio;
+
+ for (bio = in; bio != NULL; bio = bio->next_bio) {
+ if ((new_bio = BIO_new(bio->method)) == NULL)
+ goto err;
+ new_bio->callback = bio->callback;
+ new_bio->cb_arg = bio->cb_arg;
+ new_bio->init = bio->init;
+ new_bio->shutdown = bio->shutdown;
+ new_bio->flags = bio->flags;
+
+ /* This will let SSL_s_sock() work with stdin/stdout */
+ new_bio->num = bio->num;
+
+ if (!BIO_dup_state(bio, (char *)new_bio)) {
+ BIO_free(new_bio);
+ goto err;
+ }
+
+ /* copy app data */
+ if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new_bio->ex_data,
+ &bio->ex_data)) {
+ BIO_free(new_bio);
+ goto err;
+ }
+
+ if (ret == NULL) {
+ eoc = new_bio;
+ ret = eoc;
+ } else {
+ BIO_push(eoc, new_bio);
+ eoc = new_bio;
+ }
+ }
+ return (ret);
+ err:
+ BIO_free_all(ret);
+
+ return (NULL);
+}
+
+void BIO_copy_next_retry(BIO *b)
+{
+ BIO_set_flags(b, BIO_get_retry_flags(b->next_bio));
+ b->retry_reason = b->next_bio->retry_reason;
+}
+
+int BIO_set_ex_data(BIO *bio, int idx, void *data)
+{
+ return (CRYPTO_set_ex_data(&(bio->ex_data), idx, data));
+}
+
+void *BIO_get_ex_data(BIO *bio, int idx)
+{
+ return (CRYPTO_get_ex_data(&(bio->ex_data), idx));
+}
+
+uint64_t BIO_number_read(BIO *bio)
+{
+ if (bio)
+ return bio->num_read;
+ return 0;
+}
+
+uint64_t BIO_number_written(BIO *bio)
+{
+ if (bio)
+ return bio->num_write;
+ return 0;
+}
+
+void bio_free_ex_data(BIO *bio)
+{
+ CRYPTO_free_ex_data(CRYPTO_EX_INDEX_BIO, bio, &bio->ex_data);
+}
+
+void bio_cleanup(void)
+{
+#ifndef OPENSSL_NO_SOCK
+ bio_sock_cleanup_int();
+ CRYPTO_THREAD_lock_free(bio_lookup_lock);
+ bio_lookup_lock = NULL;
+#endif
+ CRYPTO_THREAD_lock_free(bio_type_lock);
+ bio_type_lock = NULL;
+}
diff --git a/openssl-1.1.0h/crypto/bio/bio_meth.c b/openssl-1.1.0h/crypto/bio/bio_meth.c
new file mode 100644
index 0000000..1e785d3
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bio_meth.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include "bio_lcl.h"
+#include <internal/thread_once.h>
+
+CRYPTO_RWLOCK *bio_type_lock = NULL;
+static CRYPTO_ONCE bio_type_init = CRYPTO_ONCE_STATIC_INIT;
+
+DEFINE_RUN_ONCE_STATIC(do_bio_type_init)
+{
+ bio_type_lock = CRYPTO_THREAD_lock_new();
+ return bio_type_lock != NULL;
+}
+
+int BIO_get_new_index()
+{
+ static int bio_count = BIO_TYPE_START;
+ int newval;
+
+ if (!RUN_ONCE(&bio_type_init, do_bio_type_init)) {
+ BIOerr(BIO_F_BIO_GET_NEW_INDEX, ERR_R_MALLOC_FAILURE);
+ return -1;
+ }
+ if (!CRYPTO_atomic_add(&bio_count, 1, &newval, bio_type_lock))
+ return -1;
+ return newval;
+}
+
+BIO_METHOD *BIO_meth_new(int type, const char *name)
+{
+ BIO_METHOD *biom = OPENSSL_zalloc(sizeof(BIO_METHOD));
+
+ if (biom == NULL
+ || (biom->name = OPENSSL_strdup(name)) == NULL) {
+ OPENSSL_free(biom);
+ BIOerr(BIO_F_BIO_METH_NEW, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+ return biom;
+}
+
+void BIO_meth_free(BIO_METHOD *biom)
+{
+ if (biom != NULL) {
+ OPENSSL_free(biom->name);
+ OPENSSL_free(biom);
+ }
+}
+
+int (*BIO_meth_get_write(BIO_METHOD *biom)) (BIO *, const char *, int)
+{
+ return biom->bwrite;
+}
+
+int BIO_meth_set_write(BIO_METHOD *biom,
+ int (*bwrite) (BIO *, const char *, int))
+{
+ biom->bwrite = bwrite;
+ return 1;
+}
+
+int (*BIO_meth_get_read(BIO_METHOD *biom)) (BIO *, char *, int)
+{
+ return biom->bread;
+}
+
+int BIO_meth_set_read(BIO_METHOD *biom,
+ int (*bread) (BIO *, char *, int))
+{
+ biom->bread = bread;
+ return 1;
+}
+
+int (*BIO_meth_get_puts(BIO_METHOD *biom)) (BIO *, const char *)
+{
+ return biom->bputs;
+}
+
+int BIO_meth_set_puts(BIO_METHOD *biom,
+ int (*bputs) (BIO *, const char *))
+{
+ biom->bputs = bputs;
+ return 1;
+}
+
+int (*BIO_meth_get_gets(BIO_METHOD *biom)) (BIO *, char *, int)
+{
+ return biom->bgets;
+}
+
+int BIO_meth_set_gets(BIO_METHOD *biom,
+ int (*bgets) (BIO *, char *, int))
+{
+ biom->bgets = bgets;
+ return 1;
+}
+
+long (*BIO_meth_get_ctrl(BIO_METHOD *biom)) (BIO *, int, long, void *)
+{
+ return biom->ctrl;
+}
+
+int BIO_meth_set_ctrl(BIO_METHOD *biom,
+ long (*ctrl) (BIO *, int, long, void *))
+{
+ biom->ctrl = ctrl;
+ return 1;
+}
+
+int (*BIO_meth_get_create(BIO_METHOD *biom)) (BIO *)
+{
+ return biom->create;
+}
+
+int BIO_meth_set_create(BIO_METHOD *biom, int (*create) (BIO *))
+{
+ biom->create = create;
+ return 1;
+}
+
+int (*BIO_meth_get_destroy(BIO_METHOD *biom)) (BIO *)
+{
+ return biom->destroy;
+}
+
+int BIO_meth_set_destroy(BIO_METHOD *biom, int (*destroy) (BIO *))
+{
+ biom->destroy = destroy;
+ return 1;
+}
+
+long (*BIO_meth_get_callback_ctrl(BIO_METHOD *biom)) (BIO *, int, BIO_info_cb *)
+{
+ return biom->callback_ctrl;
+}
+
+int BIO_meth_set_callback_ctrl(BIO_METHOD *biom,
+ long (*callback_ctrl) (BIO *, int,
+ BIO_info_cb *))
+{
+ biom->callback_ctrl = callback_ctrl;
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/bio/bss_acpt.c b/openssl-1.1.0h/crypto/bio/bss_acpt.c
new file mode 100644
index 0000000..21d21c1
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bss_acpt.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "bio_lcl.h"
+
+#ifndef OPENSSL_NO_SOCK
+
+typedef struct bio_accept_st {
+ int state;
+ int accept_family;
+ int bind_mode; /* Socket mode for BIO_listen */
+ int accepted_mode; /* Socket mode for BIO_accept (set on accepted sock) */
+ char *param_addr;
+ char *param_serv;
+
+ int accept_sock;
+
+ BIO_ADDRINFO *addr_first;
+ const BIO_ADDRINFO *addr_iter;
+ BIO_ADDR cache_accepting_addr; /* Useful if we asked for port 0 */
+ char *cache_accepting_name, *cache_accepting_serv;
+ BIO_ADDR cache_peer_addr;
+ char *cache_peer_name, *cache_peer_serv;
+
+ BIO *bio_chain;
+} BIO_ACCEPT;
+
+static int acpt_write(BIO *h, const char *buf, int num);
+static int acpt_read(BIO *h, char *buf, int size);
+static int acpt_puts(BIO *h, const char *str);
+static long acpt_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int acpt_new(BIO *h);
+static int acpt_free(BIO *data);
+static int acpt_state(BIO *b, BIO_ACCEPT *c);
+static void acpt_close_socket(BIO *data);
+static BIO_ACCEPT *BIO_ACCEPT_new(void);
+static void BIO_ACCEPT_free(BIO_ACCEPT *a);
+
+# define ACPT_S_BEFORE 1
+# define ACPT_S_GET_ADDR 2
+# define ACPT_S_CREATE_SOCKET 3
+# define ACPT_S_LISTEN 4
+# define ACPT_S_ACCEPT 5
+# define ACPT_S_OK 6
+
+static const BIO_METHOD methods_acceptp = {
+ BIO_TYPE_ACCEPT,
+ "socket accept",
+ acpt_write,
+ acpt_read,
+ acpt_puts,
+ NULL, /* connect_gets, */
+ acpt_ctrl,
+ acpt_new,
+ acpt_free,
+ NULL, /* connect_callback_ctrl */
+};
+
+const BIO_METHOD *BIO_s_accept(void)
+{
+ return (&methods_acceptp);
+}
+
+static int acpt_new(BIO *bi)
+{
+ BIO_ACCEPT *ba;
+
+ bi->init = 0;
+ bi->num = (int)INVALID_SOCKET;
+ bi->flags = 0;
+ if ((ba = BIO_ACCEPT_new()) == NULL)
+ return (0);
+ bi->ptr = (char *)ba;
+ ba->state = ACPT_S_BEFORE;
+ bi->shutdown = 1;
+ return (1);
+}
+
+static BIO_ACCEPT *BIO_ACCEPT_new(void)
+{
+ BIO_ACCEPT *ret;
+
+ if ((ret = OPENSSL_zalloc(sizeof(*ret))) == NULL)
+ return (NULL);
+ ret->accept_family = BIO_FAMILY_IPANY;
+ ret->accept_sock = (int)INVALID_SOCKET;
+ return (ret);
+}
+
+static void BIO_ACCEPT_free(BIO_ACCEPT *a)
+{
+ if (a == NULL)
+ return;
+
+ OPENSSL_free(a->param_addr);
+ OPENSSL_free(a->param_serv);
+ BIO_ADDRINFO_free(a->addr_first);
+ OPENSSL_free(a->cache_accepting_name);
+ OPENSSL_free(a->cache_accepting_serv);
+ OPENSSL_free(a->cache_peer_name);
+ OPENSSL_free(a->cache_peer_serv);
+ BIO_free(a->bio_chain);
+ OPENSSL_free(a);
+}
+
+static void acpt_close_socket(BIO *bio)
+{
+ BIO_ACCEPT *c;
+
+ c = (BIO_ACCEPT *)bio->ptr;
+ if (c->accept_sock != (int)INVALID_SOCKET) {
+ shutdown(c->accept_sock, 2);
+ closesocket(c->accept_sock);
+ c->accept_sock = (int)INVALID_SOCKET;
+ bio->num = (int)INVALID_SOCKET;
+ }
+}
+
+static int acpt_free(BIO *a)
+{
+ BIO_ACCEPT *data;
+
+ if (a == NULL)
+ return (0);
+ data = (BIO_ACCEPT *)a->ptr;
+
+ if (a->shutdown) {
+ acpt_close_socket(a);
+ BIO_ACCEPT_free(data);
+ a->ptr = NULL;
+ a->flags = 0;
+ a->init = 0;
+ }
+ return (1);
+}
+
+static int acpt_state(BIO *b, BIO_ACCEPT *c)
+{
+ BIO *bio = NULL, *dbio;
+ int s = -1, ret = -1;
+
+ for (;;) {
+ switch (c->state) {
+ case ACPT_S_BEFORE:
+ if (c->param_addr == NULL && c->param_serv == NULL) {
+ BIOerr(BIO_F_ACPT_STATE, BIO_R_NO_ACCEPT_ADDR_OR_SERVICE_SPECIFIED);
+ ERR_add_error_data(4,
+ "hostname=", c->param_addr,
+ " service=", c->param_serv);
+ goto exit_loop;
+ }
+
+ /* Because we're starting a new bind, any cached name and serv
+ * are now obsolete and need to be cleaned out.
+ * QUESTION: should this be done in acpt_close_socket() instead?
+ */
+ OPENSSL_free(c->cache_accepting_name);
+ c->cache_accepting_name = NULL;
+ OPENSSL_free(c->cache_accepting_serv);
+ c->cache_accepting_serv = NULL;
+ OPENSSL_free(c->cache_peer_name);
+ c->cache_peer_name = NULL;
+ OPENSSL_free(c->cache_peer_serv);
+ c->cache_peer_serv = NULL;
+
+ c->state = ACPT_S_GET_ADDR;
+ break;
+
+ case ACPT_S_GET_ADDR:
+ {
+ int family = AF_UNSPEC;
+ switch (c->accept_family) {
+ case BIO_FAMILY_IPV6:
+ if (1) { /* This is a trick we use to avoid bit rot.
+ * at least the "else" part will always be
+ * compiled.
+ */
+#ifdef AF_INET6
+ family = AF_INET6;
+ } else {
+#endif
+ BIOerr(BIO_F_ACPT_STATE, BIO_R_UNAVAILABLE_IP_FAMILY);
+ goto exit_loop;
+ }
+ break;
+ case BIO_FAMILY_IPV4:
+ family = AF_INET;
+ break;
+ case BIO_FAMILY_IPANY:
+ family = AF_UNSPEC;
+ break;
+ default:
+ BIOerr(BIO_F_ACPT_STATE, BIO_R_UNSUPPORTED_IP_FAMILY);
+ goto exit_loop;
+ }
+ if (BIO_lookup(c->param_addr, c->param_serv, BIO_LOOKUP_SERVER,
+ family, SOCK_STREAM, &c->addr_first) == 0)
+ goto exit_loop;
+ }
+ if (c->addr_first == NULL) {
+ BIOerr(BIO_F_ACPT_STATE, BIO_R_LOOKUP_RETURNED_NOTHING);
+ goto exit_loop;
+ }
+ /* We're currently not iterating, but set this as preparation
+ * for possible future development in that regard
+ */
+ c->addr_iter = c->addr_first;
+ c->state = ACPT_S_CREATE_SOCKET;
+ break;
+
+ case ACPT_S_CREATE_SOCKET:
+ ret = BIO_socket(BIO_ADDRINFO_family(c->addr_iter),
+ BIO_ADDRINFO_socktype(c->addr_iter),
+ BIO_ADDRINFO_protocol(c->addr_iter), 0);
+ if (ret == (int)INVALID_SOCKET) {
+ SYSerr(SYS_F_SOCKET, get_last_socket_error());
+ ERR_add_error_data(4,
+ "hostname=", c->param_addr,
+ " service=", c->param_serv);
+ BIOerr(BIO_F_ACPT_STATE, BIO_R_UNABLE_TO_CREATE_SOCKET);
+ goto exit_loop;
+ }
+ c->accept_sock = ret;
+ b->num = ret;
+ c->state = ACPT_S_LISTEN;
+ break;
+
+ case ACPT_S_LISTEN:
+ {
+ if (!BIO_listen(c->accept_sock,
+ BIO_ADDRINFO_address(c->addr_iter),
+ c->bind_mode)) {
+ BIO_closesocket(c->accept_sock);
+ goto exit_loop;
+ }
+ }
+
+ {
+ union BIO_sock_info_u info;
+
+ info.addr = &c->cache_accepting_addr;
+ if (!BIO_sock_info(c->accept_sock, BIO_SOCK_INFO_ADDRESS,
+ &info)) {
+ BIO_closesocket(c->accept_sock);
+ goto exit_loop;
+ }
+ }
+
+ c->cache_accepting_name =
+ BIO_ADDR_hostname_string(&c->cache_accepting_addr, 1);
+ c->cache_accepting_serv =
+ BIO_ADDR_service_string(&c->cache_accepting_addr, 1);
+ c->state = ACPT_S_ACCEPT;
+ s = -1;
+ ret = 1;
+ goto end;
+
+ case ACPT_S_ACCEPT:
+ if (b->next_bio != NULL) {
+ c->state = ACPT_S_OK;
+ break;
+ }
+ BIO_clear_retry_flags(b);
+ b->retry_reason = 0;
+
+ OPENSSL_free(c->cache_peer_name);
+ c->cache_peer_name = NULL;
+ OPENSSL_free(c->cache_peer_serv);
+ c->cache_peer_serv = NULL;
+
+ s = BIO_accept_ex(c->accept_sock, &c->cache_peer_addr,
+ c->accepted_mode);
+
+ /* If the returned socket is invalid, this might still be
+ * retryable
+ */
+ if (s < 0) {
+ if (BIO_sock_should_retry(s)) {
+ BIO_set_retry_special(b);
+ b->retry_reason = BIO_RR_ACCEPT;
+ goto end;
+ }
+ }
+
+ /* If it wasn't retryable, we fail */
+ if (s < 0) {
+ ret = s;
+ goto exit_loop;
+ }
+
+ bio = BIO_new_socket(s, BIO_CLOSE);
+ if (bio == NULL)
+ goto exit_loop;
+
+ BIO_set_callback(bio, BIO_get_callback(b));
+ BIO_set_callback_arg(bio, BIO_get_callback_arg(b));
+
+ /*
+ * If the accept BIO has an bio_chain, we dup it and put the new
+ * socket at the end.
+ */
+ if (c->bio_chain != NULL) {
+ if ((dbio = BIO_dup_chain(c->bio_chain)) == NULL)
+ goto exit_loop;
+ if (!BIO_push(dbio, bio))
+ goto exit_loop;
+ bio = dbio;
+ }
+ if (BIO_push(b, bio) == NULL)
+ goto exit_loop;
+
+ c->cache_peer_name =
+ BIO_ADDR_hostname_string(&c->cache_peer_addr, 1);
+ c->cache_peer_serv =
+ BIO_ADDR_service_string(&c->cache_peer_addr, 1);
+ c->state = ACPT_S_OK;
+ bio = NULL;
+ ret = 1;
+ goto end;
+
+ case ACPT_S_OK:
+ if (b->next_bio == NULL) {
+ c->state = ACPT_S_ACCEPT;
+ break;
+ }
+ ret = 1;
+ goto end;
+
+ default:
+ ret = 0;
+ goto end;
+ }
+ }
+
+ exit_loop:
+ if (bio != NULL)
+ BIO_free(bio);
+ else if (s >= 0)
+ BIO_closesocket(s);
+ end:
+ return ret;
+}
+
+static int acpt_read(BIO *b, char *out, int outl)
+{
+ int ret = 0;
+ BIO_ACCEPT *data;
+
+ BIO_clear_retry_flags(b);
+ data = (BIO_ACCEPT *)b->ptr;
+
+ while (b->next_bio == NULL) {
+ ret = acpt_state(b, data);
+ if (ret <= 0)
+ return (ret);
+ }
+
+ ret = BIO_read(b->next_bio, out, outl);
+ BIO_copy_next_retry(b);
+ return (ret);
+}
+
+static int acpt_write(BIO *b, const char *in, int inl)
+{
+ int ret;
+ BIO_ACCEPT *data;
+
+ BIO_clear_retry_flags(b);
+ data = (BIO_ACCEPT *)b->ptr;
+
+ while (b->next_bio == NULL) {
+ ret = acpt_state(b, data);
+ if (ret <= 0)
+ return (ret);
+ }
+
+ ret = BIO_write(b->next_bio, in, inl);
+ BIO_copy_next_retry(b);
+ return (ret);
+}
+
+static long acpt_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ int *ip;
+ long ret = 1;
+ BIO_ACCEPT *data;
+ char **pp;
+
+ data = (BIO_ACCEPT *)b->ptr;
+
+ switch (cmd) {
+ case BIO_CTRL_RESET:
+ ret = 0;
+ data->state = ACPT_S_BEFORE;
+ acpt_close_socket(b);
+ BIO_ADDRINFO_free(data->addr_first);
+ data->addr_first = NULL;
+ b->flags = 0;
+ break;
+ case BIO_C_DO_STATE_MACHINE:
+ /* use this one to start the connection */
+ ret = (long)acpt_state(b, data);
+ break;
+ case BIO_C_SET_ACCEPT:
+ if (ptr != NULL) {
+ if (num == 0) {
+ char *hold_serv = data->param_serv;
+ /* We affect the hostname regardless. However, the input
+ * string might contain a host:service spec, so we must
+ * parse it, which might or might not affect the service
+ */
+ OPENSSL_free(data->param_addr);
+ data->param_addr = NULL;
+ ret = BIO_parse_hostserv(ptr,
+ &data->param_addr,
+ &data->param_serv,
+ BIO_PARSE_PRIO_SERV);
+ if (hold_serv != data->param_serv)
+ OPENSSL_free(hold_serv);
+ b->init = 1;
+ } else if (num == 1) {
+ OPENSSL_free(data->param_serv);
+ data->param_serv = BUF_strdup(ptr);
+ b->init = 1;
+ } else if (num == 2) {
+ data->bind_mode |= BIO_SOCK_NONBLOCK;
+ } else if (num == 3) {
+ BIO_free(data->bio_chain);
+ data->bio_chain = (BIO *)ptr;
+ } else if (num == 4) {
+ data->accept_family = *(int *)ptr;
+ }
+ } else {
+ if (num == 2) {
+ data->bind_mode &= ~BIO_SOCK_NONBLOCK;
+ }
+ }
+ break;
+ case BIO_C_SET_NBIO:
+ if (num != 0)
+ data->accepted_mode |= BIO_SOCK_NONBLOCK;
+ else
+ data->accepted_mode &= ~BIO_SOCK_NONBLOCK;
+ break;
+ case BIO_C_SET_FD:
+ b->init = 1;
+ b->num = *((int *)ptr);
+ data->accept_sock = b->num;
+ data->state = ACPT_S_ACCEPT;
+ b->shutdown = (int)num;
+ b->init = 1;
+ break;
+ case BIO_C_GET_FD:
+ if (b->init) {
+ ip = (int *)ptr;
+ if (ip != NULL)
+ *ip = data->accept_sock;
+ ret = data->accept_sock;
+ } else
+ ret = -1;
+ break;
+ case BIO_C_GET_ACCEPT:
+ if (b->init) {
+ if (num == 0 && ptr != NULL) {
+ pp = (char **)ptr;
+ *pp = data->cache_accepting_name;
+ } else if (num == 1 && ptr != NULL) {
+ pp = (char **)ptr;
+ *pp = data->cache_accepting_serv;
+ } else if (num == 2 && ptr != NULL) {
+ pp = (char **)ptr;
+ *pp = data->cache_peer_name;
+ } else if (num == 3 && ptr != NULL) {
+ pp = (char **)ptr;
+ *pp = data->cache_peer_serv;
+ } else if (num == 4) {
+ switch (BIO_ADDRINFO_family(data->addr_iter)) {
+#ifdef AF_INET6
+ case AF_INET6:
+ ret = BIO_FAMILY_IPV6;
+ break;
+#endif
+ case AF_INET:
+ ret = BIO_FAMILY_IPV4;
+ break;
+ case 0:
+ ret = data->accept_family;
+ break;
+ default:
+ ret = -1;
+ break;
+ }
+ } else
+ ret = -1;
+ } else
+ ret = -1;
+ break;
+ case BIO_CTRL_GET_CLOSE:
+ ret = b->shutdown;
+ break;
+ case BIO_CTRL_SET_CLOSE:
+ b->shutdown = (int)num;
+ break;
+ case BIO_CTRL_PENDING:
+ case BIO_CTRL_WPENDING:
+ ret = 0;
+ break;
+ case BIO_CTRL_FLUSH:
+ break;
+ case BIO_C_SET_BIND_MODE:
+ data->bind_mode = (int)num;
+ break;
+ case BIO_C_GET_BIND_MODE:
+ ret = (long)data->bind_mode;
+ break;
+ case BIO_CTRL_DUP:
+/*- dbio=(BIO *)ptr;
+ if (data->param_port) EAY EAY
+ BIO_set_port(dbio,data->param_port);
+ if (data->param_hostname)
+ BIO_set_hostname(dbio,data->param_hostname);
+ BIO_set_nbio(dbio,data->nbio); */
+ break;
+
+ default:
+ ret = 0;
+ break;
+ }
+ return (ret);
+}
+
+static int acpt_puts(BIO *bp, const char *str)
+{
+ int n, ret;
+
+ n = strlen(str);
+ ret = acpt_write(bp, str, n);
+ return (ret);
+}
+
+BIO *BIO_new_accept(const char *str)
+{
+ BIO *ret;
+
+ ret = BIO_new(BIO_s_accept());
+ if (ret == NULL)
+ return (NULL);
+ if (BIO_set_accept_name(ret, str))
+ return (ret);
+ BIO_free(ret);
+ return (NULL);
+}
+
+#endif
diff --git a/openssl-1.1.0h/crypto/bio/bss_bio.c b/openssl-1.1.0h/crypto/bio/bss_bio.c
new file mode 100644
index 0000000..de34f6b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bss_bio.c
@@ -0,0 +1,805 @@
+/*
+ * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Special method for a BIO where the other endpoint is also a BIO of this
+ * kind, handled by the same thread (i.e. the "peer" is actually ourselves,
+ * wearing a different hat). Such "BIO pairs" are mainly for using the SSL
+ * library with I/O interfaces for which no specific BIO method is available.
+ * See ssl/ssltest.c for some hints on how this can be used.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bio_lcl.h"
+#include <openssl/err.h>
+#include <openssl/crypto.h>
+
+#include "e_os.h"
+
+static int bio_new(BIO *bio);
+static int bio_free(BIO *bio);
+static int bio_read(BIO *bio, char *buf, int size);
+static int bio_write(BIO *bio, const char *buf, int num);
+static long bio_ctrl(BIO *bio, int cmd, long num, void *ptr);
+static int bio_puts(BIO *bio, const char *str);
+
+static int bio_make_pair(BIO *bio1, BIO *bio2);
+static void bio_destroy_pair(BIO *bio);
+
+static const BIO_METHOD methods_biop = {
+ BIO_TYPE_BIO,
+ "BIO pair",
+ bio_write,
+ bio_read,
+ bio_puts,
+ NULL /* no bio_gets */ ,
+ bio_ctrl,
+ bio_new,
+ bio_free,
+ NULL /* no bio_callback_ctrl */
+};
+
+const BIO_METHOD *BIO_s_bio(void)
+{
+ return &methods_biop;
+}
+
+struct bio_bio_st {
+ BIO *peer; /* NULL if buf == NULL. If peer != NULL, then
+ * peer->ptr is also a bio_bio_st, and its
+ * "peer" member points back to us. peer !=
+ * NULL iff init != 0 in the BIO. */
+ /* This is for what we write (i.e. reading uses peer's struct): */
+ int closed; /* valid iff peer != NULL */
+ size_t len; /* valid iff buf != NULL; 0 if peer == NULL */
+ size_t offset; /* valid iff buf != NULL; 0 if len == 0 */
+ size_t size;
+ char *buf; /* "size" elements (if != NULL) */
+ size_t request; /* valid iff peer != NULL; 0 if len != 0,
+ * otherwise set by peer to number of bytes
+ * it (unsuccessfully) tried to read, never
+ * more than buffer space (size-len)
+ * warrants. */
+};
+
+static int bio_new(BIO *bio)
+{
+ struct bio_bio_st *b = OPENSSL_zalloc(sizeof(*b));
+
+ if (b == NULL)
+ return 0;
+
+ /* enough for one TLS record (just a default) */
+ b->size = 17 * 1024;
+
+ bio->ptr = b;
+ return 1;
+}
+
+static int bio_free(BIO *bio)
+{
+ struct bio_bio_st *b;
+
+ if (bio == NULL)
+ return 0;
+ b = bio->ptr;
+
+ assert(b != NULL);
+
+ if (b->peer)
+ bio_destroy_pair(bio);
+
+ OPENSSL_free(b->buf);
+ OPENSSL_free(b);
+
+ return 1;
+}
+
+static int bio_read(BIO *bio, char *buf, int size_)
+{
+ size_t size = size_;
+ size_t rest;
+ struct bio_bio_st *b, *peer_b;
+
+ BIO_clear_retry_flags(bio);
+
+ if (!bio->init)
+ return 0;
+
+ b = bio->ptr;
+ assert(b != NULL);
+ assert(b->peer != NULL);
+ peer_b = b->peer->ptr;
+ assert(peer_b != NULL);
+ assert(peer_b->buf != NULL);
+
+ peer_b->request = 0; /* will be set in "retry_read" situation */
+
+ if (buf == NULL || size == 0)
+ return 0;
+
+ if (peer_b->len == 0) {
+ if (peer_b->closed)
+ return 0; /* writer has closed, and no data is left */
+ else {
+ BIO_set_retry_read(bio); /* buffer is empty */
+ if (size <= peer_b->size)
+ peer_b->request = size;
+ else
+ /*
+ * don't ask for more than the peer can deliver in one write
+ */
+ peer_b->request = peer_b->size;
+ return -1;
+ }
+ }
+
+ /* we can read */
+ if (peer_b->len < size)
+ size = peer_b->len;
+
+ /* now read "size" bytes */
+
+ rest = size;
+
+ assert(rest > 0);
+ do { /* one or two iterations */
+ size_t chunk;
+
+ assert(rest <= peer_b->len);
+ if (peer_b->offset + rest <= peer_b->size)
+ chunk = rest;
+ else
+ /* wrap around ring buffer */
+ chunk = peer_b->size - peer_b->offset;
+ assert(peer_b->offset + chunk <= peer_b->size);
+
+ memcpy(buf, peer_b->buf + peer_b->offset, chunk);
+
+ peer_b->len -= chunk;
+ if (peer_b->len) {
+ peer_b->offset += chunk;
+ assert(peer_b->offset <= peer_b->size);
+ if (peer_b->offset == peer_b->size)
+ peer_b->offset = 0;
+ buf += chunk;
+ } else {
+ /* buffer now empty, no need to advance "buf" */
+ assert(chunk == rest);
+ peer_b->offset = 0;
+ }
+ rest -= chunk;
+ }
+ while (rest);
+
+ return size;
+}
+
+/*-
+ * non-copying interface: provide pointer to available data in buffer
+ * bio_nread0: return number of available bytes
+ * bio_nread: also advance index
+ * (example usage: bio_nread0(), read from buffer, bio_nread()
+ * or just bio_nread(), read from buffer)
+ */
+/*
+ * WARNING: The non-copying interface is largely untested as of yet and may
+ * contain bugs.
+ */
+static ossl_ssize_t bio_nread0(BIO *bio, char **buf)
+{
+ struct bio_bio_st *b, *peer_b;
+ ossl_ssize_t num;
+
+ BIO_clear_retry_flags(bio);
+
+ if (!bio->init)
+ return 0;
+
+ b = bio->ptr;
+ assert(b != NULL);
+ assert(b->peer != NULL);
+ peer_b = b->peer->ptr;
+ assert(peer_b != NULL);
+ assert(peer_b->buf != NULL);
+
+ peer_b->request = 0;
+
+ if (peer_b->len == 0) {
+ char dummy;
+
+ /* avoid code duplication -- nothing available for reading */
+ return bio_read(bio, &dummy, 1); /* returns 0 or -1 */
+ }
+
+ num = peer_b->len;
+ if (peer_b->size < peer_b->offset + num)
+ /* no ring buffer wrap-around for non-copying interface */
+ num = peer_b->size - peer_b->offset;
+ assert(num > 0);
+
+ if (buf != NULL)
+ *buf = peer_b->buf + peer_b->offset;
+ return num;
+}
+
+static ossl_ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
+{
+ struct bio_bio_st *b, *peer_b;
+ ossl_ssize_t num, available;
+
+ if (num_ > OSSL_SSIZE_MAX)
+ num = OSSL_SSIZE_MAX;
+ else
+ num = (ossl_ssize_t) num_;
+
+ available = bio_nread0(bio, buf);
+ if (num > available)
+ num = available;
+ if (num <= 0)
+ return num;
+
+ b = bio->ptr;
+ peer_b = b->peer->ptr;
+
+ peer_b->len -= num;
+ if (peer_b->len) {
+ peer_b->offset += num;
+ assert(peer_b->offset <= peer_b->size);
+ if (peer_b->offset == peer_b->size)
+ peer_b->offset = 0;
+ } else
+ peer_b->offset = 0;
+
+ return num;
+}
+
+static int bio_write(BIO *bio, const char *buf, int num_)
+{
+ size_t num = num_;
+ size_t rest;
+ struct bio_bio_st *b;
+
+ BIO_clear_retry_flags(bio);
+
+ if (!bio->init || buf == NULL || num == 0)
+ return 0;
+
+ b = bio->ptr;
+ assert(b != NULL);
+ assert(b->peer != NULL);
+ assert(b->buf != NULL);
+
+ b->request = 0;
+ if (b->closed) {
+ /* we already closed */
+ BIOerr(BIO_F_BIO_WRITE, BIO_R_BROKEN_PIPE);
+ return -1;
+ }
+
+ assert(b->len <= b->size);
+
+ if (b->len == b->size) {
+ BIO_set_retry_write(bio); /* buffer is full */
+ return -1;
+ }
+
+ /* we can write */
+ if (num > b->size - b->len)
+ num = b->size - b->len;
+
+ /* now write "num" bytes */
+
+ rest = num;
+
+ assert(rest > 0);
+ do { /* one or two iterations */
+ size_t write_offset;
+ size_t chunk;
+
+ assert(b->len + rest <= b->size);
+
+ write_offset = b->offset + b->len;
+ if (write_offset >= b->size)
+ write_offset -= b->size;
+ /* b->buf[write_offset] is the first byte we can write to. */
+
+ if (write_offset + rest <= b->size)
+ chunk = rest;
+ else
+ /* wrap around ring buffer */
+ chunk = b->size - write_offset;
+
+ memcpy(b->buf + write_offset, buf, chunk);
+
+ b->len += chunk;
+
+ assert(b->len <= b->size);
+
+ rest -= chunk;
+ buf += chunk;
+ }
+ while (rest);
+
+ return num;
+}
+
+/*-
+ * non-copying interface: provide pointer to region to write to
+ * bio_nwrite0: check how much space is available
+ * bio_nwrite: also increase length
+ * (example usage: bio_nwrite0(), write to buffer, bio_nwrite()
+ * or just bio_nwrite(), write to buffer)
+ */
+static ossl_ssize_t bio_nwrite0(BIO *bio, char **buf)
+{
+ struct bio_bio_st *b;
+ size_t num;
+ size_t write_offset;
+
+ BIO_clear_retry_flags(bio);
+
+ if (!bio->init)
+ return 0;
+
+ b = bio->ptr;
+ assert(b != NULL);
+ assert(b->peer != NULL);
+ assert(b->buf != NULL);
+
+ b->request = 0;
+ if (b->closed) {
+ BIOerr(BIO_F_BIO_NWRITE0, BIO_R_BROKEN_PIPE);
+ return -1;
+ }
+
+ assert(b->len <= b->size);
+
+ if (b->len == b->size) {
+ BIO_set_retry_write(bio);
+ return -1;
+ }
+
+ num = b->size - b->len;
+ write_offset = b->offset + b->len;
+ if (write_offset >= b->size)
+ write_offset -= b->size;
+ if (write_offset + num > b->size)
+ /*
+ * no ring buffer wrap-around for non-copying interface (to fulfil
+ * the promise by BIO_ctrl_get_write_guarantee, BIO_nwrite may have
+ * to be called twice)
+ */
+ num = b->size - write_offset;
+
+ if (buf != NULL)
+ *buf = b->buf + write_offset;
+ assert(write_offset + num <= b->size);
+
+ return num;
+}
+
+static ossl_ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
+{
+ struct bio_bio_st *b;
+ ossl_ssize_t num, space;
+
+ if (num_ > OSSL_SSIZE_MAX)
+ num = OSSL_SSIZE_MAX;
+ else
+ num = (ossl_ssize_t) num_;
+
+ space = bio_nwrite0(bio, buf);
+ if (num > space)
+ num = space;
+ if (num <= 0)
+ return num;
+ b = bio->ptr;
+ assert(b != NULL);
+ b->len += num;
+ assert(b->len <= b->size);
+
+ return num;
+}
+
+static long bio_ctrl(BIO *bio, int cmd, long num, void *ptr)
+{
+ long ret;
+ struct bio_bio_st *b = bio->ptr;
+
+ assert(b != NULL);
+
+ switch (cmd) {
+ /* specific CTRL codes */
+
+ case BIO_C_SET_WRITE_BUF_SIZE:
+ if (b->peer) {
+ BIOerr(BIO_F_BIO_CTRL, BIO_R_IN_USE);
+ ret = 0;
+ } else if (num == 0) {
+ BIOerr(BIO_F_BIO_CTRL, BIO_R_INVALID_ARGUMENT);
+ ret = 0;
+ } else {
+ size_t new_size = num;
+
+ if (b->size != new_size) {
+ OPENSSL_free(b->buf);
+ b->buf = NULL;
+ b->size = new_size;
+ }
+ ret = 1;
+ }
+ break;
+
+ case BIO_C_GET_WRITE_BUF_SIZE:
+ ret = (long)b->size;
+ break;
+
+ case BIO_C_MAKE_BIO_PAIR:
+ {
+ BIO *other_bio = ptr;
+
+ if (bio_make_pair(bio, other_bio))
+ ret = 1;
+ else
+ ret = 0;
+ }
+ break;
+
+ case BIO_C_DESTROY_BIO_PAIR:
+ /*
+ * Affects both BIOs in the pair -- call just once! Or let
+ * BIO_free(bio1); BIO_free(bio2); do the job.
+ */
+ bio_destroy_pair(bio);
+ ret = 1;
+ break;
+
+ case BIO_C_GET_WRITE_GUARANTEE:
+ /*
+ * How many bytes can the caller feed to the next write without
+ * having to keep any?
+ */
+ if (b->peer == NULL || b->closed)
+ ret = 0;
+ else
+ ret = (long)b->size - b->len;
+ break;
+
+ case BIO_C_GET_READ_REQUEST:
+ /*
+ * If the peer unsuccessfully tried to read, how many bytes were
+ * requested? (As with BIO_CTRL_PENDING, that number can usually be
+ * treated as boolean.)
+ */
+ ret = (long)b->request;
+ break;
+
+ case BIO_C_RESET_READ_REQUEST:
+ /*
+ * Reset request. (Can be useful after read attempts at the other
+ * side that are meant to be non-blocking, e.g. when probing SSL_read
+ * to see if any data is available.)
+ */
+ b->request = 0;
+ ret = 1;
+ break;
+
+ case BIO_C_SHUTDOWN_WR:
+ /* similar to shutdown(..., SHUT_WR) */
+ b->closed = 1;
+ ret = 1;
+ break;
+
+ case BIO_C_NREAD0:
+ /* prepare for non-copying read */
+ ret = (long)bio_nread0(bio, ptr);
+ break;
+
+ case BIO_C_NREAD:
+ /* non-copying read */
+ ret = (long)bio_nread(bio, ptr, (size_t)num);
+ break;
+
+ case BIO_C_NWRITE0:
+ /* prepare for non-copying write */
+ ret = (long)bio_nwrite0(bio, ptr);
+ break;
+
+ case BIO_C_NWRITE:
+ /* non-copying write */
+ ret = (long)bio_nwrite(bio, ptr, (size_t)num);
+ break;
+
+ /* standard CTRL codes follow */
+
+ case BIO_CTRL_RESET:
+ if (b->buf != NULL) {
+ b->len = 0;
+ b->offset = 0;
+ }
+ ret = 0;
+ break;
+
+ case BIO_CTRL_GET_CLOSE:
+ ret = bio->shutdown;
+ break;
+
+ case BIO_CTRL_SET_CLOSE:
+ bio->shutdown = (int)num;
+ ret = 1;
+ break;
+
+ case BIO_CTRL_PENDING:
+ if (b->peer != NULL) {
+ struct bio_bio_st *peer_b = b->peer->ptr;
+
+ ret = (long)peer_b->len;
+ } else
+ ret = 0;
+ break;
+
+ case BIO_CTRL_WPENDING:
+ if (b->buf != NULL)
+ ret = (long)b->len;
+ else
+ ret = 0;
+ break;
+
+ case BIO_CTRL_DUP:
+ /* See BIO_dup_chain for circumstances we have to expect. */
+ {
+ BIO *other_bio = ptr;
+ struct bio_bio_st *other_b;
+
+ assert(other_bio != NULL);
+ other_b = other_bio->ptr;
+ assert(other_b != NULL);
+
+ assert(other_b->buf == NULL); /* other_bio is always fresh */
+
+ other_b->size = b->size;
+ }
+
+ ret = 1;
+ break;
+
+ case BIO_CTRL_FLUSH:
+ ret = 1;
+ break;
+
+ case BIO_CTRL_EOF:
+ if (b->peer != NULL) {
+ struct bio_bio_st *peer_b = b->peer->ptr;
+
+ if (peer_b->len == 0 && peer_b->closed)
+ ret = 1;
+ else
+ ret = 0;
+ } else {
+ ret = 1;
+ }
+ break;
+
+ default:
+ ret = 0;
+ }
+ return ret;
+}
+
+static int bio_puts(BIO *bio, const char *str)
+{
+ return bio_write(bio, str, strlen(str));
+}
+
+static int bio_make_pair(BIO *bio1, BIO *bio2)
+{
+ struct bio_bio_st *b1, *b2;
+
+ assert(bio1 != NULL);
+ assert(bio2 != NULL);
+
+ b1 = bio1->ptr;
+ b2 = bio2->ptr;
+
+ if (b1->peer != NULL || b2->peer != NULL) {
+ BIOerr(BIO_F_BIO_MAKE_PAIR, BIO_R_IN_USE);
+ return 0;
+ }
+
+ if (b1->buf == NULL) {
+ b1->buf = OPENSSL_malloc(b1->size);
+ if (b1->buf == NULL) {
+ BIOerr(BIO_F_BIO_MAKE_PAIR, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ b1->len = 0;
+ b1->offset = 0;
+ }
+
+ if (b2->buf == NULL) {
+ b2->buf = OPENSSL_malloc(b2->size);
+ if (b2->buf == NULL) {
+ BIOerr(BIO_F_BIO_MAKE_PAIR, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ b2->len = 0;
+ b2->offset = 0;
+ }
+
+ b1->peer = bio2;
+ b1->closed = 0;
+ b1->request = 0;
+ b2->peer = bio1;
+ b2->closed = 0;
+ b2->request = 0;
+
+ bio1->init = 1;
+ bio2->init = 1;
+
+ return 1;
+}
+
+static void bio_destroy_pair(BIO *bio)
+{
+ struct bio_bio_st *b = bio->ptr;
+
+ if (b != NULL) {
+ BIO *peer_bio = b->peer;
+
+ if (peer_bio != NULL) {
+ struct bio_bio_st *peer_b = peer_bio->ptr;
+
+ assert(peer_b != NULL);
+ assert(peer_b->peer == bio);
+
+ peer_b->peer = NULL;
+ peer_bio->init = 0;
+ assert(peer_b->buf != NULL);
+ peer_b->len = 0;
+ peer_b->offset = 0;
+
+ b->peer = NULL;
+ bio->init = 0;
+ assert(b->buf != NULL);
+ b->len = 0;
+ b->offset = 0;
+ }
+ }
+}
+
+/* Exported convenience functions */
+int BIO_new_bio_pair(BIO **bio1_p, size_t writebuf1,
+ BIO **bio2_p, size_t writebuf2)
+{
+ BIO *bio1 = NULL, *bio2 = NULL;
+ long r;
+ int ret = 0;
+
+ bio1 = BIO_new(BIO_s_bio());
+ if (bio1 == NULL)
+ goto err;
+ bio2 = BIO_new(BIO_s_bio());
+ if (bio2 == NULL)
+ goto err;
+
+ if (writebuf1) {
+ r = BIO_set_write_buf_size(bio1, writebuf1);
+ if (!r)
+ goto err;
+ }
+ if (writebuf2) {
+ r = BIO_set_write_buf_size(bio2, writebuf2);
+ if (!r)
+ goto err;
+ }
+
+ r = BIO_make_bio_pair(bio1, bio2);
+ if (!r)
+ goto err;
+ ret = 1;
+
+ err:
+ if (ret == 0) {
+ BIO_free(bio1);
+ bio1 = NULL;
+ BIO_free(bio2);
+ bio2 = NULL;
+ }
+
+ *bio1_p = bio1;
+ *bio2_p = bio2;
+ return ret;
+}
+
+size_t BIO_ctrl_get_write_guarantee(BIO *bio)
+{
+ return BIO_ctrl(bio, BIO_C_GET_WRITE_GUARANTEE, 0, NULL);
+}
+
+size_t BIO_ctrl_get_read_request(BIO *bio)
+{
+ return BIO_ctrl(bio, BIO_C_GET_READ_REQUEST, 0, NULL);
+}
+
+int BIO_ctrl_reset_read_request(BIO *bio)
+{
+ return (BIO_ctrl(bio, BIO_C_RESET_READ_REQUEST, 0, NULL) != 0);
+}
+
+/*
+ * BIO_nread0/nread/nwrite0/nwrite are available only for BIO pairs for now
+ * (conceivably some other BIOs could allow non-copying reads and writes
+ * too.)
+ */
+int BIO_nread0(BIO *bio, char **buf)
+{
+ long ret;
+
+ if (!bio->init) {
+ BIOerr(BIO_F_BIO_NREAD0, BIO_R_UNINITIALIZED);
+ return -2;
+ }
+
+ ret = BIO_ctrl(bio, BIO_C_NREAD0, 0, buf);
+ if (ret > INT_MAX)
+ return INT_MAX;
+ else
+ return (int)ret;
+}
+
+int BIO_nread(BIO *bio, char **buf, int num)
+{
+ int ret;
+
+ if (!bio->init) {
+ BIOerr(BIO_F_BIO_NREAD, BIO_R_UNINITIALIZED);
+ return -2;
+ }
+
+ ret = (int)BIO_ctrl(bio, BIO_C_NREAD, num, buf);
+ if (ret > 0)
+ bio->num_read += ret;
+ return ret;
+}
+
+int BIO_nwrite0(BIO *bio, char **buf)
+{
+ long ret;
+
+ if (!bio->init) {
+ BIOerr(BIO_F_BIO_NWRITE0, BIO_R_UNINITIALIZED);
+ return -2;
+ }
+
+ ret = BIO_ctrl(bio, BIO_C_NWRITE0, 0, buf);
+ if (ret > INT_MAX)
+ return INT_MAX;
+ else
+ return (int)ret;
+}
+
+int BIO_nwrite(BIO *bio, char **buf, int num)
+{
+ int ret;
+
+ if (!bio->init) {
+ BIOerr(BIO_F_BIO_NWRITE, BIO_R_UNINITIALIZED);
+ return -2;
+ }
+
+ ret = BIO_ctrl(bio, BIO_C_NWRITE, num, buf);
+ if (ret > 0)
+ bio->num_write += ret;
+ return ret;
+}
diff --git a/openssl-1.1.0h/crypto/bio/bss_conn.c b/openssl-1.1.0h/crypto/bio/bss_conn.c
new file mode 100644
index 0000000..e343bcd
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bss_conn.c
@@ -0,0 +1,543 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+
+#include "bio_lcl.h"
+
+#ifndef OPENSSL_NO_SOCK
+
+typedef struct bio_connect_st {
+ int state;
+ int connect_family;
+ char *param_hostname;
+ char *param_service;
+ int connect_mode;
+
+ BIO_ADDRINFO *addr_first;
+ const BIO_ADDRINFO *addr_iter;
+ /*
+ * int socket; this will be kept in bio->num so that it is compatible
+ * with the bss_sock bio
+ */
+ /*
+ * called when the connection is initially made callback(BIO,state,ret);
+ * The callback should return 'ret'. state is for compatibility with the
+ * ssl info_callback
+ */
+ BIO_info_cb *info_callback;
+} BIO_CONNECT;
+
+static int conn_write(BIO *h, const char *buf, int num);
+static int conn_read(BIO *h, char *buf, int size);
+static int conn_puts(BIO *h, const char *str);
+static long conn_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int conn_new(BIO *h);
+static int conn_free(BIO *data);
+static long conn_callback_ctrl(BIO *h, int cmd, BIO_info_cb *);
+
+static int conn_state(BIO *b, BIO_CONNECT *c);
+static void conn_close_socket(BIO *data);
+BIO_CONNECT *BIO_CONNECT_new(void);
+void BIO_CONNECT_free(BIO_CONNECT *a);
+
+#define BIO_CONN_S_BEFORE 1
+#define BIO_CONN_S_GET_ADDR 2
+#define BIO_CONN_S_CREATE_SOCKET 3
+#define BIO_CONN_S_CONNECT 4
+#define BIO_CONN_S_OK 5
+#define BIO_CONN_S_BLOCKED_CONNECT 6
+
+static const BIO_METHOD methods_connectp = {
+ BIO_TYPE_CONNECT,
+ "socket connect",
+ conn_write,
+ conn_read,
+ conn_puts,
+ NULL, /* conn_gets, */
+ conn_ctrl,
+ conn_new,
+ conn_free,
+ conn_callback_ctrl,
+};
+
+static int conn_state(BIO *b, BIO_CONNECT *c)
+{
+ int ret = -1, i;
+ BIO_info_cb *cb = NULL;
+
+ if (c->info_callback != NULL)
+ cb = c->info_callback;
+
+ for (;;) {
+ switch (c->state) {
+ case BIO_CONN_S_BEFORE:
+ if (c->param_hostname == NULL && c->param_service == NULL) {
+ BIOerr(BIO_F_CONN_STATE, BIO_R_NO_HOSTNAME_OR_SERVICE_SPECIFIED);
+ ERR_add_error_data(4,
+ "hostname=", c->param_hostname,
+ " service=", c->param_service);
+ goto exit_loop;
+ }
+ c->state = BIO_CONN_S_GET_ADDR;
+ break;
+
+ case BIO_CONN_S_GET_ADDR:
+ {
+ int family = AF_UNSPEC;
+ switch (c->connect_family) {
+ case BIO_FAMILY_IPV6:
+ if (1) { /* This is a trick we use to avoid bit rot.
+ * at least the "else" part will always be
+ * compiled.
+ */
+#ifdef AF_INET6
+ family = AF_INET6;
+ } else {
+#endif
+ BIOerr(BIO_F_CONN_STATE, BIO_R_UNAVAILABLE_IP_FAMILY);
+ goto exit_loop;
+ }
+ break;
+ case BIO_FAMILY_IPV4:
+ family = AF_INET;
+ break;
+ case BIO_FAMILY_IPANY:
+ family = AF_UNSPEC;
+ break;
+ default:
+ BIOerr(BIO_F_CONN_STATE, BIO_R_UNSUPPORTED_IP_FAMILY);
+ goto exit_loop;
+ }
+ if (BIO_lookup(c->param_hostname, c->param_service,
+ BIO_LOOKUP_CLIENT,
+ family, SOCK_STREAM, &c->addr_first) == 0)
+ goto exit_loop;
+ }
+ if (c->addr_first == NULL) {
+ BIOerr(BIO_F_CONN_STATE, BIO_R_LOOKUP_RETURNED_NOTHING);
+ goto exit_loop;
+ }
+ c->addr_iter = c->addr_first;
+ c->state = BIO_CONN_S_CREATE_SOCKET;
+ break;
+
+ case BIO_CONN_S_CREATE_SOCKET:
+ ret = BIO_socket(BIO_ADDRINFO_family(c->addr_iter),
+ BIO_ADDRINFO_socktype(c->addr_iter),
+ BIO_ADDRINFO_protocol(c->addr_iter), 0);
+ if (ret == (int)INVALID_SOCKET) {
+ SYSerr(SYS_F_SOCKET, get_last_socket_error());
+ ERR_add_error_data(4,
+ "hostname=", c->param_hostname,
+ " service=", c->param_service);
+ BIOerr(BIO_F_CONN_STATE, BIO_R_UNABLE_TO_CREATE_SOCKET);
+ goto exit_loop;
+ }
+ b->num = ret;
+ c->state = BIO_CONN_S_CONNECT;
+ break;
+
+ case BIO_CONN_S_CONNECT:
+ BIO_clear_retry_flags(b);
+ ret = BIO_connect(b->num, BIO_ADDRINFO_address(c->addr_iter),
+ BIO_SOCK_KEEPALIVE | c->connect_mode);
+ b->retry_reason = 0;
+ if (ret == 0) {
+ if (BIO_sock_should_retry(ret)) {
+ BIO_set_retry_special(b);
+ c->state = BIO_CONN_S_BLOCKED_CONNECT;
+ b->retry_reason = BIO_RR_CONNECT;
+ ERR_clear_error();
+ } else if ((c->addr_iter = BIO_ADDRINFO_next(c->addr_iter))
+ != NULL) {
+ /*
+ * if there are more addresses to try, do that first
+ */
+ BIO_closesocket(b->num);
+ c->state = BIO_CONN_S_CREATE_SOCKET;
+ ERR_clear_error();
+ break;
+ } else {
+ SYSerr(SYS_F_CONNECT, get_last_socket_error());
+ ERR_add_error_data(4,
+ "hostname=", c->param_hostname,
+ " service=", c->param_service);
+ BIOerr(BIO_F_CONN_STATE, BIO_R_CONNECT_ERROR);
+ }
+ goto exit_loop;
+ } else {
+ c->state = BIO_CONN_S_OK;
+ }
+ break;
+
+ case BIO_CONN_S_BLOCKED_CONNECT:
+ i = BIO_sock_error(b->num);
+ if (i) {
+ BIO_clear_retry_flags(b);
+ SYSerr(SYS_F_CONNECT, i);
+ ERR_add_error_data(4,
+ "hostname=", c->param_hostname,
+ " service=", c->param_service);
+ BIOerr(BIO_F_CONN_STATE, BIO_R_NBIO_CONNECT_ERROR);
+ ret = 0;
+ goto exit_loop;
+ } else
+ c->state = BIO_CONN_S_OK;
+ break;
+
+ case BIO_CONN_S_OK:
+ ret = 1;
+ goto exit_loop;
+ default:
+ /* abort(); */
+ goto exit_loop;
+ }
+
+ if (cb != NULL) {
+ if ((ret = cb((BIO *)b, c->state, ret)) == 0)
+ goto end;
+ }
+ }
+
+ /* Loop does not exit */
+ exit_loop:
+ if (cb != NULL)
+ ret = cb((BIO *)b, c->state, ret);
+ end:
+ return (ret);
+}
+
+BIO_CONNECT *BIO_CONNECT_new(void)
+{
+ BIO_CONNECT *ret;
+
+ if ((ret = OPENSSL_zalloc(sizeof(*ret))) == NULL)
+ return (NULL);
+ ret->state = BIO_CONN_S_BEFORE;
+ ret->connect_family = BIO_FAMILY_IPANY;
+ return (ret);
+}
+
+void BIO_CONNECT_free(BIO_CONNECT *a)
+{
+ if (a == NULL)
+ return;
+
+ OPENSSL_free(a->param_hostname);
+ OPENSSL_free(a->param_service);
+ BIO_ADDRINFO_free(a->addr_first);
+ OPENSSL_free(a);
+}
+
+const BIO_METHOD *BIO_s_connect(void)
+{
+ return (&methods_connectp);
+}
+
+static int conn_new(BIO *bi)
+{
+ bi->init = 0;
+ bi->num = (int)INVALID_SOCKET;
+ bi->flags = 0;
+ if ((bi->ptr = (char *)BIO_CONNECT_new()) == NULL)
+ return (0);
+ else
+ return (1);
+}
+
+static void conn_close_socket(BIO *bio)
+{
+ BIO_CONNECT *c;
+
+ c = (BIO_CONNECT *)bio->ptr;
+ if (bio->num != (int)INVALID_SOCKET) {
+ /* Only do a shutdown if things were established */
+ if (c->state == BIO_CONN_S_OK)
+ shutdown(bio->num, 2);
+ BIO_closesocket(bio->num);
+ bio->num = (int)INVALID_SOCKET;
+ }
+}
+
+static int conn_free(BIO *a)
+{
+ BIO_CONNECT *data;
+
+ if (a == NULL)
+ return (0);
+ data = (BIO_CONNECT *)a->ptr;
+
+ if (a->shutdown) {
+ conn_close_socket(a);
+ BIO_CONNECT_free(data);
+ a->ptr = NULL;
+ a->flags = 0;
+ a->init = 0;
+ }
+ return (1);
+}
+
+static int conn_read(BIO *b, char *out, int outl)
+{
+ int ret = 0;
+ BIO_CONNECT *data;
+
+ data = (BIO_CONNECT *)b->ptr;
+ if (data->state != BIO_CONN_S_OK) {
+ ret = conn_state(b, data);
+ if (ret <= 0)
+ return (ret);
+ }
+
+ if (out != NULL) {
+ clear_socket_error();
+ ret = readsocket(b->num, out, outl);
+ BIO_clear_retry_flags(b);
+ if (ret <= 0) {
+ if (BIO_sock_should_retry(ret))
+ BIO_set_retry_read(b);
+ }
+ }
+ return (ret);
+}
+
+static int conn_write(BIO *b, const char *in, int inl)
+{
+ int ret;
+ BIO_CONNECT *data;
+
+ data = (BIO_CONNECT *)b->ptr;
+ if (data->state != BIO_CONN_S_OK) {
+ ret = conn_state(b, data);
+ if (ret <= 0)
+ return (ret);
+ }
+
+ clear_socket_error();
+ ret = writesocket(b->num, in, inl);
+ BIO_clear_retry_flags(b);
+ if (ret <= 0) {
+ if (BIO_sock_should_retry(ret))
+ BIO_set_retry_write(b);
+ }
+ return (ret);
+}
+
+static long conn_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ BIO *dbio;
+ int *ip;
+ const char **pptr = NULL;
+ long ret = 1;
+ BIO_CONNECT *data;
+
+ data = (BIO_CONNECT *)b->ptr;
+
+ switch (cmd) {
+ case BIO_CTRL_RESET:
+ ret = 0;
+ data->state = BIO_CONN_S_BEFORE;
+ conn_close_socket(b);
+ BIO_ADDRINFO_free(data->addr_first);
+ data->addr_first = NULL;
+ b->flags = 0;
+ break;
+ case BIO_C_DO_STATE_MACHINE:
+ /* use this one to start the connection */
+ if (data->state != BIO_CONN_S_OK)
+ ret = (long)conn_state(b, data);
+ else
+ ret = 1;
+ break;
+ case BIO_C_GET_CONNECT:
+ if (ptr != NULL) {
+ pptr = (const char **)ptr;
+ if (num == 0) {
+ *pptr = data->param_hostname;
+ } else if (num == 1) {
+ *pptr = data->param_service;
+ } else if (num == 2) {
+ *pptr = (const char *)BIO_ADDRINFO_address(data->addr_iter);
+ } else if (num == 3) {
+ switch (BIO_ADDRINFO_family(data->addr_iter)) {
+# ifdef AF_INET6
+ case AF_INET6:
+ ret = BIO_FAMILY_IPV6;
+ break;
+# endif
+ case AF_INET:
+ ret = BIO_FAMILY_IPV4;
+ break;
+ case 0:
+ ret = data->connect_family;
+ break;
+ default:
+ ret = -1;
+ break;
+ }
+ } else {
+ ret = 0;
+ }
+ } else {
+ ret = 0;
+ }
+ break;
+ case BIO_C_SET_CONNECT:
+ if (ptr != NULL) {
+ b->init = 1;
+ if (num == 0) {
+ char *hold_service = data->param_service;
+ /* We affect the hostname regardless. However, the input
+ * string might contain a host:service spec, so we must
+ * parse it, which might or might not affect the service
+ */
+ OPENSSL_free(data->param_hostname);
+ data->param_hostname = NULL;
+ ret = BIO_parse_hostserv(ptr,
+ &data->param_hostname,
+ &data->param_service,
+ BIO_PARSE_PRIO_HOST);
+ if (hold_service != data->param_service)
+ OPENSSL_free(hold_service);
+ } else if (num == 1) {
+ OPENSSL_free(data->param_service);
+ data->param_service = BUF_strdup(ptr);
+ } else if (num == 2) {
+ const BIO_ADDR *addr = (const BIO_ADDR *)ptr;
+ if (ret) {
+ data->param_hostname = BIO_ADDR_hostname_string(addr, 1);
+ data->param_service = BIO_ADDR_service_string(addr, 1);
+ BIO_ADDRINFO_free(data->addr_first);
+ data->addr_first = NULL;
+ data->addr_iter = NULL;
+ }
+ } else if (num == 3) {
+ data->connect_family = *(int *)ptr;
+ } else {
+ ret = 0;
+ }
+ }
+ break;
+ case BIO_C_SET_NBIO:
+ if (num != 0)
+ data->connect_mode |= BIO_SOCK_NONBLOCK;
+ else
+ data->connect_mode &= ~BIO_SOCK_NONBLOCK;
+ break;
+ case BIO_C_SET_CONNECT_MODE:
+ data->connect_mode = (int)num;
+ break;
+ case BIO_C_GET_FD:
+ if (b->init) {
+ ip = (int *)ptr;
+ if (ip != NULL)
+ *ip = b->num;
+ ret = b->num;
+ } else
+ ret = -1;
+ break;
+ case BIO_CTRL_GET_CLOSE:
+ ret = b->shutdown;
+ break;
+ case BIO_CTRL_SET_CLOSE:
+ b->shutdown = (int)num;
+ break;
+ case BIO_CTRL_PENDING:
+ case BIO_CTRL_WPENDING:
+ ret = 0;
+ break;
+ case BIO_CTRL_FLUSH:
+ break;
+ case BIO_CTRL_DUP:
+ {
+ dbio = (BIO *)ptr;
+ if (data->param_hostname)
+ BIO_set_conn_hostname(dbio, data->param_hostname);
+ if (data->param_service)
+ BIO_set_conn_port(dbio, data->param_service);
+ BIO_set_conn_ip_family(dbio, data->connect_family);
+ BIO_set_conn_mode(dbio, data->connect_mode);
+ /*
+ * FIXME: the cast of the function seems unlikely to be a good
+ * idea
+ */
+ (void)BIO_set_info_callback(dbio, data->info_callback);
+ }
+ break;
+ case BIO_CTRL_SET_CALLBACK:
+ {
+# if 0 /* FIXME: Should this be used? -- Richard
+ * Levitte */
+ BIOerr(BIO_F_CONN_CTRL, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ ret = -1;
+# else
+ ret = 0;
+# endif
+ }
+ break;
+ case BIO_CTRL_GET_CALLBACK:
+ {
+ BIO_info_cb **fptr;
+
+ fptr = (BIO_info_cb **)ptr;
+ *fptr = data->info_callback;
+ }
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ return (ret);
+}
+
+static long conn_callback_ctrl(BIO *b, int cmd, BIO_info_cb *fp)
+{
+ long ret = 1;
+ BIO_CONNECT *data;
+
+ data = (BIO_CONNECT *)b->ptr;
+
+ switch (cmd) {
+ case BIO_CTRL_SET_CALLBACK:
+ {
+ data->info_callback = fp;
+ }
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ return (ret);
+}
+
+static int conn_puts(BIO *bp, const char *str)
+{
+ int n, ret;
+
+ n = strlen(str);
+ ret = conn_write(bp, str, n);
+ return (ret);
+}
+
+BIO *BIO_new_connect(const char *str)
+{
+ BIO *ret;
+
+ ret = BIO_new(BIO_s_connect());
+ if (ret == NULL)
+ return (NULL);
+ if (BIO_set_conn_hostname(ret, str))
+ return (ret);
+ BIO_free(ret);
+ return (NULL);
+}
+
+#endif
diff --git a/openssl-1.1.0h/crypto/bio/bss_dgram.c b/openssl-1.1.0h/crypto/bio/bss_dgram.c
new file mode 100644
index 0000000..c772d95
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bss_dgram.c
@@ -0,0 +1,1923 @@
+/*
+ * Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+
+#include "bio_lcl.h"
+#ifndef OPENSSL_NO_DGRAM
+
+# if !(defined(_WIN32) || defined(OPENSSL_SYS_VMS))
+# include <sys/time.h>
+# endif
+# if defined(OPENSSL_SYS_VMS)
+# include <sys/timeb.h>
+# endif
+
+# ifndef OPENSSL_NO_SCTP
+# include <netinet/sctp.h>
+# include <fcntl.h>
+# define OPENSSL_SCTP_DATA_CHUNK_TYPE 0x00
+# define OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE 0xc0
+# endif
+
+# if defined(OPENSSL_SYS_LINUX) && !defined(IP_MTU)
+# define IP_MTU 14 /* linux is lame */
+# endif
+
+# if OPENSSL_USE_IPV6 && !defined(IPPROTO_IPV6)
+# define IPPROTO_IPV6 41 /* windows is lame */
+# endif
+
+# if defined(__FreeBSD__) && defined(IN6_IS_ADDR_V4MAPPED)
+/* Standard definition causes type-punning problems. */
+# undef IN6_IS_ADDR_V4MAPPED
+# define s6_addr32 __u6_addr.__u6_addr32
+# define IN6_IS_ADDR_V4MAPPED(a) \
+ (((a)->s6_addr32[0] == 0) && \
+ ((a)->s6_addr32[1] == 0) && \
+ ((a)->s6_addr32[2] == htonl(0x0000ffff)))
+# endif
+
+static int dgram_write(BIO *h, const char *buf, int num);
+static int dgram_read(BIO *h, char *buf, int size);
+static int dgram_puts(BIO *h, const char *str);
+static long dgram_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int dgram_new(BIO *h);
+static int dgram_free(BIO *data);
+static int dgram_clear(BIO *bio);
+
+# ifndef OPENSSL_NO_SCTP
+static int dgram_sctp_write(BIO *h, const char *buf, int num);
+static int dgram_sctp_read(BIO *h, char *buf, int size);
+static int dgram_sctp_puts(BIO *h, const char *str);
+static long dgram_sctp_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int dgram_sctp_new(BIO *h);
+static int dgram_sctp_free(BIO *data);
+# ifdef SCTP_AUTHENTICATION_EVENT
+static void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification
+ *snp);
+# endif
+# endif
+
+static int BIO_dgram_should_retry(int s);
+
+static void get_current_time(struct timeval *t);
+
+static const BIO_METHOD methods_dgramp = {
+ BIO_TYPE_DGRAM,
+ "datagram socket",
+ dgram_write,
+ dgram_read,
+ dgram_puts,
+ NULL, /* dgram_gets, */
+ dgram_ctrl,
+ dgram_new,
+ dgram_free,
+ NULL, /* dgram_callback_ctrl */
+};
+
+# ifndef OPENSSL_NO_SCTP
+static const BIO_METHOD methods_dgramp_sctp = {
+ BIO_TYPE_DGRAM_SCTP,
+ "datagram sctp socket",
+ dgram_sctp_write,
+ dgram_sctp_read,
+ dgram_sctp_puts,
+ NULL, /* dgram_gets, */
+ dgram_sctp_ctrl,
+ dgram_sctp_new,
+ dgram_sctp_free,
+ NULL, /* dgram_callback_ctrl */
+};
+# endif
+
+typedef struct bio_dgram_data_st {
+ BIO_ADDR peer;
+ unsigned int connected;
+ unsigned int _errno;
+ unsigned int mtu;
+ struct timeval next_timeout;
+ struct timeval socket_timeout;
+ unsigned int peekmode;
+} bio_dgram_data;
+
+# ifndef OPENSSL_NO_SCTP
+typedef struct bio_dgram_sctp_save_message_st {
+ BIO *bio;
+ char *data;
+ int length;
+} bio_dgram_sctp_save_message;
+
+typedef struct bio_dgram_sctp_data_st {
+ BIO_ADDR peer;
+ unsigned int connected;
+ unsigned int _errno;
+ unsigned int mtu;
+ struct bio_dgram_sctp_sndinfo sndinfo;
+ struct bio_dgram_sctp_rcvinfo rcvinfo;
+ struct bio_dgram_sctp_prinfo prinfo;
+ void (*handle_notifications) (BIO *bio, void *context, void *buf);
+ void *notification_context;
+ int in_handshake;
+ int ccs_rcvd;
+ int ccs_sent;
+ int save_shutdown;
+ int peer_auth_tested;
+} bio_dgram_sctp_data;
+# endif
+
+const BIO_METHOD *BIO_s_datagram(void)
+{
+ return (&methods_dgramp);
+}
+
+BIO *BIO_new_dgram(int fd, int close_flag)
+{
+ BIO *ret;
+
+ ret = BIO_new(BIO_s_datagram());
+ if (ret == NULL)
+ return (NULL);
+ BIO_set_fd(ret, fd, close_flag);
+ return (ret);
+}
+
+static int dgram_new(BIO *bi)
+{
+ bio_dgram_data *data = OPENSSL_zalloc(sizeof(*data));
+
+ if (data == NULL)
+ return 0;
+ bi->ptr = data;
+ return (1);
+}
+
+static int dgram_free(BIO *a)
+{
+ bio_dgram_data *data;
+
+ if (a == NULL)
+ return (0);
+ if (!dgram_clear(a))
+ return 0;
+
+ data = (bio_dgram_data *)a->ptr;
+ OPENSSL_free(data);
+
+ return (1);
+}
+
+static int dgram_clear(BIO *a)
+{
+ if (a == NULL)
+ return (0);
+ if (a->shutdown) {
+ if (a->init) {
+ BIO_closesocket(a->num);
+ }
+ a->init = 0;
+ a->flags = 0;
+ }
+ return (1);
+}
+
+static void dgram_adjust_rcv_timeout(BIO *b)
+{
+# if defined(SO_RCVTIMEO)
+ bio_dgram_data *data = (bio_dgram_data *)b->ptr;
+ union {
+ size_t s;
+ int i;
+ } sz = {
+ 0
+ };
+
+ /* Is a timer active? */
+ if (data->next_timeout.tv_sec > 0 || data->next_timeout.tv_usec > 0) {
+ struct timeval timenow, timeleft;
+
+ /* Read current socket timeout */
+# ifdef OPENSSL_SYS_WINDOWS
+ int timeout;
+
+ sz.i = sizeof(timeout);
+ if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
+ (void *)&timeout, &sz.i) < 0) {
+ perror("getsockopt");
+ } else {
+ data->socket_timeout.tv_sec = timeout / 1000;
+ data->socket_timeout.tv_usec = (timeout % 1000) * 1000;
+ }
+# else
+ sz.i = sizeof(data->socket_timeout);
+ if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
+ &(data->socket_timeout), (void *)&sz) < 0) {
+ perror("getsockopt");
+ } else if (sizeof(sz.s) != sizeof(sz.i) && sz.i == 0)
+ OPENSSL_assert(sz.s <= sizeof(data->socket_timeout));
+# endif
+
+ /* Get current time */
+ get_current_time(&timenow);
+
+ /* Calculate time left until timer expires */
+ memcpy(&timeleft, &(data->next_timeout), sizeof(struct timeval));
+ if (timeleft.tv_usec < timenow.tv_usec) {
+ timeleft.tv_usec = 1000000 - timenow.tv_usec + timeleft.tv_usec;
+ timeleft.tv_sec--;
+ } else {
+ timeleft.tv_usec -= timenow.tv_usec;
+ }
+ if (timeleft.tv_sec < timenow.tv_sec) {
+ timeleft.tv_sec = 0;
+ timeleft.tv_usec = 1;
+ } else {
+ timeleft.tv_sec -= timenow.tv_sec;
+ }
+
+ /*
+ * Adjust socket timeout if next handshake message timer will expire
+ * earlier.
+ */
+ if ((data->socket_timeout.tv_sec == 0
+ && data->socket_timeout.tv_usec == 0)
+ || (data->socket_timeout.tv_sec > timeleft.tv_sec)
+ || (data->socket_timeout.tv_sec == timeleft.tv_sec
+ && data->socket_timeout.tv_usec >= timeleft.tv_usec)) {
+# ifdef OPENSSL_SYS_WINDOWS
+ timeout = timeleft.tv_sec * 1000 + timeleft.tv_usec / 1000;
+ if (setsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
+ (void *)&timeout, sizeof(timeout)) < 0) {
+ perror("setsockopt");
+ }
+# else
+ if (setsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO, &timeleft,
+ sizeof(struct timeval)) < 0) {
+ perror("setsockopt");
+ }
+# endif
+ }
+ }
+# endif
+}
+
+static void dgram_reset_rcv_timeout(BIO *b)
+{
+# if defined(SO_RCVTIMEO)
+ bio_dgram_data *data = (bio_dgram_data *)b->ptr;
+
+ /* Is a timer active? */
+ if (data->next_timeout.tv_sec > 0 || data->next_timeout.tv_usec > 0) {
+# ifdef OPENSSL_SYS_WINDOWS
+ int timeout = data->socket_timeout.tv_sec * 1000 +
+ data->socket_timeout.tv_usec / 1000;
+ if (setsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
+ (void *)&timeout, sizeof(timeout)) < 0) {
+ perror("setsockopt");
+ }
+# else
+ if (setsockopt
+ (b->num, SOL_SOCKET, SO_RCVTIMEO, &(data->socket_timeout),
+ sizeof(struct timeval)) < 0) {
+ perror("setsockopt");
+ }
+# endif
+ }
+# endif
+}
+
+static int dgram_read(BIO *b, char *out, int outl)
+{
+ int ret = 0;
+ bio_dgram_data *data = (bio_dgram_data *)b->ptr;
+ int flags = 0;
+
+ BIO_ADDR peer;
+ socklen_t len = sizeof(peer);
+
+ if (out != NULL) {
+ clear_socket_error();
+ memset(&peer, 0, sizeof(peer));
+ dgram_adjust_rcv_timeout(b);
+ if (data->peekmode)
+ flags = MSG_PEEK;
+ ret = recvfrom(b->num, out, outl, flags,
+ BIO_ADDR_sockaddr_noconst(&peer), &len);
+
+ if (!data->connected && ret >= 0)
+ BIO_ctrl(b, BIO_CTRL_DGRAM_SET_PEER, 0, &peer);
+
+ BIO_clear_retry_flags(b);
+ if (ret < 0) {
+ if (BIO_dgram_should_retry(ret)) {
+ BIO_set_retry_read(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+
+ dgram_reset_rcv_timeout(b);
+ }
+ return (ret);
+}
+
+static int dgram_write(BIO *b, const char *in, int inl)
+{
+ int ret;
+ bio_dgram_data *data = (bio_dgram_data *)b->ptr;
+ clear_socket_error();
+
+ if (data->connected)
+ ret = writesocket(b->num, in, inl);
+ else {
+ int peerlen = BIO_ADDR_sockaddr_size(&data->peer);
+
+# if defined(NETWARE_CLIB) && defined(NETWARE_BSDSOCK)
+ ret = sendto(b->num, (char *)in, inl, 0,
+ BIO_ADDR_sockaddr(&data->peer), peerlen);
+# else
+ ret = sendto(b->num, in, inl, 0,
+ BIO_ADDR_sockaddr(&data->peer), peerlen);
+# endif
+ }
+
+ BIO_clear_retry_flags(b);
+ if (ret <= 0) {
+ if (BIO_dgram_should_retry(ret)) {
+ BIO_set_retry_write(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+ return (ret);
+}
+
+static long dgram_get_mtu_overhead(bio_dgram_data *data)
+{
+ long ret;
+
+ switch (BIO_ADDR_family(&data->peer)) {
+ case AF_INET:
+ /*
+ * Assume this is UDP - 20 bytes for IP, 8 bytes for UDP
+ */
+ ret = 28;
+ break;
+# ifdef AF_INET6
+ case AF_INET6:
+ {
+# ifdef IN6_IS_ADDR_V4MAPPED
+ struct in6_addr tmp_addr;
+ if (BIO_ADDR_rawaddress(&data->peer, &tmp_addr, NULL)
+ && IN6_IS_ADDR_V4MAPPED(&tmp_addr))
+ /*
+ * Assume this is UDP - 20 bytes for IP, 8 bytes for UDP
+ */
+ ret = 28;
+ else
+# endif
+ /*
+ * Assume this is UDP - 40 bytes for IP, 8 bytes for UDP
+ */
+ ret = 48;
+ }
+ break;
+# endif
+ default:
+ /* We don't know. Go with the historical default */
+ ret = 28;
+ break;
+ }
+ return ret;
+}
+
+static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ long ret = 1;
+ int *ip;
+ bio_dgram_data *data = NULL;
+ int sockopt_val = 0;
+ int d_errno;
+# if defined(OPENSSL_SYS_LINUX) && (defined(IP_MTU_DISCOVER) || defined(IP_MTU))
+ socklen_t sockopt_len; /* assume that system supporting IP_MTU is
+ * modern enough to define socklen_t */
+ socklen_t addr_len;
+ BIO_ADDR addr;
+# endif
+
+ data = (bio_dgram_data *)b->ptr;
+
+ switch (cmd) {
+ case BIO_CTRL_RESET:
+ num = 0;
+ ret = 0;
+ break;
+ case BIO_CTRL_INFO:
+ ret = 0;
+ break;
+ case BIO_C_SET_FD:
+ dgram_clear(b);
+ b->num = *((int *)ptr);
+ b->shutdown = (int)num;
+ b->init = 1;
+ break;
+ case BIO_C_GET_FD:
+ if (b->init) {
+ ip = (int *)ptr;
+ if (ip != NULL)
+ *ip = b->num;
+ ret = b->num;
+ } else
+ ret = -1;
+ break;
+ case BIO_CTRL_GET_CLOSE:
+ ret = b->shutdown;
+ break;
+ case BIO_CTRL_SET_CLOSE:
+ b->shutdown = (int)num;
+ break;
+ case BIO_CTRL_PENDING:
+ case BIO_CTRL_WPENDING:
+ ret = 0;
+ break;
+ case BIO_CTRL_DUP:
+ case BIO_CTRL_FLUSH:
+ ret = 1;
+ break;
+ case BIO_CTRL_DGRAM_CONNECT:
+ BIO_ADDR_make(&data->peer, BIO_ADDR_sockaddr((BIO_ADDR *)ptr));
+ break;
+ /* (Linux)kernel sets DF bit on outgoing IP packets */
+ case BIO_CTRL_DGRAM_MTU_DISCOVER:
+# if defined(OPENSSL_SYS_LINUX) && defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DO)
+ addr_len = (socklen_t) sizeof(addr);
+ memset(&addr, 0, sizeof(addr));
+ if (getsockname(b->num, &addr.sa, &addr_len) < 0) {
+ ret = 0;
+ break;
+ }
+ switch (addr.sa.sa_family) {
+ case AF_INET:
+ sockopt_val = IP_PMTUDISC_DO;
+ if ((ret = setsockopt(b->num, IPPROTO_IP, IP_MTU_DISCOVER,
+ &sockopt_val, sizeof(sockopt_val))) < 0)
+ perror("setsockopt");
+ break;
+# if OPENSSL_USE_IPV6 && defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DO)
+ case AF_INET6:
+ sockopt_val = IPV6_PMTUDISC_DO;
+ if ((ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
+ &sockopt_val, sizeof(sockopt_val))) < 0)
+ perror("setsockopt");
+ break;
+# endif
+ default:
+ ret = -1;
+ break;
+ }
+# else
+ ret = -1;
+# endif
+ break;
+ case BIO_CTRL_DGRAM_QUERY_MTU:
+# if defined(OPENSSL_SYS_LINUX) && defined(IP_MTU)
+ addr_len = (socklen_t) sizeof(addr);
+ memset(&addr, 0, sizeof(addr));
+ if (getsockname(b->num, &addr.sa, &addr_len) < 0) {
+ ret = 0;
+ break;
+ }
+ sockopt_len = sizeof(sockopt_val);
+ switch (addr.sa.sa_family) {
+ case AF_INET:
+ if ((ret =
+ getsockopt(b->num, IPPROTO_IP, IP_MTU, (void *)&sockopt_val,
+ &sockopt_len)) < 0 || sockopt_val < 0) {
+ ret = 0;
+ } else {
+ /*
+ * we assume that the transport protocol is UDP and no IP
+ * options are used.
+ */
+ data->mtu = sockopt_val - 8 - 20;
+ ret = data->mtu;
+ }
+ break;
+# if OPENSSL_USE_IPV6 && defined(IPV6_MTU)
+ case AF_INET6:
+ if ((ret =
+ getsockopt(b->num, IPPROTO_IPV6, IPV6_MTU,
+ (void *)&sockopt_val, &sockopt_len)) < 0
+ || sockopt_val < 0) {
+ ret = 0;
+ } else {
+ /*
+ * we assume that the transport protocol is UDP and no IPV6
+ * options are used.
+ */
+ data->mtu = sockopt_val - 8 - 40;
+ ret = data->mtu;
+ }
+ break;
+# endif
+ default:
+ ret = 0;
+ break;
+ }
+# else
+ ret = 0;
+# endif
+ break;
+ case BIO_CTRL_DGRAM_GET_FALLBACK_MTU:
+ ret = -dgram_get_mtu_overhead(data);
+ switch (BIO_ADDR_family(&data->peer)) {
+ case AF_INET:
+ ret += 576;
+ break;
+# if OPENSSL_USE_IPV6
+ case AF_INET6:
+ {
+# ifdef IN6_IS_ADDR_V4MAPPED
+ struct in6_addr tmp_addr;
+ if (BIO_ADDR_rawaddress(&data->peer, &tmp_addr, NULL)
+ && IN6_IS_ADDR_V4MAPPED(&tmp_addr))
+ ret += 576;
+ else
+# endif
+ ret += 1280;
+ }
+ break;
+# endif
+ default:
+ ret += 576;
+ break;
+ }
+ break;
+ case BIO_CTRL_DGRAM_GET_MTU:
+ return data->mtu;
+ case BIO_CTRL_DGRAM_SET_MTU:
+ data->mtu = num;
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SET_CONNECTED:
+ if (ptr != NULL) {
+ data->connected = 1;
+ BIO_ADDR_make(&data->peer, BIO_ADDR_sockaddr((BIO_ADDR *)ptr));
+ } else {
+ data->connected = 0;
+ memset(&data->peer, 0, sizeof(data->peer));
+ }
+ break;
+ case BIO_CTRL_DGRAM_GET_PEER:
+ ret = BIO_ADDR_sockaddr_size(&data->peer);
+ /* FIXME: if num < ret, we will only return part of an address.
+ That should bee an error, no? */
+ if (num == 0 || num > ret)
+ num = ret;
+ memcpy(ptr, &data->peer, (ret = num));
+ break;
+ case BIO_CTRL_DGRAM_SET_PEER:
+ BIO_ADDR_make(&data->peer, BIO_ADDR_sockaddr((BIO_ADDR *)ptr));
+ break;
+ case BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT:
+ memcpy(&(data->next_timeout), ptr, sizeof(struct timeval));
+ break;
+# if defined(SO_RCVTIMEO)
+ case BIO_CTRL_DGRAM_SET_RECV_TIMEOUT:
+# ifdef OPENSSL_SYS_WINDOWS
+ {
+ struct timeval *tv = (struct timeval *)ptr;
+ int timeout = tv->tv_sec * 1000 + tv->tv_usec / 1000;
+ if (setsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
+ (void *)&timeout, sizeof(timeout)) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+ }
+# else
+ if (setsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO, ptr,
+ sizeof(struct timeval)) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# endif
+ break;
+ case BIO_CTRL_DGRAM_GET_RECV_TIMEOUT:
+ {
+ union {
+ size_t s;
+ int i;
+ } sz = {
+ 0
+ };
+# ifdef OPENSSL_SYS_WINDOWS
+ int timeout;
+ struct timeval *tv = (struct timeval *)ptr;
+
+ sz.i = sizeof(timeout);
+ if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
+ (void *)&timeout, &sz.i) < 0) {
+ perror("getsockopt");
+ ret = -1;
+ } else {
+ tv->tv_sec = timeout / 1000;
+ tv->tv_usec = (timeout % 1000) * 1000;
+ ret = sizeof(*tv);
+ }
+# else
+ sz.i = sizeof(struct timeval);
+ if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
+ ptr, (void *)&sz) < 0) {
+ perror("getsockopt");
+ ret = -1;
+ } else if (sizeof(sz.s) != sizeof(sz.i) && sz.i == 0) {
+ OPENSSL_assert(sz.s <= sizeof(struct timeval));
+ ret = (int)sz.s;
+ } else
+ ret = sz.i;
+# endif
+ }
+ break;
+# endif
+# if defined(SO_SNDTIMEO)
+ case BIO_CTRL_DGRAM_SET_SEND_TIMEOUT:
+# ifdef OPENSSL_SYS_WINDOWS
+ {
+ struct timeval *tv = (struct timeval *)ptr;
+ int timeout = tv->tv_sec * 1000 + tv->tv_usec / 1000;
+ if (setsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO,
+ (void *)&timeout, sizeof(timeout)) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+ }
+# else
+ if (setsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO, ptr,
+ sizeof(struct timeval)) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# endif
+ break;
+ case BIO_CTRL_DGRAM_GET_SEND_TIMEOUT:
+ {
+ union {
+ size_t s;
+ int i;
+ } sz = {
+ 0
+ };
+# ifdef OPENSSL_SYS_WINDOWS
+ int timeout;
+ struct timeval *tv = (struct timeval *)ptr;
+
+ sz.i = sizeof(timeout);
+ if (getsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO,
+ (void *)&timeout, &sz.i) < 0) {
+ perror("getsockopt");
+ ret = -1;
+ } else {
+ tv->tv_sec = timeout / 1000;
+ tv->tv_usec = (timeout % 1000) * 1000;
+ ret = sizeof(*tv);
+ }
+# else
+ sz.i = sizeof(struct timeval);
+ if (getsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO,
+ ptr, (void *)&sz) < 0) {
+ perror("getsockopt");
+ ret = -1;
+ } else if (sizeof(sz.s) != sizeof(sz.i) && sz.i == 0) {
+ OPENSSL_assert(sz.s <= sizeof(struct timeval));
+ ret = (int)sz.s;
+ } else
+ ret = sz.i;
+# endif
+ }
+ break;
+# endif
+ case BIO_CTRL_DGRAM_GET_SEND_TIMER_EXP:
+ /* fall-through */
+ case BIO_CTRL_DGRAM_GET_RECV_TIMER_EXP:
+# ifdef OPENSSL_SYS_WINDOWS
+ d_errno = (data->_errno == WSAETIMEDOUT);
+# else
+ d_errno = (data->_errno == EAGAIN);
+# endif
+ if (d_errno) {
+ ret = 1;
+ data->_errno = 0;
+ } else
+ ret = 0;
+ break;
+# ifdef EMSGSIZE
+ case BIO_CTRL_DGRAM_MTU_EXCEEDED:
+ if (data->_errno == EMSGSIZE) {
+ ret = 1;
+ data->_errno = 0;
+ } else
+ ret = 0;
+ break;
+# endif
+ case BIO_CTRL_DGRAM_SET_DONT_FRAG:
+ sockopt_val = num ? 1 : 0;
+
+ switch (data->peer.sa.sa_family) {
+ case AF_INET:
+# if defined(IP_DONTFRAG)
+ if ((ret = setsockopt(b->num, IPPROTO_IP, IP_DONTFRAG,
+ &sockopt_val, sizeof(sockopt_val))) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# elif defined(OPENSSL_SYS_LINUX) && defined(IP_MTU_DISCOVER) && defined (IP_PMTUDISC_PROBE)
+ if ((sockopt_val = num ? IP_PMTUDISC_PROBE : IP_PMTUDISC_DONT),
+ (ret = setsockopt(b->num, IPPROTO_IP, IP_MTU_DISCOVER,
+ &sockopt_val, sizeof(sockopt_val))) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# elif defined(OPENSSL_SYS_WINDOWS) && defined(IP_DONTFRAGMENT)
+ if ((ret = setsockopt(b->num, IPPROTO_IP, IP_DONTFRAGMENT,
+ (const char *)&sockopt_val,
+ sizeof(sockopt_val))) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# else
+ ret = -1;
+# endif
+ break;
+# if OPENSSL_USE_IPV6
+ case AF_INET6:
+# if defined(IPV6_DONTFRAG)
+ if ((ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_DONTFRAG,
+ (const void *)&sockopt_val,
+ sizeof(sockopt_val))) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# elif defined(OPENSSL_SYS_LINUX) && defined(IPV6_MTUDISCOVER)
+ if ((sockopt_val = num ? IP_PMTUDISC_PROBE : IP_PMTUDISC_DONT),
+ (ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
+ &sockopt_val, sizeof(sockopt_val))) < 0) {
+ perror("setsockopt");
+ ret = -1;
+ }
+# else
+ ret = -1;
+# endif
+ break;
+# endif
+ default:
+ ret = -1;
+ break;
+ }
+ break;
+ case BIO_CTRL_DGRAM_GET_MTU_OVERHEAD:
+ ret = dgram_get_mtu_overhead(data);
+ break;
+
+ /*
+ * BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE is used here for compatibility
+ * reasons. When BIO_CTRL_DGRAM_SET_PEEK_MODE was first defined its value
+ * was incorrectly clashing with BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE. The
+ * value has been updated to a non-clashing value. However to preserve
+ * binary compatiblity we now respond to both the old value and the new one
+ */
+ case BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE:
+ case BIO_CTRL_DGRAM_SET_PEEK_MODE:
+ data->peekmode = (unsigned int)num;
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ return (ret);
+}
+
+static int dgram_puts(BIO *bp, const char *str)
+{
+ int n, ret;
+
+ n = strlen(str);
+ ret = dgram_write(bp, str, n);
+ return (ret);
+}
+
+# ifndef OPENSSL_NO_SCTP
+const BIO_METHOD *BIO_s_datagram_sctp(void)
+{
+ return (&methods_dgramp_sctp);
+}
+
+BIO *BIO_new_dgram_sctp(int fd, int close_flag)
+{
+ BIO *bio;
+ int ret, optval = 20000;
+ int auth_data = 0, auth_forward = 0;
+ unsigned char *p;
+ struct sctp_authchunk auth;
+ struct sctp_authchunks *authchunks;
+ socklen_t sockopt_len;
+# ifdef SCTP_AUTHENTICATION_EVENT
+# ifdef SCTP_EVENT
+ struct sctp_event event;
+# else
+ struct sctp_event_subscribe event;
+# endif
+# endif
+
+ bio = BIO_new(BIO_s_datagram_sctp());
+ if (bio == NULL)
+ return (NULL);
+ BIO_set_fd(bio, fd, close_flag);
+
+ /* Activate SCTP-AUTH for DATA and FORWARD-TSN chunks */
+ auth.sauth_chunk = OPENSSL_SCTP_DATA_CHUNK_TYPE;
+ ret =
+ setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth,
+ sizeof(struct sctp_authchunk));
+ if (ret < 0) {
+ BIO_vfree(bio);
+ return (NULL);
+ }
+ auth.sauth_chunk = OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE;
+ ret =
+ setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth,
+ sizeof(struct sctp_authchunk));
+ if (ret < 0) {
+ BIO_vfree(bio);
+ return (NULL);
+ }
+
+ /*
+ * Test if activation was successful. When using accept(), SCTP-AUTH has
+ * to be activated for the listening socket already, otherwise the
+ * connected socket won't use it.
+ */
+ sockopt_len = (socklen_t) (sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+ authchunks = OPENSSL_zalloc(sockopt_len);
+ if (authchunks == NULL) {
+ BIO_vfree(bio);
+ return (NULL);
+ }
+ ret = getsockopt(fd, IPPROTO_SCTP, SCTP_LOCAL_AUTH_CHUNKS, authchunks,
+ &sockopt_len);
+ if (ret < 0) {
+ OPENSSL_free(authchunks);
+ BIO_vfree(bio);
+ return (NULL);
+ }
+
+ for (p = (unsigned char *)authchunks->gauth_chunks;
+ p < (unsigned char *)authchunks + sockopt_len;
+ p += sizeof(uint8_t)) {
+ if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE)
+ auth_data = 1;
+ if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE)
+ auth_forward = 1;
+ }
+
+ OPENSSL_free(authchunks);
+
+ OPENSSL_assert(auth_data);
+ OPENSSL_assert(auth_forward);
+
+# ifdef SCTP_AUTHENTICATION_EVENT
+# ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_AUTHENTICATION_EVENT;
+ event.se_on = 1;
+ ret =
+ setsockopt(fd, IPPROTO_SCTP, SCTP_EVENT, &event,
+ sizeof(struct sctp_event));
+ if (ret < 0) {
+ BIO_vfree(bio);
+ return (NULL);
+ }
+# else
+ sockopt_len = (socklen_t) sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, &sockopt_len);
+ if (ret < 0) {
+ BIO_vfree(bio);
+ return (NULL);
+ }
+
+ event.sctp_authentication_event = 1;
+
+ ret =
+ setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event,
+ sizeof(struct sctp_event_subscribe));
+ if (ret < 0) {
+ BIO_vfree(bio);
+ return (NULL);
+ }
+# endif
+# endif
+
+ /*
+ * Disable partial delivery by setting the min size larger than the max
+ * record size of 2^14 + 2048 + 13
+ */
+ ret =
+ setsockopt(fd, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT, &optval,
+ sizeof(optval));
+ if (ret < 0) {
+ BIO_vfree(bio);
+ return (NULL);
+ }
+
+ return (bio);
+}
+
+int BIO_dgram_is_sctp(BIO *bio)
+{
+ return (BIO_method_type(bio) == BIO_TYPE_DGRAM_SCTP);
+}
+
+static int dgram_sctp_new(BIO *bi)
+{
+ bio_dgram_sctp_data *data = NULL;
+
+ bi->init = 0;
+ bi->num = 0;
+ data = OPENSSL_zalloc(sizeof(*data));
+ if (data == NULL)
+ return 0;
+# ifdef SCTP_PR_SCTP_NONE
+ data->prinfo.pr_policy = SCTP_PR_SCTP_NONE;
+# endif
+ bi->ptr = data;
+
+ bi->flags = 0;
+ return (1);
+}
+
+static int dgram_sctp_free(BIO *a)
+{
+ bio_dgram_sctp_data *data;
+
+ if (a == NULL)
+ return (0);
+ if (!dgram_clear(a))
+ return 0;
+
+ data = (bio_dgram_sctp_data *) a->ptr;
+ if (data != NULL)
+ OPENSSL_free(data);
+
+ return (1);
+}
+
+# ifdef SCTP_AUTHENTICATION_EVENT
+void dgram_sctp_handle_auth_free_key_event(BIO *b,
+ union sctp_notification *snp)
+{
+ int ret;
+ struct sctp_authkey_event *authkeyevent = &snp->sn_auth_event;
+
+ if (authkeyevent->auth_indication == SCTP_AUTH_FREE_KEY) {
+ struct sctp_authkeyid authkeyid;
+
+ /* delete key */
+ authkeyid.scact_keynumber = authkeyevent->auth_keynumber;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ }
+}
+# endif
+
+static int dgram_sctp_read(BIO *b, char *out, int outl)
+{
+ int ret = 0, n = 0, i, optval;
+ socklen_t optlen;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr;
+ union sctp_notification *snp;
+ struct msghdr msg;
+ struct iovec iov;
+ struct cmsghdr *cmsg;
+ char cmsgbuf[512];
+
+ if (out != NULL) {
+ clear_socket_error();
+
+ do {
+ memset(&data->rcvinfo, 0, sizeof(data->rcvinfo));
+ iov.iov_base = out;
+ iov.iov_len = outl;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = 512;
+ msg.msg_flags = 0;
+ n = recvmsg(b->num, &msg, 0);
+
+ if (n <= 0) {
+ if (n < 0)
+ ret = n;
+ break;
+ }
+
+ if (msg.msg_controllen > 0) {
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
+ cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+ if (cmsg->cmsg_level != IPPROTO_SCTP)
+ continue;
+# ifdef SCTP_RCVINFO
+ if (cmsg->cmsg_type == SCTP_RCVINFO) {
+ struct sctp_rcvinfo *rcvinfo;
+
+ rcvinfo = (struct sctp_rcvinfo *)CMSG_DATA(cmsg);
+ data->rcvinfo.rcv_sid = rcvinfo->rcv_sid;
+ data->rcvinfo.rcv_ssn = rcvinfo->rcv_ssn;
+ data->rcvinfo.rcv_flags = rcvinfo->rcv_flags;
+ data->rcvinfo.rcv_ppid = rcvinfo->rcv_ppid;
+ data->rcvinfo.rcv_tsn = rcvinfo->rcv_tsn;
+ data->rcvinfo.rcv_cumtsn = rcvinfo->rcv_cumtsn;
+ data->rcvinfo.rcv_context = rcvinfo->rcv_context;
+ }
+# endif
+# ifdef SCTP_SNDRCV
+ if (cmsg->cmsg_type == SCTP_SNDRCV) {
+ struct sctp_sndrcvinfo *sndrcvinfo;
+
+ sndrcvinfo =
+ (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ data->rcvinfo.rcv_sid = sndrcvinfo->sinfo_stream;
+ data->rcvinfo.rcv_ssn = sndrcvinfo->sinfo_ssn;
+ data->rcvinfo.rcv_flags = sndrcvinfo->sinfo_flags;
+ data->rcvinfo.rcv_ppid = sndrcvinfo->sinfo_ppid;
+ data->rcvinfo.rcv_tsn = sndrcvinfo->sinfo_tsn;
+ data->rcvinfo.rcv_cumtsn = sndrcvinfo->sinfo_cumtsn;
+ data->rcvinfo.rcv_context = sndrcvinfo->sinfo_context;
+ }
+# endif
+ }
+ }
+
+ if (msg.msg_flags & MSG_NOTIFICATION) {
+ snp = (union sctp_notification *)out;
+ if (snp->sn_header.sn_type == SCTP_SENDER_DRY_EVENT) {
+# ifdef SCTP_EVENT
+ struct sctp_event event;
+# else
+ struct sctp_event_subscribe event;
+ socklen_t eventsize;
+# endif
+
+ /* disable sender dry event */
+# ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 0;
+ i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event,
+ sizeof(struct sctp_event));
+ if (i < 0) {
+ ret = i;
+ break;
+ }
+# else
+ eventsize = sizeof(struct sctp_event_subscribe);
+ i = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event,
+ &eventsize);
+ if (i < 0) {
+ ret = i;
+ break;
+ }
+
+ event.sctp_sender_dry_event = 0;
+
+ i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event,
+ sizeof(struct sctp_event_subscribe));
+ if (i < 0) {
+ ret = i;
+ break;
+ }
+# endif
+ }
+# ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp->sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, snp);
+# endif
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context,
+ (void *)out);
+
+ memset(out, 0, outl);
+ } else
+ ret += n;
+ }
+ while ((msg.msg_flags & MSG_NOTIFICATION) && (msg.msg_flags & MSG_EOR)
+ && (ret < outl));
+
+ if (ret > 0 && !(msg.msg_flags & MSG_EOR)) {
+ /* Partial message read, this should never happen! */
+
+ /*
+ * The buffer was too small, this means the peer sent a message
+ * that was larger than allowed.
+ */
+ if (ret == outl)
+ return -1;
+
+ /*
+ * Test if socket buffer can handle max record size (2^14 + 2048
+ * + 13)
+ */
+ optlen = (socklen_t) sizeof(int);
+ ret = getsockopt(b->num, SOL_SOCKET, SO_RCVBUF, &optval, &optlen);
+ if (ret >= 0)
+ OPENSSL_assert(optval >= 18445);
+
+ /*
+ * Test if SCTP doesn't partially deliver below max record size
+ * (2^14 + 2048 + 13)
+ */
+ optlen = (socklen_t) sizeof(int);
+ ret =
+ getsockopt(b->num, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT,
+ &optval, &optlen);
+ if (ret >= 0)
+ OPENSSL_assert(optval >= 18445);
+
+ /*
+ * Partially delivered notification??? Probably a bug....
+ */
+ OPENSSL_assert(!(msg.msg_flags & MSG_NOTIFICATION));
+
+ /*
+ * Everything seems ok till now, so it's most likely a message
+ * dropped by PR-SCTP.
+ */
+ memset(out, 0, outl);
+ BIO_set_retry_read(b);
+ return -1;
+ }
+
+ BIO_clear_retry_flags(b);
+ if (ret < 0) {
+ if (BIO_dgram_should_retry(ret)) {
+ BIO_set_retry_read(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+
+ /* Test if peer uses SCTP-AUTH before continuing */
+ if (!data->peer_auth_tested) {
+ int ii, auth_data = 0, auth_forward = 0;
+ unsigned char *p;
+ struct sctp_authchunks *authchunks;
+
+ optlen =
+ (socklen_t) (sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+ authchunks = OPENSSL_malloc(optlen);
+ if (authchunks == NULL) {
+ BIOerr(BIO_F_DGRAM_SCTP_READ, ERR_R_MALLOC_FAILURE);
+ return -1;
+ }
+ memset(authchunks, 0, optlen);
+ ii = getsockopt(b->num, IPPROTO_SCTP, SCTP_PEER_AUTH_CHUNKS,
+ authchunks, &optlen);
+
+ if (ii >= 0)
+ for (p = (unsigned char *)authchunks->gauth_chunks;
+ p < (unsigned char *)authchunks + optlen;
+ p += sizeof(uint8_t)) {
+ if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE)
+ auth_data = 1;
+ if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE)
+ auth_forward = 1;
+ }
+
+ OPENSSL_free(authchunks);
+
+ if (!auth_data || !auth_forward) {
+ BIOerr(BIO_F_DGRAM_SCTP_READ, BIO_R_CONNECT_ERROR);
+ return -1;
+ }
+
+ data->peer_auth_tested = 1;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * dgram_sctp_write - send message on SCTP socket
+ * @b: BIO to write to
+ * @in: data to send
+ * @inl: amount of bytes in @in to send
+ *
+ * Returns -1 on error or the sent amount of bytes on success
+ */
+static int dgram_sctp_write(BIO *b, const char *in, int inl)
+{
+ int ret;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr;
+ struct bio_dgram_sctp_sndinfo *sinfo = &(data->sndinfo);
+ struct bio_dgram_sctp_prinfo *pinfo = &(data->prinfo);
+ struct bio_dgram_sctp_sndinfo handshake_sinfo;
+ struct iovec iov[1];
+ struct msghdr msg;
+ struct cmsghdr *cmsg;
+# if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+ char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo)) +
+ CMSG_SPACE(sizeof(struct sctp_prinfo))];
+ struct sctp_sndinfo *sndinfo;
+ struct sctp_prinfo *prinfo;
+# else
+ char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+ struct sctp_sndrcvinfo *sndrcvinfo;
+# endif
+
+ clear_socket_error();
+
+ /*
+ * If we're send anything else than application data, disable all user
+ * parameters and flags.
+ */
+ if (in[0] != 23) {
+ memset(&handshake_sinfo, 0, sizeof(handshake_sinfo));
+# ifdef SCTP_SACK_IMMEDIATELY
+ handshake_sinfo.snd_flags = SCTP_SACK_IMMEDIATELY;
+# endif
+ sinfo = &handshake_sinfo;
+ }
+
+ /* We can only send a shutdown alert if the socket is dry */
+ if (data->save_shutdown) {
+ ret = BIO_dgram_sctp_wait_for_dry(b);
+ if (ret < 0)
+ return -1;
+ if (ret == 0) {
+ BIO_clear_retry_flags(b);
+ BIO_set_retry_write(b);
+ return -1;
+ }
+ }
+
+ iov[0].iov_base = (char *)in;
+ iov[0].iov_len = inl;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = (caddr_t) cmsgbuf;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+# if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+ cmsg = (struct cmsghdr *)cmsgbuf;
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDINFO;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndinfo));
+ sndinfo = (struct sctp_sndinfo *)CMSG_DATA(cmsg);
+ memset(sndinfo, 0, sizeof(*sndinfo));
+ sndinfo->snd_sid = sinfo->snd_sid;
+ sndinfo->snd_flags = sinfo->snd_flags;
+ sndinfo->snd_ppid = sinfo->snd_ppid;
+ sndinfo->snd_context = sinfo->snd_context;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndinfo));
+
+ cmsg =
+ (struct cmsghdr *)&cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo))];
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_PRINFO;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_prinfo));
+ prinfo = (struct sctp_prinfo *)CMSG_DATA(cmsg);
+ memset(prinfo, 0, sizeof(*prinfo));
+ prinfo->pr_policy = pinfo->pr_policy;
+ prinfo->pr_value = pinfo->pr_value;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_prinfo));
+# else
+ cmsg = (struct cmsghdr *)cmsgbuf;
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDRCV;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+ sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ memset(sndrcvinfo, 0, sizeof(*sndrcvinfo));
+ sndrcvinfo->sinfo_stream = sinfo->snd_sid;
+ sndrcvinfo->sinfo_flags = sinfo->snd_flags;
+# ifdef __FreeBSD__
+ sndrcvinfo->sinfo_flags |= pinfo->pr_policy;
+# endif
+ sndrcvinfo->sinfo_ppid = sinfo->snd_ppid;
+ sndrcvinfo->sinfo_context = sinfo->snd_context;
+ sndrcvinfo->sinfo_timetolive = pinfo->pr_value;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo));
+# endif
+
+ ret = sendmsg(b->num, &msg, 0);
+
+ BIO_clear_retry_flags(b);
+ if (ret <= 0) {
+ if (BIO_dgram_should_retry(ret)) {
+ BIO_set_retry_write(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+ return (ret);
+}
+
+static long dgram_sctp_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ long ret = 1;
+ bio_dgram_sctp_data *data = NULL;
+ socklen_t sockopt_len = 0;
+ struct sctp_authkeyid authkeyid;
+ struct sctp_authkey *authkey = NULL;
+
+ data = (bio_dgram_sctp_data *) b->ptr;
+
+ switch (cmd) {
+ case BIO_CTRL_DGRAM_QUERY_MTU:
+ /*
+ * Set to maximum (2^14) and ignore user input to enable transport
+ * protocol fragmentation. Returns always 2^14.
+ */
+ data->mtu = 16384;
+ ret = data->mtu;
+ break;
+ case BIO_CTRL_DGRAM_SET_MTU:
+ /*
+ * Set to maximum (2^14) and ignore input to enable transport
+ * protocol fragmentation. Returns always 2^14.
+ */
+ data->mtu = 16384;
+ ret = data->mtu;
+ break;
+ case BIO_CTRL_DGRAM_SET_CONNECTED:
+ case BIO_CTRL_DGRAM_CONNECT:
+ /* Returns always -1. */
+ ret = -1;
+ break;
+ case BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT:
+ /*
+ * SCTP doesn't need the DTLS timer Returns always 1.
+ */
+ break;
+ case BIO_CTRL_DGRAM_GET_MTU_OVERHEAD:
+ /*
+ * We allow transport protocol fragmentation so this is irrelevant
+ */
+ ret = 0;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE:
+ if (num > 0)
+ data->in_handshake = 1;
+ else
+ data->in_handshake = 0;
+
+ ret =
+ setsockopt(b->num, IPPROTO_SCTP, SCTP_NODELAY,
+ &data->in_handshake, sizeof(int));
+ break;
+ case BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY:
+ /*
+ * New shared key for SCTP AUTH. Returns 0 on success, -1 otherwise.
+ */
+
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret =
+ getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid,
+ &sockopt_len);
+ if (ret < 0)
+ break;
+
+ /* Add new key */
+ sockopt_len = sizeof(struct sctp_authkey) + 64 * sizeof(uint8_t);
+ authkey = OPENSSL_malloc(sockopt_len);
+ if (authkey == NULL) {
+ ret = -1;
+ break;
+ }
+ memset(authkey, 0, sockopt_len);
+ authkey->sca_keynumber = authkeyid.scact_keynumber + 1;
+# ifndef __FreeBSD__
+ /*
+ * This field is missing in FreeBSD 8.2 and earlier, and FreeBSD 8.3
+ * and higher work without it.
+ */
+ authkey->sca_keylength = 64;
+# endif
+ memcpy(&authkey->sca_key[0], ptr, 64 * sizeof(uint8_t));
+
+ ret =
+ setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_KEY, authkey,
+ sockopt_len);
+ OPENSSL_free(authkey);
+ authkey = NULL;
+ if (ret < 0)
+ break;
+
+ /* Reset active key */
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0)
+ break;
+
+ break;
+ case BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY:
+ /* Returns 0 on success, -1 otherwise. */
+
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret =
+ getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid,
+ &sockopt_len);
+ if (ret < 0)
+ break;
+
+ /* Set active key */
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber + 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0)
+ break;
+
+ /*
+ * CCS has been sent, so remember that and fall through to check if
+ * we need to deactivate an old key
+ */
+ data->ccs_sent = 1;
+ /* fall-through */
+
+ case BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD:
+ /* Returns 0 on success, -1 otherwise. */
+
+ /*
+ * Has this command really been called or is this just a
+ * fall-through?
+ */
+ if (cmd == BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD)
+ data->ccs_rcvd = 1;
+
+ /*
+ * CSS has been both, received and sent, so deactivate an old key
+ */
+ if (data->ccs_rcvd == 1 && data->ccs_sent == 1) {
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret =
+ getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+ &authkeyid, &sockopt_len);
+ if (ret < 0)
+ break;
+
+ /*
+ * Deactivate key or delete second last key if
+ * SCTP_AUTHENTICATION_EVENT is not available.
+ */
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+# ifdef SCTP_AUTH_DEACTIVATE_KEY
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DEACTIVATE_KEY,
+ &authkeyid, sockopt_len);
+ if (ret < 0)
+ break;
+# endif
+# ifndef SCTP_AUTHENTICATION_EVENT
+ if (authkeyid.scact_keynumber > 0) {
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0)
+ break;
+ }
+# endif
+
+ data->ccs_rcvd = 0;
+ data->ccs_sent = 0;
+ }
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_SNDINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long)sizeof(struct bio_dgram_sctp_sndinfo))
+ num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+ memcpy(ptr, &(data->sndinfo), num);
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_SNDINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long)sizeof(struct bio_dgram_sctp_sndinfo))
+ num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+ memcpy(&(data->sndinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_RCVINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long)sizeof(struct bio_dgram_sctp_rcvinfo))
+ num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+ memcpy(ptr, &data->rcvinfo, num);
+
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_RCVINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long)sizeof(struct bio_dgram_sctp_rcvinfo))
+ num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+ memcpy(&(data->rcvinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_PRINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long)sizeof(struct bio_dgram_sctp_prinfo))
+ num = sizeof(struct bio_dgram_sctp_prinfo);
+
+ memcpy(ptr, &(data->prinfo), num);
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_PRINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long)sizeof(struct bio_dgram_sctp_prinfo))
+ num = sizeof(struct bio_dgram_sctp_prinfo);
+
+ memcpy(&(data->prinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN:
+ /* Returns always 1. */
+ if (num > 0)
+ data->save_shutdown = 1;
+ else
+ data->save_shutdown = 0;
+ break;
+
+ default:
+ /*
+ * Pass to default ctrl function to process SCTP unspecific commands
+ */
+ ret = dgram_ctrl(b, cmd, num, ptr);
+ break;
+ }
+ return (ret);
+}
+
+int BIO_dgram_sctp_notification_cb(BIO *b,
+ void (*handle_notifications) (BIO *bio,
+ void
+ *context,
+ void *buf),
+ void *context)
+{
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr;
+
+ if (handle_notifications != NULL) {
+ data->handle_notifications = handle_notifications;
+ data->notification_context = context;
+ } else
+ return -1;
+
+ return 0;
+}
+
+/*
+ * BIO_dgram_sctp_wait_for_dry - Wait for SCTP SENDER_DRY event
+ * @b: The BIO to check for the dry event
+ *
+ * Wait until the peer confirms all packets have been received, and so that
+ * our kernel doesn't have anything to send anymore. This is only received by
+ * the peer's kernel, not the application.
+ *
+ * Returns:
+ * -1 on error
+ * 0 when not dry yet
+ * 1 when dry
+ */
+int BIO_dgram_sctp_wait_for_dry(BIO *b)
+{
+ int is_dry = 0;
+ int sockflags = 0;
+ int n, ret;
+ union sctp_notification snp;
+ struct msghdr msg;
+ struct iovec iov;
+# ifdef SCTP_EVENT
+ struct sctp_event event;
+# else
+ struct sctp_event_subscribe event;
+ socklen_t eventsize;
+# endif
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr;
+
+ /* set sender dry event */
+# ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 1;
+ ret =
+ setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event,
+ sizeof(struct sctp_event));
+# else
+ eventsize = sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ if (ret < 0)
+ return -1;
+
+ event.sctp_sender_dry_event = 1;
+
+ ret =
+ setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event,
+ sizeof(struct sctp_event_subscribe));
+# endif
+ if (ret < 0)
+ return -1;
+
+ /* peek for notification */
+ memset(&snp, 0, sizeof(snp));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+ if (n <= 0) {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN)
+ && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return 0;
+ }
+
+ /* if we find a notification, process it and try again if necessary */
+ while (msg.msg_flags & MSG_NOTIFICATION) {
+ memset(&snp, 0, sizeof(snp));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ n = recvmsg(b->num, &msg, 0);
+ if (n <= 0) {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN)
+ && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return is_dry;
+ }
+
+ if (snp.sn_header.sn_type == SCTP_SENDER_DRY_EVENT) {
+ is_dry = 1;
+
+ /* disable sender dry event */
+# ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 0;
+ ret =
+ setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event,
+ sizeof(struct sctp_event));
+# else
+ eventsize = (socklen_t) sizeof(struct sctp_event_subscribe);
+ ret =
+ getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event,
+ &eventsize);
+ if (ret < 0)
+ return -1;
+
+ event.sctp_sender_dry_event = 0;
+
+ ret =
+ setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event,
+ sizeof(struct sctp_event_subscribe));
+# endif
+ if (ret < 0)
+ return -1;
+ }
+# ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, &snp);
+# endif
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context,
+ (void *)&snp);
+
+ /* found notification, peek again */
+ memset(&snp, 0, sizeof(snp));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ /* if we have seen the dry already, don't wait */
+ if (is_dry) {
+ sockflags = fcntl(b->num, F_GETFL, 0);
+ fcntl(b->num, F_SETFL, O_NONBLOCK);
+ }
+
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+
+ if (is_dry) {
+ fcntl(b->num, F_SETFL, sockflags);
+ }
+
+ if (n <= 0) {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN)
+ && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return is_dry;
+ }
+ }
+
+ /* read anything else */
+ return is_dry;
+}
+
+int BIO_dgram_sctp_msg_waiting(BIO *b)
+{
+ int n, sockflags;
+ union sctp_notification snp;
+ struct msghdr msg;
+ struct iovec iov;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr;
+
+ /* Check if there are any messages waiting to be read */
+ do {
+ memset(&snp, 0, sizeof(snp));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ sockflags = fcntl(b->num, F_GETFL, 0);
+ fcntl(b->num, F_SETFL, O_NONBLOCK);
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+ fcntl(b->num, F_SETFL, sockflags);
+
+ /* if notification, process and try again */
+ if (n > 0 && (msg.msg_flags & MSG_NOTIFICATION)) {
+# ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, &snp);
+# endif
+
+ memset(&snp, 0, sizeof(snp));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+ n = recvmsg(b->num, &msg, 0);
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context,
+ (void *)&snp);
+ }
+
+ } while (n > 0 && (msg.msg_flags & MSG_NOTIFICATION));
+
+ /* Return 1 if there is a message to be read, return 0 otherwise. */
+ if (n > 0)
+ return 1;
+ else
+ return 0;
+}
+
+static int dgram_sctp_puts(BIO *bp, const char *str)
+{
+ int n, ret;
+
+ n = strlen(str);
+ ret = dgram_sctp_write(bp, str, n);
+ return (ret);
+}
+# endif
+
+static int BIO_dgram_should_retry(int i)
+{
+ int err;
+
+ if ((i == 0) || (i == -1)) {
+ err = get_last_socket_error();
+
+# if defined(OPENSSL_SYS_WINDOWS)
+ /*
+ * If the socket return value (i) is -1 and err is unexpectedly 0 at
+ * this point, the error code was overwritten by another system call
+ * before this error handling is called.
+ */
+# endif
+
+ return (BIO_dgram_non_fatal_error(err));
+ }
+ return (0);
+}
+
+int BIO_dgram_non_fatal_error(int err)
+{
+ switch (err) {
+# if defined(OPENSSL_SYS_WINDOWS)
+# if defined(WSAEWOULDBLOCK)
+ case WSAEWOULDBLOCK:
+# endif
+# endif
+
+# ifdef EWOULDBLOCK
+# ifdef WSAEWOULDBLOCK
+# if WSAEWOULDBLOCK != EWOULDBLOCK
+ case EWOULDBLOCK:
+# endif
+# else
+ case EWOULDBLOCK:
+# endif
+# endif
+
+# ifdef EINTR
+ case EINTR:
+# endif
+
+# ifdef EAGAIN
+# if EWOULDBLOCK != EAGAIN
+ case EAGAIN:
+# endif
+# endif
+
+# ifdef EPROTO
+ case EPROTO:
+# endif
+
+# ifdef EINPROGRESS
+ case EINPROGRESS:
+# endif
+
+# ifdef EALREADY
+ case EALREADY:
+# endif
+
+ return (1);
+ /* break; */
+ default:
+ break;
+ }
+ return (0);
+}
+
+static void get_current_time(struct timeval *t)
+{
+# if defined(_WIN32)
+ SYSTEMTIME st;
+ union {
+ unsigned __int64 ul;
+ FILETIME ft;
+ } now;
+
+ GetSystemTime(&st);
+ SystemTimeToFileTime(&st, &now.ft);
+# ifdef __MINGW32__
+ now.ul -= 116444736000000000ULL;
+# else
+ now.ul -= 116444736000000000UI64; /* re-bias to 1/1/1970 */
+# endif
+ t->tv_sec = (long)(now.ul / 10000000);
+ t->tv_usec = ((int)(now.ul % 10000000)) / 10;
+# elif defined(OPENSSL_SYS_VMS)
+ struct timeb tb;
+ ftime(&tb);
+ t->tv_sec = (long)tb.time;
+ t->tv_usec = (long)tb.millitm * 1000;
+# else
+ gettimeofday(t, NULL);
+# endif
+}
+
+#endif
diff --git a/openssl-1.1.0h/crypto/bio/bss_fd.c b/openssl-1.1.0h/crypto/bio/bss_fd.c
new file mode 100644
index 0000000..2bd3517
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bss_fd.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+
+#include "bio_lcl.h"
+
+#if defined(OPENSSL_NO_POSIX_IO)
+/*
+ * Dummy placeholder for BIO_s_fd...
+ */
+BIO *BIO_new_fd(int fd, int close_flag)
+{
+ return NULL;
+}
+
+int BIO_fd_non_fatal_error(int err)
+{
+ return 0;
+}
+
+int BIO_fd_should_retry(int i)
+{
+ return 0;
+}
+
+const BIO_METHOD *BIO_s_fd(void)
+{
+ return NULL;
+}
+#else
+/*
+ * As for unconditional usage of "UPLINK" interface in this module.
+ * Trouble is that unlike Unix file descriptors [which are indexes
+ * in kernel-side per-process table], corresponding descriptors on
+ * platforms which require "UPLINK" interface seem to be indexes
+ * in a user-land, non-global table. Well, in fact they are indexes
+ * in stdio _iob[], and recall that _iob[] was the very reason why
+ * "UPLINK" interface was introduced in first place. But one way on
+ * another. Neither libcrypto or libssl use this BIO meaning that
+ * file descriptors can only be provided by application. Therefore
+ * "UPLINK" calls are due...
+ */
+static int fd_write(BIO *h, const char *buf, int num);
+static int fd_read(BIO *h, char *buf, int size);
+static int fd_puts(BIO *h, const char *str);
+static int fd_gets(BIO *h, char *buf, int size);
+static long fd_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int fd_new(BIO *h);
+static int fd_free(BIO *data);
+int BIO_fd_should_retry(int s);
+
+static const BIO_METHOD methods_fdp = {
+ BIO_TYPE_FD,
+ "file descriptor",
+ fd_write,
+ fd_read,
+ fd_puts,
+ fd_gets,
+ fd_ctrl,
+ fd_new,
+ fd_free,
+ NULL, /* fd_callback_ctrl */
+};
+
+const BIO_METHOD *BIO_s_fd(void)
+{
+ return (&methods_fdp);
+}
+
+BIO *BIO_new_fd(int fd, int close_flag)
+{
+ BIO *ret;
+ ret = BIO_new(BIO_s_fd());
+ if (ret == NULL)
+ return (NULL);
+ BIO_set_fd(ret, fd, close_flag);
+ return (ret);
+}
+
+static int fd_new(BIO *bi)
+{
+ bi->init = 0;
+ bi->num = -1;
+ bi->ptr = NULL;
+ bi->flags = BIO_FLAGS_UPLINK; /* essentially redundant */
+ return (1);
+}
+
+static int fd_free(BIO *a)
+{
+ if (a == NULL)
+ return (0);
+ if (a->shutdown) {
+ if (a->init) {
+ UP_close(a->num);
+ }
+ a->init = 0;
+ a->flags = BIO_FLAGS_UPLINK;
+ }
+ return (1);
+}
+
+static int fd_read(BIO *b, char *out, int outl)
+{
+ int ret = 0;
+
+ if (out != NULL) {
+ clear_sys_error();
+ ret = UP_read(b->num, out, outl);
+ BIO_clear_retry_flags(b);
+ if (ret <= 0) {
+ if (BIO_fd_should_retry(ret))
+ BIO_set_retry_read(b);
+ }
+ }
+ return (ret);
+}
+
+static int fd_write(BIO *b, const char *in, int inl)
+{
+ int ret;
+ clear_sys_error();
+ ret = UP_write(b->num, in, inl);
+ BIO_clear_retry_flags(b);
+ if (ret <= 0) {
+ if (BIO_fd_should_retry(ret))
+ BIO_set_retry_write(b);
+ }
+ return (ret);
+}
+
+static long fd_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ long ret = 1;
+ int *ip;
+
+ switch (cmd) {
+ case BIO_CTRL_RESET:
+ num = 0;
+ /* fall thru */
+ case BIO_C_FILE_SEEK:
+ ret = (long)UP_lseek(b->num, num, 0);
+ break;
+ case BIO_C_FILE_TELL:
+ case BIO_CTRL_INFO:
+ ret = (long)UP_lseek(b->num, 0, 1);
+ break;
+ case BIO_C_SET_FD:
+ fd_free(b);
+ b->num = *((int *)ptr);
+ b->shutdown = (int)num;
+ b->init = 1;
+ break;
+ case BIO_C_GET_FD:
+ if (b->init) {
+ ip = (int *)ptr;
+ if (ip != NULL)
+ *ip = b->num;
+ ret = b->num;
+ } else
+ ret = -1;
+ break;
+ case BIO_CTRL_GET_CLOSE:
+ ret = b->shutdown;
+ break;
+ case BIO_CTRL_SET_CLOSE:
+ b->shutdown = (int)num;
+ break;
+ case BIO_CTRL_PENDING:
+ case BIO_CTRL_WPENDING:
+ ret = 0;
+ break;
+ case BIO_CTRL_DUP:
+ case BIO_CTRL_FLUSH:
+ ret = 1;
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ return (ret);
+}
+
+static int fd_puts(BIO *bp, const char *str)
+{
+ int n, ret;
+
+ n = strlen(str);
+ ret = fd_write(bp, str, n);
+ return (ret);
+}
+
+static int fd_gets(BIO *bp, char *buf, int size)
+{
+ int ret = 0;
+ char *ptr = buf;
+ char *end = buf + size - 1;
+
+ while ((ptr < end) && (fd_read(bp, ptr, 1) > 0) && (ptr[0] != '\n'))
+ ptr++;
+
+ ptr[0] = '\0';
+
+ if (buf[0] != '\0')
+ ret = strlen(buf);
+ return (ret);
+}
+
+int BIO_fd_should_retry(int i)
+{
+ int err;
+
+ if ((i == 0) || (i == -1)) {
+ err = get_last_sys_error();
+
+ return (BIO_fd_non_fatal_error(err));
+ }
+ return (0);
+}
+
+int BIO_fd_non_fatal_error(int err)
+{
+ switch (err) {
+
+# ifdef EWOULDBLOCK
+# ifdef WSAEWOULDBLOCK
+# if WSAEWOULDBLOCK != EWOULDBLOCK
+ case EWOULDBLOCK:
+# endif
+# else
+ case EWOULDBLOCK:
+# endif
+# endif
+
+# if defined(ENOTCONN)
+ case ENOTCONN:
+# endif
+
+# ifdef EINTR
+ case EINTR:
+# endif
+
+# ifdef EAGAIN
+# if EWOULDBLOCK != EAGAIN
+ case EAGAIN:
+# endif
+# endif
+
+# ifdef EPROTO
+ case EPROTO:
+# endif
+
+# ifdef EINPROGRESS
+ case EINPROGRESS:
+# endif
+
+# ifdef EALREADY
+ case EALREADY:
+# endif
+ return (1);
+ /* break; */
+ default:
+ break;
+ }
+ return (0);
+}
+#endif
diff --git a/openssl-1.1.0h/crypto/bio/bss_file.c b/openssl-1.1.0h/crypto/bio/bss_file.c
new file mode 100644
index 0000000..2edf244
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bss_file.c
@@ -0,0 +1,424 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*-
+ * 03-Dec-1997 rdenny@dc3.com Fix bug preventing use of stdin/stdout
+ * with binary data (e.g. asn1parse -inform DER < xxx) under
+ * Windows
+ */
+
+#ifndef HEADER_BSS_FILE_C
+# define HEADER_BSS_FILE_C
+
+# if defined(__linux) || defined(__sun) || defined(__hpux)
+/*
+ * Following definition aliases fopen to fopen64 on above mentioned
+ * platforms. This makes it possible to open and sequentially access files
+ * larger than 2GB from 32-bit application. It does not allow to traverse
+ * them beyond 2GB with fseek/ftell, but on the other hand *no* 32-bit
+ * platform permits that, not with fseek/ftell. Not to mention that breaking
+ * 2GB limit for seeking would require surgery to *our* API. But sequential
+ * access suffices for practical cases when you can run into large files,
+ * such as fingerprinting, so we can let API alone. For reference, the list
+ * of 32-bit platforms which allow for sequential access of large files
+ * without extra "magic" comprise *BSD, Darwin, IRIX...
+ */
+# ifndef _FILE_OFFSET_BITS
+# define _FILE_OFFSET_BITS 64
+# endif
+# endif
+
+# include <stdio.h>
+# include <errno.h>
+# include "bio_lcl.h"
+# include <openssl/err.h>
+
+# if !defined(OPENSSL_NO_STDIO)
+
+static int file_write(BIO *h, const char *buf, int num);
+static int file_read(BIO *h, char *buf, int size);
+static int file_puts(BIO *h, const char *str);
+static int file_gets(BIO *h, char *str, int size);
+static long file_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int file_new(BIO *h);
+static int file_free(BIO *data);
+static const BIO_METHOD methods_filep = {
+ BIO_TYPE_FILE,
+ "FILE pointer",
+ file_write,
+ file_read,
+ file_puts,
+ file_gets,
+ file_ctrl,
+ file_new,
+ file_free,
+ NULL, /* file_callback_ctrl */
+};
+
+BIO *BIO_new_file(const char *filename, const char *mode)
+{
+ BIO *ret;
+ FILE *file = openssl_fopen(filename, mode);
+ int fp_flags = BIO_CLOSE;
+
+ if (strchr(mode, 'b') == NULL)
+ fp_flags |= BIO_FP_TEXT;
+
+ if (file == NULL) {
+ SYSerr(SYS_F_FOPEN, get_last_sys_error());
+ ERR_add_error_data(5, "fopen('", filename, "','", mode, "')");
+ if (errno == ENOENT
+# ifdef ENXIO
+ || errno == ENXIO
+# endif
+ )
+ BIOerr(BIO_F_BIO_NEW_FILE, BIO_R_NO_SUCH_FILE);
+ else
+ BIOerr(BIO_F_BIO_NEW_FILE, ERR_R_SYS_LIB);
+ return (NULL);
+ }
+ if ((ret = BIO_new(BIO_s_file())) == NULL) {
+ fclose(file);
+ return (NULL);
+ }
+
+ BIO_clear_flags(ret, BIO_FLAGS_UPLINK); /* we did fopen -> we disengage
+ * UPLINK */
+ BIO_set_fp(ret, file, fp_flags);
+ return (ret);
+}
+
+BIO *BIO_new_fp(FILE *stream, int close_flag)
+{
+ BIO *ret;
+
+ if ((ret = BIO_new(BIO_s_file())) == NULL)
+ return (NULL);
+
+ /* redundant flag, left for documentation purposes */
+ BIO_set_flags(ret, BIO_FLAGS_UPLINK);
+ BIO_set_fp(ret, stream, close_flag);
+ return (ret);
+}
+
+const BIO_METHOD *BIO_s_file(void)
+{
+ return (&methods_filep);
+}
+
+static int file_new(BIO *bi)
+{
+ bi->init = 0;
+ bi->num = 0;
+ bi->ptr = NULL;
+ bi->flags = BIO_FLAGS_UPLINK; /* default to UPLINK */
+ return (1);
+}
+
+static int file_free(BIO *a)
+{
+ if (a == NULL)
+ return (0);
+ if (a->shutdown) {
+ if ((a->init) && (a->ptr != NULL)) {
+ if (a->flags & BIO_FLAGS_UPLINK)
+ UP_fclose(a->ptr);
+ else
+ fclose(a->ptr);
+ a->ptr = NULL;
+ a->flags = BIO_FLAGS_UPLINK;
+ }
+ a->init = 0;
+ }
+ return (1);
+}
+
+static int file_read(BIO *b, char *out, int outl)
+{
+ int ret = 0;
+
+ if (b->init && (out != NULL)) {
+ if (b->flags & BIO_FLAGS_UPLINK)
+ ret = UP_fread(out, 1, (int)outl, b->ptr);
+ else
+ ret = fread(out, 1, (int)outl, (FILE *)b->ptr);
+ if (ret == 0
+ && (b->flags & BIO_FLAGS_UPLINK) ? UP_ferror((FILE *)b->ptr) :
+ ferror((FILE *)b->ptr)) {
+ SYSerr(SYS_F_FREAD, get_last_sys_error());
+ BIOerr(BIO_F_FILE_READ, ERR_R_SYS_LIB);
+ ret = -1;
+ }
+ }
+ return (ret);
+}
+
+static int file_write(BIO *b, const char *in, int inl)
+{
+ int ret = 0;
+
+ if (b->init && (in != NULL)) {
+ if (b->flags & BIO_FLAGS_UPLINK)
+ ret = UP_fwrite(in, (int)inl, 1, b->ptr);
+ else
+ ret = fwrite(in, (int)inl, 1, (FILE *)b->ptr);
+ if (ret)
+ ret = inl;
+ /* ret=fwrite(in,1,(int)inl,(FILE *)b->ptr); */
+ /*
+ * according to Tim Hudson <tjh@cryptsoft.com>, the commented out
+ * version above can cause 'inl' write calls under some stupid stdio
+ * implementations (VMS)
+ */
+ }
+ return (ret);
+}
+
+static long file_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ long ret = 1;
+ FILE *fp = (FILE *)b->ptr;
+ FILE **fpp;
+ char p[4];
+ int st;
+
+ switch (cmd) {
+ case BIO_C_FILE_SEEK:
+ case BIO_CTRL_RESET:
+ if (b->flags & BIO_FLAGS_UPLINK)
+ ret = (long)UP_fseek(b->ptr, num, 0);
+ else
+ ret = (long)fseek(fp, num, 0);
+ break;
+ case BIO_CTRL_EOF:
+ if (b->flags & BIO_FLAGS_UPLINK)
+ ret = (long)UP_feof(fp);
+ else
+ ret = (long)feof(fp);
+ break;
+ case BIO_C_FILE_TELL:
+ case BIO_CTRL_INFO:
+ if (b->flags & BIO_FLAGS_UPLINK)
+ ret = UP_ftell(b->ptr);
+ else
+ ret = ftell(fp);
+ break;
+ case BIO_C_SET_FILE_PTR:
+ file_free(b);
+ b->shutdown = (int)num & BIO_CLOSE;
+ b->ptr = ptr;
+ b->init = 1;
+# if BIO_FLAGS_UPLINK!=0
+# if defined(__MINGW32__) && defined(__MSVCRT__) && !defined(_IOB_ENTRIES)
+# define _IOB_ENTRIES 20
+# endif
+ /* Safety net to catch purely internal BIO_set_fp calls */
+# if defined(_MSC_VER) && _MSC_VER>=1900
+ if (ptr == stdin || ptr == stdout || ptr == stderr)
+ BIO_clear_flags(b, BIO_FLAGS_UPLINK);
+# elif defined(_IOB_ENTRIES)
+ if ((size_t)ptr >= (size_t)stdin &&
+ (size_t)ptr < (size_t)(stdin + _IOB_ENTRIES))
+ BIO_clear_flags(b, BIO_FLAGS_UPLINK);
+# endif
+# endif
+# ifdef UP_fsetmod
+ if (b->flags & BIO_FLAGS_UPLINK)
+ UP_fsetmod(b->ptr, (char)((num & BIO_FP_TEXT) ? 't' : 'b'));
+ else
+# endif
+ {
+# if defined(OPENSSL_SYS_WINDOWS)
+ int fd = _fileno((FILE *)ptr);
+ if (num & BIO_FP_TEXT)
+ _setmode(fd, _O_TEXT);
+ else
+ _setmode(fd, _O_BINARY);
+# elif defined(OPENSSL_SYS_MSDOS)
+ int fd = fileno((FILE *)ptr);
+ /* Set correct text/binary mode */
+ if (num & BIO_FP_TEXT)
+ _setmode(fd, _O_TEXT);
+ /* Dangerous to set stdin/stdout to raw (unless redirected) */
+ else {
+ if (fd == STDIN_FILENO || fd == STDOUT_FILENO) {
+ if (isatty(fd) <= 0)
+ _setmode(fd, _O_BINARY);
+ } else
+ _setmode(fd, _O_BINARY);
+ }
+# elif defined(OPENSSL_SYS_WIN32_CYGWIN)
+ int fd = fileno((FILE *)ptr);
+ if (num & BIO_FP_TEXT)
+ setmode(fd, O_TEXT);
+ else
+ setmode(fd, O_BINARY);
+# endif
+ }
+ break;
+ case BIO_C_SET_FILENAME:
+ file_free(b);
+ b->shutdown = (int)num & BIO_CLOSE;
+ if (num & BIO_FP_APPEND) {
+ if (num & BIO_FP_READ)
+ OPENSSL_strlcpy(p, "a+", sizeof(p));
+ else
+ OPENSSL_strlcpy(p, "a", sizeof(p));
+ } else if ((num & BIO_FP_READ) && (num & BIO_FP_WRITE))
+ OPENSSL_strlcpy(p, "r+", sizeof(p));
+ else if (num & BIO_FP_WRITE)
+ OPENSSL_strlcpy(p, "w", sizeof(p));
+ else if (num & BIO_FP_READ)
+ OPENSSL_strlcpy(p, "r", sizeof(p));
+ else {
+ BIOerr(BIO_F_FILE_CTRL, BIO_R_BAD_FOPEN_MODE);
+ ret = 0;
+ break;
+ }
+# if defined(OPENSSL_SYS_MSDOS) || defined(OPENSSL_SYS_WINDOWS) || defined(OPENSSL_SYS_WIN32_CYGWIN)
+ if (!(num & BIO_FP_TEXT))
+ strcat(p, "b");
+ else
+ strcat(p, "t");
+# endif
+ fp = openssl_fopen(ptr, p);
+ if (fp == NULL) {
+ SYSerr(SYS_F_FOPEN, get_last_sys_error());
+ ERR_add_error_data(5, "fopen('", ptr, "','", p, "')");
+ BIOerr(BIO_F_FILE_CTRL, ERR_R_SYS_LIB);
+ ret = 0;
+ break;
+ }
+ b->ptr = fp;
+ b->init = 1;
+ BIO_clear_flags(b, BIO_FLAGS_UPLINK); /* we did fopen -> we disengage
+ * UPLINK */
+ break;
+ case BIO_C_GET_FILE_PTR:
+ /* the ptr parameter is actually a FILE ** in this case. */
+ if (ptr != NULL) {
+ fpp = (FILE **)ptr;
+ *fpp = (FILE *)b->ptr;
+ }
+ break;
+ case BIO_CTRL_GET_CLOSE:
+ ret = (long)b->shutdown;
+ break;
+ case BIO_CTRL_SET_CLOSE:
+ b->shutdown = (int)num;
+ break;
+ case BIO_CTRL_FLUSH:
+ st = b->flags & BIO_FLAGS_UPLINK
+ ? UP_fflush(b->ptr) : fflush((FILE *)b->ptr);
+ if (st == EOF) {
+ SYSerr(SYS_F_FFLUSH, get_last_sys_error());
+ ERR_add_error_data(1, "fflush()");
+ BIOerr(BIO_F_FILE_CTRL, ERR_R_SYS_LIB);
+ ret = 0;
+ }
+ break;
+ case BIO_CTRL_DUP:
+ ret = 1;
+ break;
+
+ case BIO_CTRL_WPENDING:
+ case BIO_CTRL_PENDING:
+ case BIO_CTRL_PUSH:
+ case BIO_CTRL_POP:
+ default:
+ ret = 0;
+ break;
+ }
+ return (ret);
+}
+
+static int file_gets(BIO *bp, char *buf, int size)
+{
+ int ret = 0;
+
+ buf[0] = '\0';
+ if (bp->flags & BIO_FLAGS_UPLINK) {
+ if (!UP_fgets(buf, size, bp->ptr))
+ goto err;
+ } else {
+ if (!fgets(buf, size, (FILE *)bp->ptr))
+ goto err;
+ }
+ if (buf[0] != '\0')
+ ret = strlen(buf);
+ err:
+ return (ret);
+}
+
+static int file_puts(BIO *bp, const char *str)
+{
+ int n, ret;
+
+ n = strlen(str);
+ ret = file_write(bp, str, n);
+ return (ret);
+}
+
+#else
+
+static int file_write(BIO *b, const char *in, int inl)
+{
+ return -1;
+}
+static int file_read(BIO *b, char *out, int outl)
+{
+ return -1;
+}
+static int file_puts(BIO *bp, const char *str)
+{
+ return -1;
+}
+static int file_gets(BIO *bp, char *buf, int size)
+{
+ return 0;
+}
+static long file_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ return 0;
+}
+static int file_new(BIO *bi)
+{
+ return 0;
+}
+static int file_free(BIO *a)
+{
+ return 0;
+}
+
+static const BIO_METHOD methods_filep = {
+ BIO_TYPE_FILE,
+ "FILE pointer",
+ file_write,
+ file_read,
+ file_puts,
+ file_gets,
+ file_ctrl,
+ file_new,
+ file_free,
+ NULL, /* file_callback_ctrl */
+};
+
+const BIO_METHOD *BIO_s_file(void)
+{
+ return (&methods_filep);
+}
+
+BIO *BIO_new_file(const char *filename, const char *mode)
+{
+ return NULL;
+}
+
+# endif /* OPENSSL_NO_STDIO */
+
+#endif /* HEADER_BSS_FILE_C */
diff --git a/openssl-1.1.0h/crypto/bio/bss_log.c b/openssl-1.1.0h/crypto/bio/bss_log.c
new file mode 100644
index 0000000..5221acc
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bss_log.c
@@ -0,0 +1,407 @@
+/*
+ * Copyright 1999-2017 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Why BIO_s_log?
+ *
+ * BIO_s_log is useful for system daemons (or services under NT). It is
+ * one-way BIO, it sends all stuff to syslogd (on system that commonly use
+ * that), or event log (on NT), or OPCOM (on OpenVMS).
+ *
+ */
+
+#include <stdio.h>
+#include <errno.h>
+
+#include "bio_lcl.h"
+#include "internal/cryptlib.h"
+
+#if defined(OPENSSL_SYS_WINCE)
+#elif defined(OPENSSL_SYS_WIN32)
+#elif defined(OPENSSL_SYS_VMS)
+# include <opcdef.h>
+# include <descrip.h>
+# include <lib$routines.h>
+# include <starlet.h>
+/* Some compiler options may mask the declaration of "_malloc32". */
+# if __INITIAL_POINTER_SIZE && defined _ANSI_C_SOURCE
+# if __INITIAL_POINTER_SIZE == 64
+# pragma pointer_size save
+# pragma pointer_size 32
+void *_malloc32(__size_t);
+# pragma pointer_size restore
+# endif /* __INITIAL_POINTER_SIZE == 64 */
+# endif /* __INITIAL_POINTER_SIZE && defined
+ * _ANSI_C_SOURCE */
+#elif defined(OPENSSL_SYS_NETWARE)
+# define NO_SYSLOG
+#elif (!defined(MSDOS) || defined(WATT32)) && !defined(OPENSSL_SYS_VXWORKS) && !defined(NO_SYSLOG)
+# include <syslog.h>
+#endif
+
+#include <openssl/buffer.h>
+#include <openssl/err.h>
+
+#ifndef NO_SYSLOG
+
+# if defined(OPENSSL_SYS_WIN32)
+# define LOG_EMERG 0
+# define LOG_ALERT 1
+# define LOG_CRIT 2
+# define LOG_ERR 3
+# define LOG_WARNING 4
+# define LOG_NOTICE 5
+# define LOG_INFO 6
+# define LOG_DEBUG 7
+
+# define LOG_DAEMON (3<<3)
+# elif defined(OPENSSL_SYS_VMS)
+/* On VMS, we don't really care about these, but we need them to compile */
+# define LOG_EMERG 0
+# define LOG_ALERT 1
+# define LOG_CRIT 2
+# define LOG_ERR 3
+# define LOG_WARNING 4
+# define LOG_NOTICE 5
+# define LOG_INFO 6
+# define LOG_DEBUG 7
+
+# define LOG_DAEMON OPC$M_NM_NTWORK
+# endif
+
+static int slg_write(BIO *h, const char *buf, int num);
+static int slg_puts(BIO *h, const char *str);
+static long slg_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int slg_new(BIO *h);
+static int slg_free(BIO *data);
+static void xopenlog(BIO *bp, char *name, int level);
+static void xsyslog(BIO *bp, int priority, const char *string);
+static void xcloselog(BIO *bp);
+
+static const BIO_METHOD methods_slg = {
+ BIO_TYPE_MEM,
+ "syslog",
+ slg_write,
+ NULL, /* slg_read, */
+ slg_puts,
+ NULL, /* slg_gets, */
+ slg_ctrl,
+ slg_new,
+ slg_free,
+ NULL, /* slg_callback_ctrl */
+};
+
+const BIO_METHOD *BIO_s_log(void)
+{
+ return (&methods_slg);
+}
+
+static int slg_new(BIO *bi)
+{
+ bi->init = 1;
+ bi->num = 0;
+ bi->ptr = NULL;
+ xopenlog(bi, "application", LOG_DAEMON);
+ return (1);
+}
+
+static int slg_free(BIO *a)
+{
+ if (a == NULL)
+ return (0);
+ xcloselog(a);
+ return (1);
+}
+
+static int slg_write(BIO *b, const char *in, int inl)
+{
+ int ret = inl;
+ char *buf;
+ char *pp;
+ int priority, i;
+ static const struct {
+ int strl;
+ char str[10];
+ int log_level;
+ } mapping[] = {
+ {
+ 6, "PANIC ", LOG_EMERG
+ },
+ {
+ 6, "EMERG ", LOG_EMERG
+ },
+ {
+ 4, "EMR ", LOG_EMERG
+ },
+ {
+ 6, "ALERT ", LOG_ALERT
+ },
+ {
+ 4, "ALR ", LOG_ALERT
+ },
+ {
+ 5, "CRIT ", LOG_CRIT
+ },
+ {
+ 4, "CRI ", LOG_CRIT
+ },
+ {
+ 6, "ERROR ", LOG_ERR
+ },
+ {
+ 4, "ERR ", LOG_ERR
+ },
+ {
+ 8, "WARNING ", LOG_WARNING
+ },
+ {
+ 5, "WARN ", LOG_WARNING
+ },
+ {
+ 4, "WAR ", LOG_WARNING
+ },
+ {
+ 7, "NOTICE ", LOG_NOTICE
+ },
+ {
+ 5, "NOTE ", LOG_NOTICE
+ },
+ {
+ 4, "NOT ", LOG_NOTICE
+ },
+ {
+ 5, "INFO ", LOG_INFO
+ },
+ {
+ 4, "INF ", LOG_INFO
+ },
+ {
+ 6, "DEBUG ", LOG_DEBUG
+ },
+ {
+ 4, "DBG ", LOG_DEBUG
+ },
+ {
+ 0, "", LOG_ERR
+ }
+ /* The default */
+ };
+
+ if ((buf = OPENSSL_malloc(inl + 1)) == NULL) {
+ return (0);
+ }
+ strncpy(buf, in, inl);
+ buf[inl] = '\0';
+
+ i = 0;
+ while (strncmp(buf, mapping[i].str, mapping[i].strl) != 0)
+ i++;
+ priority = mapping[i].log_level;
+ pp = buf + mapping[i].strl;
+
+ xsyslog(b, priority, pp);
+
+ OPENSSL_free(buf);
+ return (ret);
+}
+
+static long slg_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ switch (cmd) {
+ case BIO_CTRL_SET:
+ xcloselog(b);
+ xopenlog(b, ptr, num);
+ break;
+ default:
+ break;
+ }
+ return (0);
+}
+
+static int slg_puts(BIO *bp, const char *str)
+{
+ int n, ret;
+
+ n = strlen(str);
+ ret = slg_write(bp, str, n);
+ return (ret);
+}
+
+# if defined(OPENSSL_SYS_WIN32)
+
+static void xopenlog(BIO *bp, char *name, int level)
+{
+ if (check_winnt())
+ bp->ptr = RegisterEventSourceA(NULL, name);
+ else
+ bp->ptr = NULL;
+}
+
+static void xsyslog(BIO *bp, int priority, const char *string)
+{
+ LPCSTR lpszStrings[2];
+ WORD evtype = EVENTLOG_ERROR_TYPE;
+ char pidbuf[DECIMAL_SIZE(DWORD) + 4];
+
+ if (bp->ptr == NULL)
+ return;
+
+ switch (priority) {
+ case LOG_EMERG:
+ case LOG_ALERT:
+ case LOG_CRIT:
+ case LOG_ERR:
+ evtype = EVENTLOG_ERROR_TYPE;
+ break;
+ case LOG_WARNING:
+ evtype = EVENTLOG_WARNING_TYPE;
+ break;
+ case LOG_NOTICE:
+ case LOG_INFO:
+ case LOG_DEBUG:
+ evtype = EVENTLOG_INFORMATION_TYPE;
+ break;
+ default:
+ /*
+ * Should never happen, but set it
+ * as error anyway.
+ */
+ evtype = EVENTLOG_ERROR_TYPE;
+ break;
+ }
+
+ sprintf(pidbuf, "[%lu] ", GetCurrentProcessId());
+ lpszStrings[0] = pidbuf;
+ lpszStrings[1] = string;
+
+ ReportEventA(bp->ptr, evtype, 0, 1024, NULL, 2, 0, lpszStrings, NULL);
+}
+
+static void xcloselog(BIO *bp)
+{
+ if (bp->ptr)
+ DeregisterEventSource((HANDLE) (bp->ptr));
+ bp->ptr = NULL;
+}
+
+# elif defined(OPENSSL_SYS_VMS)
+
+static int VMS_OPC_target = LOG_DAEMON;
+
+static void xopenlog(BIO *bp, char *name, int level)
+{
+ VMS_OPC_target = level;
+}
+
+static void xsyslog(BIO *bp, int priority, const char *string)
+{
+ struct dsc$descriptor_s opc_dsc;
+
+/* Arrange 32-bit pointer to opcdef buffer and malloc(), if needed. */
+# if __INITIAL_POINTER_SIZE == 64
+# pragma pointer_size save
+# pragma pointer_size 32
+# define OPCDEF_TYPE __char_ptr32
+# define OPCDEF_MALLOC _malloc32
+# else /* __INITIAL_POINTER_SIZE == 64 */
+# define OPCDEF_TYPE char *
+# define OPCDEF_MALLOC OPENSSL_malloc
+# endif /* __INITIAL_POINTER_SIZE == 64 [else] */
+
+ struct opcdef *opcdef_p;
+
+# if __INITIAL_POINTER_SIZE == 64
+# pragma pointer_size restore
+# endif /* __INITIAL_POINTER_SIZE == 64 */
+
+ char buf[10240];
+ unsigned int len;
+ struct dsc$descriptor_s buf_dsc;
+ $DESCRIPTOR(fao_cmd, "!AZ: !AZ");
+ char *priority_tag;
+
+ switch (priority) {
+ case LOG_EMERG:
+ priority_tag = "Emergency";
+ break;
+ case LOG_ALERT:
+ priority_tag = "Alert";
+ break;
+ case LOG_CRIT:
+ priority_tag = "Critical";
+ break;
+ case LOG_ERR:
+ priority_tag = "Error";
+ break;
+ case LOG_WARNING:
+ priority_tag = "Warning";
+ break;
+ case LOG_NOTICE:
+ priority_tag = "Notice";
+ break;
+ case LOG_INFO:
+ priority_tag = "Info";
+ break;
+ case LOG_DEBUG:
+ priority_tag = "DEBUG";
+ break;
+ }
+
+ buf_dsc.dsc$b_dtype = DSC$K_DTYPE_T;
+ buf_dsc.dsc$b_class = DSC$K_CLASS_S;
+ buf_dsc.dsc$a_pointer = buf;
+ buf_dsc.dsc$w_length = sizeof(buf) - 1;
+
+ lib$sys_fao(&fao_cmd, &len, &buf_dsc, priority_tag, string);
+
+ /* We know there's an 8-byte header. That's documented. */
+ opcdef_p = OPCDEF_MALLOC(8 + len);
+ opcdef_p->opc$b_ms_type = OPC$_RQ_RQST;
+ memcpy(opcdef_p->opc$z_ms_target_classes, &VMS_OPC_target, 3);
+ opcdef_p->opc$l_ms_rqstid = 0;
+ memcpy(&opcdef_p->opc$l_ms_text, buf, len);
+
+ opc_dsc.dsc$b_dtype = DSC$K_DTYPE_T;
+ opc_dsc.dsc$b_class = DSC$K_CLASS_S;
+ opc_dsc.dsc$a_pointer = (OPCDEF_TYPE) opcdef_p;
+ opc_dsc.dsc$w_length = len + 8;
+
+ sys$sndopr(opc_dsc, 0);
+
+ OPENSSL_free(opcdef_p);
+}
+
+static void xcloselog(BIO *bp)
+{
+}
+
+# else /* Unix/Watt32 */
+
+static void xopenlog(BIO *bp, char *name, int level)
+{
+# ifdef WATT32 /* djgpp/DOS */
+ openlog(name, LOG_PID | LOG_CONS | LOG_NDELAY, level);
+# else
+ openlog(name, LOG_PID | LOG_CONS, level);
+# endif
+}
+
+static void xsyslog(BIO *bp, int priority, const char *string)
+{
+ syslog(priority, "%s", string);
+}
+
+static void xcloselog(BIO *bp)
+{
+ closelog();
+}
+
+# endif /* Unix */
+
+#endif /* NO_SYSLOG */
diff --git a/openssl-1.1.0h/crypto/bio/bss_mem.c b/openssl-1.1.0h/crypto/bio/bss_mem.c
new file mode 100644
index 0000000..ff9a3eb
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bss_mem.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "bio_lcl.h"
+#include "internal/cryptlib.h"
+
+static int mem_write(BIO *h, const char *buf, int num);
+static int mem_read(BIO *h, char *buf, int size);
+static int mem_puts(BIO *h, const char *str);
+static int mem_gets(BIO *h, char *str, int size);
+static long mem_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int mem_new(BIO *h);
+static int secmem_new(BIO *h);
+static int mem_free(BIO *data);
+static int mem_buf_free(BIO *data, int free_all);
+static int mem_buf_sync(BIO *h);
+
+static const BIO_METHOD mem_method = {
+ BIO_TYPE_MEM,
+ "memory buffer",
+ mem_write,
+ mem_read,
+ mem_puts,
+ mem_gets,
+ mem_ctrl,
+ mem_new,
+ mem_free,
+ NULL, /* mem_callback_ctrl */
+};
+
+static const BIO_METHOD secmem_method = {
+ BIO_TYPE_MEM,
+ "secure memory buffer",
+ mem_write,
+ mem_read,
+ mem_puts,
+ mem_gets,
+ mem_ctrl,
+ secmem_new,
+ mem_free,
+ NULL, /* mem_callback_ctrl */
+};
+
+/* BIO memory stores buffer and read pointer */
+typedef struct bio_buf_mem_st {
+ struct buf_mem_st *buf; /* allocated buffer */
+ struct buf_mem_st *readp; /* read pointer */
+} BIO_BUF_MEM;
+
+/*
+ * bio->num is used to hold the value to return on 'empty', if it is 0,
+ * should_retry is not set
+ */
+
+const BIO_METHOD *BIO_s_mem(void)
+{
+ return (&mem_method);
+}
+
+const BIO_METHOD *BIO_s_secmem(void)
+{
+ return(&secmem_method);
+}
+
+BIO *BIO_new_mem_buf(const void *buf, int len)
+{
+ BIO *ret;
+ BUF_MEM *b;
+ BIO_BUF_MEM *bb;
+ size_t sz;
+
+ if (buf == NULL) {
+ BIOerr(BIO_F_BIO_NEW_MEM_BUF, BIO_R_NULL_PARAMETER);
+ return NULL;
+ }
+ sz = (len < 0) ? strlen(buf) : (size_t)len;
+ if ((ret = BIO_new(BIO_s_mem())) == NULL)
+ return NULL;
+ bb = (BIO_BUF_MEM *)ret->ptr;
+ b = bb->buf;
+ /* Cast away const and trust in the MEM_RDONLY flag. */
+ b->data = (void *)buf;
+ b->length = sz;
+ b->max = sz;
+ *bb->readp = *bb->buf;
+ ret->flags |= BIO_FLAGS_MEM_RDONLY;
+ /* Since this is static data retrying won't help */
+ ret->num = 0;
+ return ret;
+}
+
+static int mem_init(BIO *bi, unsigned long flags)
+{
+ BIO_BUF_MEM *bb = OPENSSL_zalloc(sizeof(*bb));
+
+ if (bb == NULL)
+ return 0;
+ if ((bb->buf = BUF_MEM_new_ex(flags)) == NULL) {
+ OPENSSL_free(bb);
+ return 0;
+ }
+ if ((bb->readp = OPENSSL_zalloc(sizeof(*bb->readp))) == NULL) {
+ BUF_MEM_free(bb->buf);
+ OPENSSL_free(bb);
+ return 0;
+ }
+ *bb->readp = *bb->buf;
+ bi->shutdown = 1;
+ bi->init = 1;
+ bi->num = -1;
+ bi->ptr = (char *)bb;
+ return 1;
+}
+
+static int mem_new(BIO *bi)
+{
+ return (mem_init(bi, 0L));
+}
+
+static int secmem_new(BIO *bi)
+{
+ return (mem_init(bi, BUF_MEM_FLAG_SECURE));
+}
+
+static int mem_free(BIO *a)
+{
+ return (mem_buf_free(a, 1));
+}
+
+static int mem_buf_free(BIO *a, int free_all)
+{
+ if (a == NULL)
+ return (0);
+ if (a->shutdown) {
+ if ((a->init) && (a->ptr != NULL)) {
+ BUF_MEM *b;
+ BIO_BUF_MEM *bb = (BIO_BUF_MEM *)a->ptr;
+
+ if (bb != NULL) {
+ b = bb->buf;
+ if (a->flags & BIO_FLAGS_MEM_RDONLY)
+ b->data = NULL;
+ BUF_MEM_free(b);
+ if (free_all) {
+ OPENSSL_free(bb->readp);
+ OPENSSL_free(bb);
+ }
+ }
+ a->ptr = NULL;
+ }
+ }
+ return (1);
+}
+
+/*
+ * Reallocate memory buffer if read pointer differs
+ */
+static int mem_buf_sync(BIO *b)
+{
+ if (b != NULL && b->init != 0 && b->ptr != NULL) {
+ BIO_BUF_MEM *bbm = (BIO_BUF_MEM *)b->ptr;
+
+ if (bbm->readp->data != bbm->buf->data) {
+ memmove(bbm->buf->data, bbm->readp->data, bbm->readp->length);
+ bbm->buf->length = bbm->readp->length;
+ bbm->readp->data = bbm->buf->data;
+ }
+ }
+ return (0);
+}
+
+static int mem_read(BIO *b, char *out, int outl)
+{
+ int ret = -1;
+ BIO_BUF_MEM *bbm = (BIO_BUF_MEM *)b->ptr;
+ BUF_MEM *bm = bbm->readp;
+
+ BIO_clear_retry_flags(b);
+ ret = (outl >= 0 && (size_t)outl > bm->length) ? (int)bm->length : outl;
+ if ((out != NULL) && (ret > 0)) {
+ memcpy(out, bm->data, ret);
+ bm->length -= ret;
+ bm->data += ret;
+ } else if (bm->length == 0) {
+ ret = b->num;
+ if (ret != 0)
+ BIO_set_retry_read(b);
+ }
+ return (ret);
+}
+
+static int mem_write(BIO *b, const char *in, int inl)
+{
+ int ret = -1;
+ int blen;
+ BIO_BUF_MEM *bbm = (BIO_BUF_MEM *)b->ptr;
+
+ if (in == NULL) {
+ BIOerr(BIO_F_MEM_WRITE, BIO_R_NULL_PARAMETER);
+ goto end;
+ }
+ if (b->flags & BIO_FLAGS_MEM_RDONLY) {
+ BIOerr(BIO_F_MEM_WRITE, BIO_R_WRITE_TO_READ_ONLY_BIO);
+ goto end;
+ }
+ BIO_clear_retry_flags(b);
+ blen = bbm->readp->length;
+ mem_buf_sync(b);
+ if (BUF_MEM_grow_clean(bbm->buf, blen + inl) == 0)
+ goto end;
+ memcpy(bbm->buf->data + blen, in, inl);
+ *bbm->readp = *bbm->buf;
+ ret = inl;
+ end:
+ return (ret);
+}
+
+static long mem_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ long ret = 1;
+ char **pptr;
+ BIO_BUF_MEM *bbm = (BIO_BUF_MEM *)b->ptr;
+ BUF_MEM *bm;
+
+ switch (cmd) {
+ case BIO_CTRL_RESET:
+ bm = bbm->buf;
+ if (bm->data != NULL) {
+ /* For read only case reset to the start again */
+ if ((b->flags & BIO_FLAGS_MEM_RDONLY) || (b->flags & BIO_FLAGS_NONCLEAR_RST)) {
+ bm->length = bm->max;
+ } else {
+ memset(bm->data, 0, bm->max);
+ bm->length = 0;
+ }
+ *bbm->readp = *bbm->buf;
+ }
+ break;
+ case BIO_CTRL_EOF:
+ bm = bbm->readp;
+ ret = (long)(bm->length == 0);
+ break;
+ case BIO_C_SET_BUF_MEM_EOF_RETURN:
+ b->num = (int)num;
+ break;
+ case BIO_CTRL_INFO:
+ bm = bbm->readp;
+ ret = (long)bm->length;
+ if (ptr != NULL) {
+ pptr = (char **)ptr;
+ *pptr = (char *)&(bm->data[0]);
+ }
+ break;
+ case BIO_C_SET_BUF_MEM:
+ mem_buf_free(b, 0);
+ b->shutdown = (int)num;
+ bbm->buf = ptr;
+ *bbm->readp = *bbm->buf;
+ b->ptr = bbm;
+ break;
+ case BIO_C_GET_BUF_MEM_PTR:
+ if (ptr != NULL) {
+ mem_buf_sync(b);
+ bm = bbm->readp;
+ pptr = (char **)ptr;
+ *pptr = (char *)bm;
+ }
+ break;
+ case BIO_CTRL_GET_CLOSE:
+ ret = (long)b->shutdown;
+ break;
+ case BIO_CTRL_SET_CLOSE:
+ b->shutdown = (int)num;
+ break;
+ case BIO_CTRL_WPENDING:
+ ret = 0L;
+ break;
+ case BIO_CTRL_PENDING:
+ bm = bbm->readp;
+ ret = (long)bm->length;
+ break;
+ case BIO_CTRL_DUP:
+ case BIO_CTRL_FLUSH:
+ ret = 1;
+ break;
+ case BIO_CTRL_PUSH:
+ case BIO_CTRL_POP:
+ default:
+ ret = 0;
+ break;
+ }
+ return (ret);
+}
+
+static int mem_gets(BIO *bp, char *buf, int size)
+{
+ int i, j;
+ int ret = -1;
+ char *p;
+ BIO_BUF_MEM *bbm = (BIO_BUF_MEM *)bp->ptr;
+ BUF_MEM *bm = bbm->readp;
+
+ BIO_clear_retry_flags(bp);
+ j = bm->length;
+ if ((size - 1) < j)
+ j = size - 1;
+ if (j <= 0) {
+ *buf = '\0';
+ return 0;
+ }
+ p = bm->data;
+ for (i = 0; i < j; i++) {
+ if (p[i] == '\n') {
+ i++;
+ break;
+ }
+ }
+
+ /*
+ * i is now the max num of bytes to copy, either j or up to
+ * and including the first newline
+ */
+
+ i = mem_read(bp, buf, i);
+ if (i > 0)
+ buf[i] = '\0';
+ ret = i;
+ return (ret);
+}
+
+static int mem_puts(BIO *bp, const char *str)
+{
+ int n, ret;
+
+ n = strlen(str);
+ ret = mem_write(bp, str, n);
+ /* memory semantics is that it will always work */
+ return (ret);
+}
diff --git a/openssl-1.1.0h/crypto/bio/bss_null.c b/openssl-1.1.0h/crypto/bio/bss_null.c
new file mode 100644
index 0000000..56f95f9
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bss_null.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "bio_lcl.h"
+#include "internal/cryptlib.h"
+
+static int null_write(BIO *h, const char *buf, int num);
+static int null_read(BIO *h, char *buf, int size);
+static int null_puts(BIO *h, const char *str);
+static int null_gets(BIO *h, char *str, int size);
+static long null_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static const BIO_METHOD null_method = {
+ BIO_TYPE_NULL,
+ "NULL",
+ null_write,
+ null_read,
+ null_puts,
+ null_gets,
+ null_ctrl,
+ NULL,
+ NULL,
+ NULL, /* null_callback_ctrl */
+};
+
+const BIO_METHOD *BIO_s_null(void)
+{
+ return (&null_method);
+}
+
+static int null_read(BIO *b, char *out, int outl)
+{
+ return (0);
+}
+
+static int null_write(BIO *b, const char *in, int inl)
+{
+ return (inl);
+}
+
+static long null_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ long ret = 1;
+
+ switch (cmd) {
+ case BIO_CTRL_RESET:
+ case BIO_CTRL_EOF:
+ case BIO_CTRL_SET:
+ case BIO_CTRL_SET_CLOSE:
+ case BIO_CTRL_FLUSH:
+ case BIO_CTRL_DUP:
+ ret = 1;
+ break;
+ case BIO_CTRL_GET_CLOSE:
+ case BIO_CTRL_INFO:
+ case BIO_CTRL_GET:
+ case BIO_CTRL_PENDING:
+ case BIO_CTRL_WPENDING:
+ default:
+ ret = 0;
+ break;
+ }
+ return (ret);
+}
+
+static int null_gets(BIO *bp, char *buf, int size)
+{
+ return (0);
+}
+
+static int null_puts(BIO *bp, const char *str)
+{
+ if (str == NULL)
+ return (0);
+ return (strlen(str));
+}
diff --git a/openssl-1.1.0h/crypto/bio/bss_sock.c b/openssl-1.1.0h/crypto/bio/bss_sock.c
new file mode 100644
index 0000000..992266d
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/bss_sock.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#define USE_SOCKETS
+#include "bio_lcl.h"
+#include "internal/cryptlib.h"
+
+#ifndef OPENSSL_NO_SOCK
+
+# include <openssl/bio.h>
+
+# ifdef WATT32
+/* Watt-32 uses same names */
+# undef sock_write
+# undef sock_read
+# undef sock_puts
+# define sock_write SockWrite
+# define sock_read SockRead
+# define sock_puts SockPuts
+# endif
+
+static int sock_write(BIO *h, const char *buf, int num);
+static int sock_read(BIO *h, char *buf, int size);
+static int sock_puts(BIO *h, const char *str);
+static long sock_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int sock_new(BIO *h);
+static int sock_free(BIO *data);
+int BIO_sock_should_retry(int s);
+
+static const BIO_METHOD methods_sockp = {
+ BIO_TYPE_SOCKET,
+ "socket",
+ sock_write,
+ sock_read,
+ sock_puts,
+ NULL, /* sock_gets, */
+ sock_ctrl,
+ sock_new,
+ sock_free,
+ NULL, /* sock_callback_ctrl */
+};
+
+const BIO_METHOD *BIO_s_socket(void)
+{
+ return (&methods_sockp);
+}
+
+BIO *BIO_new_socket(int fd, int close_flag)
+{
+ BIO *ret;
+
+ ret = BIO_new(BIO_s_socket());
+ if (ret == NULL)
+ return (NULL);
+ BIO_set_fd(ret, fd, close_flag);
+ return (ret);
+}
+
+static int sock_new(BIO *bi)
+{
+ bi->init = 0;
+ bi->num = 0;
+ bi->ptr = NULL;
+ bi->flags = 0;
+ return (1);
+}
+
+static int sock_free(BIO *a)
+{
+ if (a == NULL)
+ return (0);
+ if (a->shutdown) {
+ if (a->init) {
+ BIO_closesocket(a->num);
+ }
+ a->init = 0;
+ a->flags = 0;
+ }
+ return (1);
+}
+
+static int sock_read(BIO *b, char *out, int outl)
+{
+ int ret = 0;
+
+ if (out != NULL) {
+ clear_socket_error();
+ ret = readsocket(b->num, out, outl);
+ BIO_clear_retry_flags(b);
+ if (ret <= 0) {
+ if (BIO_sock_should_retry(ret))
+ BIO_set_retry_read(b);
+ }
+ }
+ return (ret);
+}
+
+static int sock_write(BIO *b, const char *in, int inl)
+{
+ int ret;
+
+ clear_socket_error();
+ ret = writesocket(b->num, in, inl);
+ BIO_clear_retry_flags(b);
+ if (ret <= 0) {
+ if (BIO_sock_should_retry(ret))
+ BIO_set_retry_write(b);
+ }
+ return (ret);
+}
+
+static long sock_ctrl(BIO *b, int cmd, long num, void *ptr)
+{
+ long ret = 1;
+ int *ip;
+
+ switch (cmd) {
+ case BIO_C_SET_FD:
+ sock_free(b);
+ b->num = *((int *)ptr);
+ b->shutdown = (int)num;
+ b->init = 1;
+ break;
+ case BIO_C_GET_FD:
+ if (b->init) {
+ ip = (int *)ptr;
+ if (ip != NULL)
+ *ip = b->num;
+ ret = b->num;
+ } else
+ ret = -1;
+ break;
+ case BIO_CTRL_GET_CLOSE:
+ ret = b->shutdown;
+ break;
+ case BIO_CTRL_SET_CLOSE:
+ b->shutdown = (int)num;
+ break;
+ case BIO_CTRL_DUP:
+ case BIO_CTRL_FLUSH:
+ ret = 1;
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ return (ret);
+}
+
+static int sock_puts(BIO *bp, const char *str)
+{
+ int n, ret;
+
+ n = strlen(str);
+ ret = sock_write(bp, str, n);
+ return (ret);
+}
+
+int BIO_sock_should_retry(int i)
+{
+ int err;
+
+ if ((i == 0) || (i == -1)) {
+ err = get_last_socket_error();
+
+ return (BIO_sock_non_fatal_error(err));
+ }
+ return (0);
+}
+
+int BIO_sock_non_fatal_error(int err)
+{
+ switch (err) {
+# if defined(OPENSSL_SYS_WINDOWS)
+# if defined(WSAEWOULDBLOCK)
+ case WSAEWOULDBLOCK:
+# endif
+# endif
+
+# ifdef EWOULDBLOCK
+# ifdef WSAEWOULDBLOCK
+# if WSAEWOULDBLOCK != EWOULDBLOCK
+ case EWOULDBLOCK:
+# endif
+# else
+ case EWOULDBLOCK:
+# endif
+# endif
+
+# if defined(ENOTCONN)
+ case ENOTCONN:
+# endif
+
+# ifdef EINTR
+ case EINTR:
+# endif
+
+# ifdef EAGAIN
+# if EWOULDBLOCK != EAGAIN
+ case EAGAIN:
+# endif
+# endif
+
+# ifdef EPROTO
+ case EPROTO:
+# endif
+
+# ifdef EINPROGRESS
+ case EINPROGRESS:
+# endif
+
+# ifdef EALREADY
+ case EALREADY:
+# endif
+ return (1);
+ /* break; */
+ default:
+ break;
+ }
+ return (0);
+}
+
+#endif /* #ifndef OPENSSL_NO_SOCK */
diff --git a/openssl-1.1.0h/crypto/bio/build.info b/openssl-1.1.0h/crypto/bio/build.info
new file mode 100644
index 0000000..d1e7d73
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bio/build.info
@@ -0,0 +1,8 @@
+LIBS=../../libcrypto
+SOURCE[../../libcrypto]=\
+ bio_lib.c bio_cb.c bio_err.c \
+ bss_mem.c bss_null.c bss_fd.c \
+ bss_file.c bss_sock.c bss_conn.c \
+ bf_null.c bf_buff.c b_print.c b_dump.c b_addr.c \
+ b_sock.c b_sock2.c bss_acpt.c bf_nbio.c bss_log.c bss_bio.c \
+ bss_dgram.c bio_meth.c bf_lbuf.c
diff --git a/openssl-1.1.0h/crypto/blake2/blake2_impl.h b/openssl-1.1.0h/crypto/blake2/blake2_impl.h
new file mode 100644
index 0000000..8fe5c95
--- /dev/null
+++ b/openssl-1.1.0h/crypto/blake2/blake2_impl.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Derived from the BLAKE2 reference implementation written by Samuel Neves.
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ * More information about the BLAKE2 hash function and its implementations
+ * can be found at https://blake2.net.
+ */
+
+#include <string.h>
+#include "e_os.h"
+
+static ossl_inline uint32_t load32(const uint8_t *src)
+{
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+
+ if (is_endian.little) {
+ uint32_t w;
+ memcpy(&w, src, sizeof(w));
+ return w;
+ } else {
+ uint32_t w = ((uint32_t)src[0])
+ | ((uint32_t)src[1] << 8)
+ | ((uint32_t)src[2] << 16)
+ | ((uint32_t)src[3] << 24);
+ return w;
+ }
+}
+
+static ossl_inline uint64_t load64(const uint8_t *src)
+{
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+
+ if (is_endian.little) {
+ uint64_t w;
+ memcpy(&w, src, sizeof(w));
+ return w;
+ } else {
+ uint64_t w = ((uint64_t)src[0])
+ | ((uint64_t)src[1] << 8)
+ | ((uint64_t)src[2] << 16)
+ | ((uint64_t)src[3] << 24)
+ | ((uint64_t)src[4] << 32)
+ | ((uint64_t)src[5] << 40)
+ | ((uint64_t)src[6] << 48)
+ | ((uint64_t)src[7] << 56);
+ return w;
+ }
+}
+
+static ossl_inline void store32(uint8_t *dst, uint32_t w)
+{
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+
+ if (is_endian.little) {
+ memcpy(dst, &w, sizeof(w));
+ } else {
+ uint8_t *p = (uint8_t *)dst;
+ int i;
+
+ for (i = 0; i < 4; i++)
+ p[i] = (uint8_t)(w >> (8 * i));
+ }
+}
+
+static ossl_inline void store64(uint8_t *dst, uint64_t w)
+{
+ const union {
+ long one;
+ char little;
+ } is_endian = { 1 };
+
+ if (is_endian.little) {
+ memcpy(dst, &w, sizeof(w));
+ } else {
+ uint8_t *p = (uint8_t *)dst;
+ int i;
+
+ for (i = 0; i < 8; i++)
+ p[i] = (uint8_t)(w >> (8 * i));
+ }
+}
+
+static ossl_inline uint64_t load48(const uint8_t *src)
+{
+ uint64_t w = ((uint64_t)src[0])
+ | ((uint64_t)src[1] << 8)
+ | ((uint64_t)src[2] << 16)
+ | ((uint64_t)src[3] << 24)
+ | ((uint64_t)src[4] << 32)
+ | ((uint64_t)src[5] << 40);
+ return w;
+}
+
+static ossl_inline void store48(uint8_t *dst, uint64_t w)
+{
+ uint8_t *p = (uint8_t *)dst;
+ p[0] = (uint8_t)w;
+ p[1] = (uint8_t)(w>>8);
+ p[2] = (uint8_t)(w>>16);
+ p[3] = (uint8_t)(w>>24);
+ p[4] = (uint8_t)(w>>32);
+ p[5] = (uint8_t)(w>>40);
+}
+
+static ossl_inline uint32_t rotr32(const uint32_t w, const unsigned int c)
+{
+ return (w >> c) | (w << (32 - c));
+}
+
+static ossl_inline uint64_t rotr64(const uint64_t w, const unsigned int c)
+{
+ return (w >> c) | (w << (64 - c));
+}
diff --git a/openssl-1.1.0h/crypto/blake2/blake2_locl.h b/openssl-1.1.0h/crypto/blake2/blake2_locl.h
new file mode 100644
index 0000000..fb7beb9
--- /dev/null
+++ b/openssl-1.1.0h/crypto/blake2/blake2_locl.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Derived from the BLAKE2 reference implementation written by Samuel Neves.
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ * More information about the BLAKE2 hash function and its implementations
+ * can be found at https://blake2.net.
+ */
+
+#include <stddef.h>
+#include "e_os.h"
+
+#define BLAKE2S_BLOCKBYTES 64
+#define BLAKE2S_OUTBYTES 32
+#define BLAKE2S_KEYBYTES 32
+#define BLAKE2S_SALTBYTES 8
+#define BLAKE2S_PERSONALBYTES 8
+
+#define BLAKE2B_BLOCKBYTES 128
+#define BLAKE2B_OUTBYTES 64
+#define BLAKE2B_KEYBYTES 64
+#define BLAKE2B_SALTBYTES 16
+#define BLAKE2B_PERSONALBYTES 16
+
+struct blake2s_param_st {
+ uint8_t digest_length; /* 1 */
+ uint8_t key_length; /* 2 */
+ uint8_t fanout; /* 3 */
+ uint8_t depth; /* 4 */
+ uint8_t leaf_length[4];/* 8 */
+ uint8_t node_offset[6];/* 14 */
+ uint8_t node_depth; /* 15 */
+ uint8_t inner_length; /* 16 */
+ uint8_t salt[BLAKE2S_SALTBYTES]; /* 24 */
+ uint8_t personal[BLAKE2S_PERSONALBYTES]; /* 32 */
+};
+
+typedef struct blake2s_param_st BLAKE2S_PARAM;
+
+struct blake2s_ctx_st {
+ uint32_t h[8];
+ uint32_t t[2];
+ uint32_t f[2];
+ uint8_t buf[BLAKE2S_BLOCKBYTES];
+ size_t buflen;
+};
+
+struct blake2b_param_st {
+ uint8_t digest_length; /* 1 */
+ uint8_t key_length; /* 2 */
+ uint8_t fanout; /* 3 */
+ uint8_t depth; /* 4 */
+ uint8_t leaf_length[4];/* 8 */
+ uint8_t node_offset[8];/* 16 */
+ uint8_t node_depth; /* 17 */
+ uint8_t inner_length; /* 18 */
+ uint8_t reserved[14]; /* 32 */
+ uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */
+ uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
+};
+
+typedef struct blake2b_param_st BLAKE2B_PARAM;
+
+struct blake2b_ctx_st {
+ uint64_t h[8];
+ uint64_t t[2];
+ uint64_t f[2];
+ uint8_t buf[BLAKE2B_BLOCKBYTES];
+ size_t buflen;
+};
+
+#define BLAKE2B_DIGEST_LENGTH 64
+#define BLAKE2S_DIGEST_LENGTH 32
+
+typedef struct blake2s_ctx_st BLAKE2S_CTX;
+typedef struct blake2b_ctx_st BLAKE2B_CTX;
+
+int BLAKE2b_Init(BLAKE2B_CTX *c);
+int BLAKE2b_Update(BLAKE2B_CTX *c, const void *data, size_t datalen);
+int BLAKE2b_Final(unsigned char *md, BLAKE2B_CTX *c);
+
+int BLAKE2s_Init(BLAKE2S_CTX *c);
+int BLAKE2s_Update(BLAKE2S_CTX *c, const void *data, size_t datalen);
+int BLAKE2s_Final(unsigned char *md, BLAKE2S_CTX *c);
diff --git a/openssl-1.1.0h/crypto/blake2/blake2b.c b/openssl-1.1.0h/crypto/blake2/blake2b.c
new file mode 100644
index 0000000..e77bd9a
--- /dev/null
+++ b/openssl-1.1.0h/crypto/blake2/blake2b.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Derived from the BLAKE2 reference implementation written by Samuel Neves.
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ * More information about the BLAKE2 hash function and its implementations
+ * can be found at https://blake2.net.
+ */
+
+#include <assert.h>
+#include <string.h>
+#include <openssl/crypto.h>
+#include "e_os.h"
+
+#include "blake2_locl.h"
+#include "blake2_impl.h"
+
+static const uint64_t blake2b_IV[8] =
+{
+ 0x6a09e667f3bcc908U, 0xbb67ae8584caa73bU,
+ 0x3c6ef372fe94f82bU, 0xa54ff53a5f1d36f1U,
+ 0x510e527fade682d1U, 0x9b05688c2b3e6c1fU,
+ 0x1f83d9abfb41bd6bU, 0x5be0cd19137e2179U
+};
+
+static const uint8_t blake2b_sigma[12][16] =
+{
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+};
+
+/* Set that it's the last block we'll compress */
+static ossl_inline void blake2b_set_lastblock(BLAKE2B_CTX *S)
+{
+ S->f[0] = -1;
+}
+
+/* Initialize the hashing state. */
+static ossl_inline void blake2b_init0(BLAKE2B_CTX *S)
+{
+ int i;
+
+ memset(S, 0, sizeof(BLAKE2B_CTX));
+ for (i = 0; i < 8; ++i) {
+ S->h[i] = blake2b_IV[i];
+ }
+}
+
+/* init xors IV with input parameter block */
+static void blake2b_init_param(BLAKE2B_CTX *S, const BLAKE2B_PARAM *P)
+{
+ size_t i;
+ const uint8_t *p = (const uint8_t *)(P);
+ blake2b_init0(S);
+
+ /* The param struct is carefully hand packed, and should be 64 bytes on
+ * every platform. */
+ assert(sizeof(BLAKE2B_PARAM) == 64);
+ /* IV XOR ParamBlock */
+ for (i = 0; i < 8; ++i) {
+ S->h[i] ^= load64(p + sizeof(S->h[i]) * i);
+ }
+}
+
+/* Initialize the hashing context. Always returns 1. */
+int BLAKE2b_Init(BLAKE2B_CTX *c)
+{
+ BLAKE2B_PARAM P[1];
+ P->digest_length = BLAKE2B_DIGEST_LENGTH;
+ P->key_length = 0;
+ P->fanout = 1;
+ P->depth = 1;
+ store32(P->leaf_length, 0);
+ store64(P->node_offset, 0);
+ P->node_depth = 0;
+ P->inner_length = 0;
+ memset(P->reserved, 0, sizeof(P->reserved));
+ memset(P->salt, 0, sizeof(P->salt));
+ memset(P->personal, 0, sizeof(P->personal));
+ blake2b_init_param(c, P);
+ return 1;
+}
+
+/* Permute the state while xoring in the block of data. */
+static void blake2b_compress(BLAKE2B_CTX *S,
+ const uint8_t *blocks,
+ size_t len)
+{
+ uint64_t m[16];
+ uint64_t v[16];
+ int i;
+ size_t increment;
+
+ /*
+ * There are two distinct usage vectors for this function:
+ *
+ * a) BLAKE2b_Update uses it to process complete blocks,
+ * possibly more than one at a time;
+ *
+ * b) BLAK2b_Final uses it to process last block, always
+ * single but possibly incomplete, in which case caller
+ * pads input with zeros.
+ */
+ assert(len < BLAKE2B_BLOCKBYTES || len % BLAKE2B_BLOCKBYTES == 0);
+
+ /*
+ * Since last block is always processed with separate call,
+ * |len| not being multiple of complete blocks can be observed
+ * only with |len| being less than BLAKE2B_BLOCKBYTES ("less"
+ * including even zero), which is why following assignment doesn't
+ * have to reside inside the main loop below.
+ */
+ increment = len < BLAKE2B_BLOCKBYTES ? len : BLAKE2B_BLOCKBYTES;
+
+ for (i = 0; i < 8; ++i) {
+ v[i] = S->h[i];
+ }
+
+ do {
+ for (i = 0; i < 16; ++i) {
+ m[i] = load64(blocks + i * sizeof(m[i]));
+ }
+
+ /* blake2b_increment_counter */
+ S->t[0] += increment;
+ S->t[1] += (S->t[0] < increment);
+
+ v[8] = blake2b_IV[0];
+ v[9] = blake2b_IV[1];
+ v[10] = blake2b_IV[2];
+ v[11] = blake2b_IV[3];
+ v[12] = S->t[0] ^ blake2b_IV[4];
+ v[13] = S->t[1] ^ blake2b_IV[5];
+ v[14] = S->f[0] ^ blake2b_IV[6];
+ v[15] = S->f[1] ^ blake2b_IV[7];
+#define G(r,i,a,b,c,d) \
+ do { \
+ a = a + b + m[blake2b_sigma[r][2*i+0]]; \
+ d = rotr64(d ^ a, 32); \
+ c = c + d; \
+ b = rotr64(b ^ c, 24); \
+ a = a + b + m[blake2b_sigma[r][2*i+1]]; \
+ d = rotr64(d ^ a, 16); \
+ c = c + d; \
+ b = rotr64(b ^ c, 63); \
+ } while (0)
+#define ROUND(r) \
+ do { \
+ G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+ G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+ G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+ G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+ G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+ G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+ } while (0)
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+ /* 3x size reduction on x86_64, almost 7x on ARMv8, 9x on ARMv4 */
+ for (i = 0; i < 12; i++) {
+ ROUND(i);
+ }
+#else
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+ ROUND(10);
+ ROUND(11);
+#endif
+
+ for (i = 0; i < 8; ++i) {
+ S->h[i] = v[i] ^= v[i + 8] ^ S->h[i];
+ }
+#undef G
+#undef ROUND
+ blocks += increment;
+ len -= increment;
+ } while (len);
+}
+
+/* Absorb the input data into the hash state. Always returns 1. */
+int BLAKE2b_Update(BLAKE2B_CTX *c, const void *data, size_t datalen)
+{
+ const uint8_t *in = data;
+ size_t fill;
+
+ /*
+ * Intuitively one would expect intermediate buffer, c->buf, to
+ * store incomplete blocks. But in this case we are interested to
+ * temporarily stash even complete blocks, because last one in the
+ * stream has to be treated in special way, and at this point we
+ * don't know if last block in *this* call is last one "ever". This
+ * is the reason for why |datalen| is compared as >, and not >=.
+ */
+ fill = sizeof(c->buf) - c->buflen;
+ if (datalen > fill) {
+ if (c->buflen) {
+ memcpy(c->buf + c->buflen, in, fill); /* Fill buffer */
+ blake2b_compress(c, c->buf, BLAKE2B_BLOCKBYTES);
+ c->buflen = 0;
+ in += fill;
+ datalen -= fill;
+ }
+ if (datalen > BLAKE2B_BLOCKBYTES) {
+ size_t stashlen = datalen % BLAKE2B_BLOCKBYTES;
+ /*
+ * If |datalen| is a multiple of the blocksize, stash
+ * last complete block, it can be final one...
+ */
+ stashlen = stashlen ? stashlen : BLAKE2B_BLOCKBYTES;
+ datalen -= stashlen;
+ blake2b_compress(c, in, datalen);
+ in += datalen;
+ datalen = stashlen;
+ }
+ }
+
+ assert(datalen <= BLAKE2B_BLOCKBYTES);
+
+ memcpy(c->buf + c->buflen, in, datalen);
+ c->buflen += datalen; /* Be lazy, do not compress */
+
+ return 1;
+}
+
+/*
+ * Calculate the final hash and save it in md.
+ * Always returns 1.
+ */
+int BLAKE2b_Final(unsigned char *md, BLAKE2B_CTX *c)
+{
+ int i;
+
+ blake2b_set_lastblock(c);
+ /* Padding */
+ memset(c->buf + c->buflen, 0, sizeof(c->buf) - c->buflen);
+ blake2b_compress(c, c->buf, c->buflen);
+
+ /* Output full hash to message digest */
+ for (i = 0; i < 8; ++i) {
+ store64(md + sizeof(c->h[i]) * i, c->h[i]);
+ }
+
+ OPENSSL_cleanse(c, sizeof(BLAKE2B_CTX));
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/blake2/blake2s.c b/openssl-1.1.0h/crypto/blake2/blake2s.c
new file mode 100644
index 0000000..0b3503e
--- /dev/null
+++ b/openssl-1.1.0h/crypto/blake2/blake2s.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Derived from the BLAKE2 reference implementation written by Samuel Neves.
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ * More information about the BLAKE2 hash function and its implementations
+ * can be found at https://blake2.net.
+ */
+
+#include <assert.h>
+#include <string.h>
+#include <openssl/crypto.h>
+#include "e_os.h"
+
+#include "blake2_locl.h"
+#include "blake2_impl.h"
+
+static const uint32_t blake2s_IV[8] =
+{
+ 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
+ 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
+};
+
+static const uint8_t blake2s_sigma[10][16] =
+{
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
+};
+
+/* Set that it's the last block we'll compress */
+static ossl_inline void blake2s_set_lastblock(BLAKE2S_CTX *S)
+{
+ S->f[0] = -1;
+}
+
+/* Initialize the hashing state. */
+static ossl_inline void blake2s_init0(BLAKE2S_CTX *S)
+{
+ int i;
+
+ memset(S, 0, sizeof(BLAKE2S_CTX));
+ for (i = 0; i < 8; ++i) {
+ S->h[i] = blake2s_IV[i];
+ }
+}
+
+/* init2 xors IV with input parameter block */
+static void blake2s_init_param(BLAKE2S_CTX *S, const BLAKE2S_PARAM *P)
+{
+ const uint8_t *p = (const uint8_t *)(P);
+ size_t i;
+
+ /* The param struct is carefully hand packed, and should be 32 bytes on
+ * every platform. */
+ assert(sizeof(BLAKE2S_PARAM) == 32);
+ blake2s_init0(S);
+ /* IV XOR ParamBlock */
+ for (i = 0; i < 8; ++i) {
+ S->h[i] ^= load32(&p[i*4]);
+ }
+}
+
+/* Initialize the hashing context. Always returns 1. */
+int BLAKE2s_Init(BLAKE2S_CTX *c)
+{
+ BLAKE2S_PARAM P[1];
+
+ P->digest_length = BLAKE2S_DIGEST_LENGTH;
+ P->key_length = 0;
+ P->fanout = 1;
+ P->depth = 1;
+ store32(P->leaf_length, 0);
+ store48(P->node_offset, 0);
+ P->node_depth = 0;
+ P->inner_length = 0;
+ memset(P->salt, 0, sizeof(P->salt));
+ memset(P->personal, 0, sizeof(P->personal));
+ blake2s_init_param(c, P);
+ return 1;
+}
+
+/* Permute the state while xoring in the block of data. */
+static void blake2s_compress(BLAKE2S_CTX *S,
+ const uint8_t *blocks,
+ size_t len)
+{
+ uint32_t m[16];
+ uint32_t v[16];
+ size_t i;
+ size_t increment;
+
+ /*
+ * There are two distinct usage vectors for this function:
+ *
+ * a) BLAKE2s_Update uses it to process complete blocks,
+ * possibly more than one at a time;
+ *
+ * b) BLAK2s_Final uses it to process last block, always
+ * single but possibly incomplete, in which case caller
+ * pads input with zeros.
+ */
+ assert(len < BLAKE2S_BLOCKBYTES || len % BLAKE2S_BLOCKBYTES == 0);
+
+ /*
+ * Since last block is always processed with separate call,
+ * |len| not being multiple of complete blocks can be observed
+ * only with |len| being less than BLAKE2S_BLOCKBYTES ("less"
+ * including even zero), which is why following assignment doesn't
+ * have to reside inside the main loop below.
+ */
+ increment = len < BLAKE2S_BLOCKBYTES ? len : BLAKE2S_BLOCKBYTES;
+
+ for (i = 0; i < 8; ++i) {
+ v[i] = S->h[i];
+ }
+
+ do {
+ for (i = 0; i < 16; ++i) {
+ m[i] = load32(blocks + i * sizeof(m[i]));
+ }
+
+ /* blake2s_increment_counter */
+ S->t[0] += increment;
+ S->t[1] += (S->t[0] < increment);
+
+ v[ 8] = blake2s_IV[0];
+ v[ 9] = blake2s_IV[1];
+ v[10] = blake2s_IV[2];
+ v[11] = blake2s_IV[3];
+ v[12] = S->t[0] ^ blake2s_IV[4];
+ v[13] = S->t[1] ^ blake2s_IV[5];
+ v[14] = S->f[0] ^ blake2s_IV[6];
+ v[15] = S->f[1] ^ blake2s_IV[7];
+#define G(r,i,a,b,c,d) \
+ do { \
+ a = a + b + m[blake2s_sigma[r][2*i+0]]; \
+ d = rotr32(d ^ a, 16); \
+ c = c + d; \
+ b = rotr32(b ^ c, 12); \
+ a = a + b + m[blake2s_sigma[r][2*i+1]]; \
+ d = rotr32(d ^ a, 8); \
+ c = c + d; \
+ b = rotr32(b ^ c, 7); \
+ } while (0)
+#define ROUND(r) \
+ do { \
+ G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+ G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+ G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+ G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+ G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+ G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+ } while (0)
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+ /* almost 3x reduction on x86_64, 4.5x on ARMv8, 4x on ARMv4 */
+ for (i = 0; i < 10; i++) {
+ ROUND(i);
+ }
+#else
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+#endif
+
+ for (i = 0; i < 8; ++i) {
+ S->h[i] = v[i] ^= v[i + 8] ^ S->h[i];
+ }
+#undef G
+#undef ROUND
+ blocks += increment;
+ len -= increment;
+ } while (len);
+}
+
+/* Absorb the input data into the hash state. Always returns 1. */
+int BLAKE2s_Update(BLAKE2S_CTX *c, const void *data, size_t datalen)
+{
+ const uint8_t *in = data;
+ size_t fill;
+
+ /*
+ * Intuitively one would expect intermediate buffer, c->buf, to
+ * store incomplete blocks. But in this case we are interested to
+ * temporarily stash even complete blocks, because last one in the
+ * stream has to be treated in special way, and at this point we
+ * don't know if last block in *this* call is last one "ever". This
+ * is the reason for why |datalen| is compared as >, and not >=.
+ */
+ fill = sizeof(c->buf) - c->buflen;
+ if (datalen > fill) {
+ if (c->buflen) {
+ memcpy(c->buf + c->buflen, in, fill); /* Fill buffer */
+ blake2s_compress(c, c->buf, BLAKE2S_BLOCKBYTES);
+ c->buflen = 0;
+ in += fill;
+ datalen -= fill;
+ }
+ if (datalen > BLAKE2S_BLOCKBYTES) {
+ size_t stashlen = datalen % BLAKE2S_BLOCKBYTES;
+ /*
+ * If |datalen| is a multiple of the blocksize, stash
+ * last complete block, it can be final one...
+ */
+ stashlen = stashlen ? stashlen : BLAKE2S_BLOCKBYTES;
+ datalen -= stashlen;
+ blake2s_compress(c, in, datalen);
+ in += datalen;
+ datalen = stashlen;
+ }
+ }
+
+ assert(datalen <= BLAKE2S_BLOCKBYTES);
+
+ memcpy(c->buf + c->buflen, in, datalen);
+ c->buflen += datalen; /* Be lazy, do not compress */
+
+ return 1;
+}
+
+/*
+ * Calculate the final hash and save it in md.
+ * Always returns 1.
+ */
+int BLAKE2s_Final(unsigned char *md, BLAKE2S_CTX *c)
+{
+ int i;
+
+ blake2s_set_lastblock(c);
+ /* Padding */
+ memset(c->buf + c->buflen, 0, sizeof(c->buf) - c->buflen);
+ blake2s_compress(c, c->buf, c->buflen);
+
+ /* Output full hash to temp buffer */
+ for (i = 0; i < 8; ++i) {
+ store32(md + sizeof(c->h[i]) * i, c->h[i]);
+ }
+
+ OPENSSL_cleanse(c, sizeof(BLAKE2S_CTX));
+ return 1;
+}
diff --git a/openssl-1.1.0h/crypto/blake2/build.info b/openssl-1.1.0h/crypto/blake2/build.info
new file mode 100644
index 0000000..0036f08
--- /dev/null
+++ b/openssl-1.1.0h/crypto/blake2/build.info
@@ -0,0 +1,3 @@
+LIBS=../../libcrypto
+SOURCE[../../libcrypto]=\
+ blake2b.c blake2s.c m_blake2b.c m_blake2s.c
diff --git a/openssl-1.1.0h/crypto/blake2/m_blake2b.c b/openssl-1.1.0h/crypto/blake2/m_blake2b.c
new file mode 100644
index 0000000..82c6f6b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/blake2/m_blake2b.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Derived from the BLAKE2 reference implementation written by Samuel Neves.
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ * More information about the BLAKE2 hash function and its implementations
+ * can be found at https://blake2.net.
+ */
+
+#include "internal/cryptlib.h"
+
+#ifndef OPENSSL_NO_BLAKE2
+
+# include <openssl/evp.h>
+# include <openssl/objects.h>
+# include "blake2_locl.h"
+# include "internal/evp_int.h"
+
+static int init(EVP_MD_CTX *ctx)
+{
+ return BLAKE2b_Init(EVP_MD_CTX_md_data(ctx));
+}
+
+static int update(EVP_MD_CTX *ctx, const void *data, size_t count)
+{
+ return BLAKE2b_Update(EVP_MD_CTX_md_data(ctx), data, count);
+}
+
+static int final(EVP_MD_CTX *ctx, unsigned char *md)
+{
+ return BLAKE2b_Final(md, EVP_MD_CTX_md_data(ctx));
+}
+
+static const EVP_MD blake2b_md = {
+ NID_blake2b512,
+ 0,
+ BLAKE2B_DIGEST_LENGTH,
+ 0,
+ init,
+ update,
+ final,
+ NULL,
+ NULL,
+ BLAKE2B_BLOCKBYTES,
+ sizeof(EVP_MD *) + sizeof(BLAKE2B_CTX),
+};
+
+const EVP_MD *EVP_blake2b512(void)
+{
+ return (&blake2b_md);
+}
+#endif
diff --git a/openssl-1.1.0h/crypto/blake2/m_blake2s.c b/openssl-1.1.0h/crypto/blake2/m_blake2s.c
new file mode 100644
index 0000000..467e91a
--- /dev/null
+++ b/openssl-1.1.0h/crypto/blake2/m_blake2s.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * Derived from the BLAKE2 reference implementation written by Samuel Neves.
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ * More information about the BLAKE2 hash function and its implementations
+ * can be found at https://blake2.net.
+ */
+
+#include "internal/cryptlib.h"
+
+#ifndef OPENSSL_NO_BLAKE2
+
+# include <openssl/evp.h>
+# include <openssl/objects.h>
+# include "blake2_locl.h"
+# include "internal/evp_int.h"
+
+static int init(EVP_MD_CTX *ctx)
+{
+ return BLAKE2s_Init(EVP_MD_CTX_md_data(ctx));
+}
+
+static int update(EVP_MD_CTX *ctx, const void *data, size_t count)
+{
+ return BLAKE2s_Update(EVP_MD_CTX_md_data(ctx), data, count);
+}
+
+static int final(EVP_MD_CTX *ctx, unsigned char *md)
+{
+ return BLAKE2s_Final(md, EVP_MD_CTX_md_data(ctx));
+}
+
+static const EVP_MD blake2s_md = {
+ NID_blake2s256,
+ 0,
+ BLAKE2S_DIGEST_LENGTH,
+ 0,
+ init,
+ update,
+ final,
+ NULL,
+ NULL,
+ BLAKE2S_BLOCKBYTES,
+ sizeof(EVP_MD *) + sizeof(BLAKE2S_CTX),
+};
+
+const EVP_MD *EVP_blake2s256(void)
+{
+ return (&blake2s_md);
+}
+#endif
diff --git a/openssl-1.1.0h/crypto/bn/README.pod b/openssl-1.1.0h/crypto/bn/README.pod
new file mode 100644
index 0000000..109ab0d
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/README.pod
@@ -0,0 +1,247 @@
+=pod
+
+=head1 NAME
+
+bn_mul_words, bn_mul_add_words, bn_sqr_words, bn_div_words,
+bn_add_words, bn_sub_words, bn_mul_comba4, bn_mul_comba8,
+bn_sqr_comba4, bn_sqr_comba8, bn_cmp_words, bn_mul_normal,
+bn_mul_low_normal, bn_mul_recursive, bn_mul_part_recursive,
+bn_mul_low_recursive, bn_mul_high, bn_sqr_normal, bn_sqr_recursive,
+bn_expand, bn_wexpand, bn_expand2, bn_fix_top, bn_check_top,
+bn_print, bn_dump, bn_set_max, bn_set_high, bn_set_low - BIGNUM
+library internal functions
+
+=head1 SYNOPSIS
+
+ #include <openssl/bn.h>
+
+ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w);
+ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num,
+ BN_ULONG w);
+ void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num);
+ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
+ BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,
+ int num);
+ BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,
+ int num);
+
+ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+ void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a);
+ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a);
+
+ int bn_cmp_words(BN_ULONG *a, BN_ULONG *b, int n);
+
+ void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b,
+ int nb);
+ void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n);
+ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
+ int dna, int dnb, BN_ULONG *tmp);
+ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b,
+ int n, int tna, int tnb, BN_ULONG *tmp);
+ void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b,
+ int n2, BN_ULONG *tmp);
+ void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l,
+ int n2, BN_ULONG *tmp);
+
+ void bn_sqr_normal(BN_ULONG *r, BN_ULONG *a, int n, BN_ULONG *tmp);
+ void bn_sqr_recursive(BN_ULONG *r, BN_ULONG *a, int n2, BN_ULONG *tmp);
+
+ void mul(BN_ULONG r, BN_ULONG a, BN_ULONG w, BN_ULONG c);
+ void mul_add(BN_ULONG r, BN_ULONG a, BN_ULONG w, BN_ULONG c);
+ void sqr(BN_ULONG r0, BN_ULONG r1, BN_ULONG a);
+
+ BIGNUM *bn_expand(BIGNUM *a, int bits);
+ BIGNUM *bn_wexpand(BIGNUM *a, int n);
+ BIGNUM *bn_expand2(BIGNUM *a, int n);
+ void bn_fix_top(BIGNUM *a);
+
+ void bn_check_top(BIGNUM *a);
+ void bn_print(BIGNUM *a);
+ void bn_dump(BN_ULONG *d, int n);
+ void bn_set_max(BIGNUM *a);
+ void bn_set_high(BIGNUM *r, BIGNUM *a, int n);
+ void bn_set_low(BIGNUM *r, BIGNUM *a, int n);
+
+=head1 DESCRIPTION
+
+This page documents the internal functions used by the OpenSSL
+B<BIGNUM> implementation. They are described here to facilitate
+debugging and extending the library. They are I<not> to be used by
+applications.
+
+=head2 The BIGNUM structure
+
+ typedef struct bignum_st BIGNUM;
+
+ struct bignum_st
+ {
+ BN_ULONG *d; /* Pointer to an array of 'BN_BITS2' bit chunks. */
+ int top; /* Index of last used d +1. */
+ /* The next are internal book keeping for bn_expand. */
+ int dmax; /* Size of the d array. */
+ int neg; /* one if the number is negative */
+ int flags;
+ };
+
+
+The integer value is stored in B<d>, a malloc()ed array of words (B<BN_ULONG>),
+least significant word first. A B<BN_ULONG> can be either 16, 32 or 64 bits
+in size, depending on the 'number of bits' (B<BITS2>) specified in
+C<openssl/bn.h>.
+
+B<dmax> is the size of the B<d> array that has been allocated. B<top>
+is the number of words being used, so for a value of 4, bn.d[0]=4 and
+bn.top=1. B<neg> is 1 if the number is negative. When a B<BIGNUM> is
+B<0>, the B<d> field can be B<NULL> and B<top> == B<0>.
+
+B<flags> is a bit field of flags which are defined in C<openssl/bn.h>. The
+flags begin with B<BN_FLG_>. The macros BN_set_flags(b, n) and
+BN_get_flags(b, n) exist to enable or fetch flag(s) B<n> from B<BIGNUM>
+structure B<b>.
+
+Various routines in this library require the use of temporary
+B<BIGNUM> variables during their execution. Since dynamic memory
+allocation to create B<BIGNUM>s is rather expensive when used in
+conjunction with repeated subroutine calls, the B<BN_CTX> structure is
+used. This structure contains B<BN_CTX_NUM> B<BIGNUM>s, see
+L<BN_CTX_start(3)>.
+
+=head2 Low-level arithmetic operations
+
+These functions are implemented in C and for several platforms in
+assembly language:
+
+bn_mul_words(B<rp>, B<ap>, B<num>, B<w>) operates on the B<num> word
+arrays B<rp> and B<ap>. It computes B<ap> * B<w>, places the result
+in B<rp>, and returns the high word (carry).
+
+bn_mul_add_words(B<rp>, B<ap>, B<num>, B<w>) operates on the B<num>
+word arrays B<rp> and B<ap>. It computes B<ap> * B<w> + B<rp>, places
+the result in B<rp>, and returns the high word (carry).
+
+bn_sqr_words(B<rp>, B<ap>, B<n>) operates on the B<num> word array
+B<ap> and the 2*B<num> word array B<ap>. It computes B<ap> * B<ap>
+word-wise, and places the low and high bytes of the result in B<rp>.
+
+bn_div_words(B<h>, B<l>, B<d>) divides the two word number (B<h>, B<l>)
+by B<d> and returns the result.
+
+bn_add_words(B<rp>, B<ap>, B<bp>, B<num>) operates on the B<num> word
+arrays B<ap>, B<bp> and B<rp>. It computes B<ap> + B<bp>, places the
+result in B<rp>, and returns the high word (carry).
+
+bn_sub_words(B<rp>, B<ap>, B<bp>, B<num>) operates on the B<num> word
+arrays B<ap>, B<bp> and B<rp>. It computes B<ap> - B<bp>, places the
+result in B<rp>, and returns the carry (1 if B<bp> E<gt> B<ap>, 0
+otherwise).
+
+bn_mul_comba4(B<r>, B<a>, B<b>) operates on the 4 word arrays B<a> and
+B<b> and the 8 word array B<r>. It computes B<a>*B<b> and places the
+result in B<r>.
+
+bn_mul_comba8(B<r>, B<a>, B<b>) operates on the 8 word arrays B<a> and
+B<b> and the 16 word array B<r>. It computes B<a>*B<b> and places the
+result in B<r>.
+
+bn_sqr_comba4(B<r>, B<a>, B<b>) operates on the 4 word arrays B<a> and
+B<b> and the 8 word array B<r>.
+
+bn_sqr_comba8(B<r>, B<a>, B<b>) operates on the 8 word arrays B<a> and
+B<b> and the 16 word array B<r>.
+
+The following functions are implemented in C:
+
+bn_cmp_words(B<a>, B<b>, B<n>) operates on the B<n> word arrays B<a>
+and B<b>. It returns 1, 0 and -1 if B<a> is greater than, equal and
+less than B<b>.
+
+bn_mul_normal(B<r>, B<a>, B<na>, B<b>, B<nb>) operates on the B<na>
+word array B<a>, the B<nb> word array B<b> and the B<na>+B<nb> word
+array B<r>. It computes B<a>*B<b> and places the result in B<r>.
+
+bn_mul_low_normal(B<r>, B<a>, B<b>, B<n>) operates on the B<n> word
+arrays B<r>, B<a> and B<b>. It computes the B<n> low words of
+B<a>*B<b> and places the result in B<r>.
+
+bn_mul_recursive(B<r>, B<a>, B<b>, B<n2>, B<dna>, B<dnb>, B<t>) operates
+on the word arrays B<a> and B<b> of length B<n2>+B<dna> and B<n2>+B<dnb>
+(B<dna> and B<dnb> are currently allowed to be 0 or negative) and the 2*B<n2>
+word arrays B<r> and B<t>. B<n2> must be a power of 2. It computes
+B<a>*B<b> and places the result in B<r>.
+
+bn_mul_part_recursive(B<r>, B<a>, B<b>, B<n>, B<tna>, B<tnb>, B<tmp>)
+operates on the word arrays B<a> and B<b> of length B<n>+B<tna> and
+B<n>+B<tnb> and the 4*B<n> word arrays B<r> and B<tmp>.
+
+bn_mul_low_recursive(B<r>, B<a>, B<b>, B<n2>, B<tmp>) operates on the
+B<n2> word arrays B<r> and B<tmp> and the B<n2>/2 word arrays B<a>
+and B<b>.
+
+bn_mul_high(B<r>, B<a>, B<b>, B<l>, B<n2>, B<tmp>) operates on the
+B<n2> word arrays B<r>, B<a>, B<b> and B<l> (?) and the 3*B<n2> word
+array B<tmp>.
+
+BN_mul() calls bn_mul_normal(), or an optimized implementation if the
+factors have the same size: bn_mul_comba8() is used if they are 8
+words long, bn_mul_recursive() if they are larger than
+B<BN_MULL_SIZE_NORMAL> and the size is an exact multiple of the word
+size, and bn_mul_part_recursive() for others that are larger than
+B<BN_MULL_SIZE_NORMAL>.
+
+bn_sqr_normal(B<r>, B<a>, B<n>, B<tmp>) operates on the B<n> word array
+B<a> and the 2*B<n> word arrays B<tmp> and B<r>.
+
+The implementations use the following macros which, depending on the
+architecture, may use "long long" C operations or inline assembler.
+They are defined in C<bn_lcl.h>.
+
+mul(B<r>, B<a>, B<w>, B<c>) computes B<w>*B<a>+B<c> and places the
+low word of the result in B<r> and the high word in B<c>.
+
+mul_add(B<r>, B<a>, B<w>, B<c>) computes B<w>*B<a>+B<r>+B<c> and
+places the low word of the result in B<r> and the high word in B<c>.
+
+sqr(B<r0>, B<r1>, B<a>) computes B<a>*B<a> and places the low word
+of the result in B<r0> and the high word in B<r1>.
+
+=head2 Size changes
+
+bn_expand() ensures that B<b> has enough space for a B<bits> bit
+number. bn_wexpand() ensures that B<b> has enough space for an
+B<n> word number. If the number has to be expanded, both macros
+call bn_expand2(), which allocates a new B<d> array and copies the
+data. They return B<NULL> on error, B<b> otherwise.
+
+The bn_fix_top() macro reduces B<a-E<gt>top> to point to the most
+significant non-zero word plus one when B<a> has shrunk.
+
+=head2 Debugging
+
+bn_check_top() verifies that C<((a)-E<gt>top E<gt>= 0 && (a)-E<gt>top
+E<lt>= (a)-E<gt>dmax)>. A violation will cause the program to abort.
+
+bn_print() prints B<a> to stderr. bn_dump() prints B<n> words at B<d>
+(in reverse order, i.e. most significant word first) to stderr.
+
+bn_set_max() makes B<a> a static number with a B<dmax> of its current size.
+This is used by bn_set_low() and bn_set_high() to make B<r> a read-only
+B<BIGNUM> that contains the B<n> low or high words of B<a>.
+
+If B<BN_DEBUG> is not defined, bn_check_top(), bn_print(), bn_dump()
+and bn_set_max() are defined as empty macros.
+
+=head1 SEE ALSO
+
+L<bn(3)>
+
+=head1 COPYRIGHT
+
+Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+
+Licensed under the OpenSSL license (the "License"). You may not use
+this file except in compliance with the License. You can obtain a copy
+in the file LICENSE in the source distribution or at
+L<https://www.openssl.org/source/license.html>.
+
+=cut
diff --git a/openssl-1.1.0h/crypto/bn/asm/alpha-mont.pl b/openssl-1.1.0h/crypto/bn/asm/alpha-mont.pl
new file mode 100644
index 0000000..1d68d6d
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/alpha-mont.pl
@@ -0,0 +1,331 @@
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# On 21264 RSA sign performance improves by 70/35/20/15 percent for
+# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
+# instructed to '-tune host' code with in-line assembler. Other
+# benchmarks improve by 15-20%. To anchor it to something else, the
+# code provides approximately the same performance per GHz as AMD64.
+# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
+# difference.
+
+$output=pop;
+open STDOUT,">$output";
+
+# int bn_mul_mont(
+$rp="a0"; # BN_ULONG *rp,
+$ap="a1"; # const BN_ULONG *ap,
+$bp="a2"; # const BN_ULONG *bp,
+$np="a3"; # const BN_ULONG *np,
+$n0="a4"; # const BN_ULONG *n0,
+$num="a5"; # int num);
+
+$lo0="t0";
+$hi0="t1";
+$lo1="t2";
+$hi1="t3";
+$aj="t4";
+$bi="t5";
+$nj="t6";
+$tp="t7";
+$alo="t8";
+$ahi="t9";
+$nlo="t10";
+$nhi="t11";
+$tj="t12";
+$i="s3";
+$j="s4";
+$m1="s5";
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+
+.globl bn_mul_mont
+.align 5
+.ent bn_mul_mont
+bn_mul_mont:
+ lda sp,-48(sp)
+ stq ra,0(sp)
+ stq s3,8(sp)
+ stq s4,16(sp)
+ stq s5,24(sp)
+ stq fp,32(sp)
+ mov sp,fp
+ .mask 0x0400f000,-48
+ .frame fp,48,ra
+ .prologue 0
+
+ .align 4
+ .set reorder
+ sextl $num,$num
+ mov 0,v0
+ cmplt $num,4,AT
+ bne AT,.Lexit
+
+ ldq $hi0,0($ap) # ap[0]
+ s8addq $num,16,AT
+ ldq $aj,8($ap)
+ subq sp,AT,sp
+ ldq $bi,0($bp) # bp[0]
+ lda AT,-4096(zero) # mov -4096,AT
+ ldq $n0,0($n0)
+ and sp,AT,sp
+
+ mulq $hi0,$bi,$lo0
+ ldq $hi1,0($np) # np[0]
+ umulh $hi0,$bi,$hi0
+ ldq $nj,8($np)
+
+ mulq $lo0,$n0,$m1
+
+ mulq $hi1,$m1,$lo1
+ umulh $hi1,$m1,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,AT
+ addq $hi1,AT,$hi1
+
+ mulq $aj,$bi,$alo
+ mov 2,$j
+ umulh $aj,$bi,$ahi
+ mov sp,$tp
+
+ mulq $nj,$m1,$nlo
+ s8addq $j,$ap,$aj
+ umulh $nj,$m1,$nhi
+ s8addq $j,$np,$nj
+.align 4
+.L1st:
+ .set noreorder
+ ldq $aj,0($aj)
+ addl $j,1,$j
+ ldq $nj,0($nj)
+ lda $tp,8($tp)
+
+ addq $alo,$hi0,$lo0
+ mulq $aj,$bi,$alo
+ cmpult $lo0,$hi0,AT
+ addq $nlo,$hi1,$lo1
+
+ mulq $nj,$m1,$nlo
+ addq $ahi,AT,$hi0
+ cmpult $lo1,$hi1,v0
+ cmplt $j,$num,$tj
+
+ umulh $aj,$bi,$ahi
+ addq $nhi,v0,$hi1
+ addq $lo1,$lo0,$lo1
+ s8addq $j,$ap,$aj
+
+ umulh $nj,$m1,$nhi
+ cmpult $lo1,$lo0,v0
+ addq $hi1,v0,$hi1
+ s8addq $j,$np,$nj
+
+ stq $lo1,-8($tp)
+ nop
+ unop
+ bne $tj,.L1st
+ .set reorder
+
+ addq $alo,$hi0,$lo0
+ addq $nlo,$hi1,$lo1
+ cmpult $lo0,$hi0,AT
+ cmpult $lo1,$hi1,v0
+ addq $ahi,AT,$hi0
+ addq $nhi,v0,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,v0
+ addq $hi1,v0,$hi1
+
+ stq $lo1,0($tp)
+
+ addq $hi1,$hi0,$hi1
+ cmpult $hi1,$hi0,AT
+ stq $hi1,8($tp)
+ stq AT,16($tp)
+
+ mov 1,$i
+.align 4
+.Louter:
+ s8addq $i,$bp,$bi
+ ldq $hi0,0($ap)
+ ldq $aj,8($ap)
+ ldq $bi,0($bi)
+ ldq $hi1,0($np)
+ ldq $nj,8($np)
+ ldq $tj,0(sp)
+
+ mulq $hi0,$bi,$lo0
+ umulh $hi0,$bi,$hi0
+
+ addq $lo0,$tj,$lo0
+ cmpult $lo0,$tj,AT
+ addq $hi0,AT,$hi0
+
+ mulq $lo0,$n0,$m1
+
+ mulq $hi1,$m1,$lo1
+ umulh $hi1,$m1,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,AT
+ mov 2,$j
+ addq $hi1,AT,$hi1
+
+ mulq $aj,$bi,$alo
+ mov sp,$tp
+ umulh $aj,$bi,$ahi
+
+ mulq $nj,$m1,$nlo
+ s8addq $j,$ap,$aj
+ umulh $nj,$m1,$nhi
+.align 4
+.Linner:
+ .set noreorder
+ ldq $tj,8($tp) #L0
+ nop #U1
+ ldq $aj,0($aj) #L1
+ s8addq $j,$np,$nj #U0
+
+ ldq $nj,0($nj) #L0
+ nop #U1
+ addq $alo,$hi0,$lo0 #L1
+ lda $tp,8($tp)
+
+ mulq $aj,$bi,$alo #U1
+ cmpult $lo0,$hi0,AT #L0
+ addq $nlo,$hi1,$lo1 #L1
+ addl $j,1,$j
+
+ mulq $nj,$m1,$nlo #U1
+ addq $ahi,AT,$hi0 #L0
+ addq $lo0,$tj,$lo0 #L1
+ cmpult $lo1,$hi1,v0 #U0
+
+ umulh $aj,$bi,$ahi #U1
+ cmpult $lo0,$tj,AT #L0
+ addq $lo1,$lo0,$lo1 #L1
+ addq $nhi,v0,$hi1 #U0
+
+ umulh $nj,$m1,$nhi #U1
+ s8addq $j,$ap,$aj #L0
+ cmpult $lo1,$lo0,v0 #L1
+ cmplt $j,$num,$tj #U0 # borrow $tj
+
+ addq $hi0,AT,$hi0 #L0
+ addq $hi1,v0,$hi1 #U1
+ stq $lo1,-8($tp) #L1
+ bne $tj,.Linner #U0
+ .set reorder
+
+ ldq $tj,8($tp)
+ addq $alo,$hi0,$lo0
+ addq $nlo,$hi1,$lo1
+ cmpult $lo0,$hi0,AT
+ cmpult $lo1,$hi1,v0
+ addq $ahi,AT,$hi0
+ addq $nhi,v0,$hi1
+
+ addq $lo0,$tj,$lo0
+ cmpult $lo0,$tj,AT
+ addq $hi0,AT,$hi0
+
+ ldq $tj,16($tp)
+ addq $lo1,$lo0,$j
+ cmpult $j,$lo0,v0
+ addq $hi1,v0,$hi1
+
+ addq $hi1,$hi0,$lo1
+ stq $j,0($tp)
+ cmpult $lo1,$hi0,$hi1
+ addq $lo1,$tj,$lo1
+ cmpult $lo1,$tj,AT
+ addl $i,1,$i
+ addq $hi1,AT,$hi1
+ stq $lo1,8($tp)
+ cmplt $i,$num,$tj # borrow $tj
+ stq $hi1,16($tp)
+ bne $tj,.Louter
+
+ s8addq $num,sp,$tj # &tp[num]
+ mov $rp,$bp # put rp aside
+ mov sp,$tp
+ mov sp,$ap
+ mov 0,$hi0 # clear borrow bit
+
+.align 4
+.Lsub: ldq $lo0,0($tp)
+ ldq $lo1,0($np)
+ lda $tp,8($tp)
+ lda $np,8($np)
+ subq $lo0,$lo1,$lo1 # tp[i]-np[i]
+ cmpult $lo0,$lo1,AT
+ subq $lo1,$hi0,$lo0
+ cmpult $lo1,$lo0,$hi0
+ or $hi0,AT,$hi0
+ stq $lo0,0($rp)
+ cmpult $tp,$tj,v0
+ lda $rp,8($rp)
+ bne v0,.Lsub
+
+ subq $hi1,$hi0,$hi0 # handle upmost overflow bit
+ mov sp,$tp
+ mov $bp,$rp # restore rp
+
+ and sp,$hi0,$ap
+ bic $bp,$hi0,$bp
+ bis $bp,$ap,$ap # ap=borrow?tp:rp
+
+.align 4
+.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
+ lda $tp,8($tp)
+ lda $rp,8($rp)
+ lda $ap,8($ap)
+ stq zero,-8($tp) # zap tp
+ cmpult $tp,$tj,AT
+ stq $aj,-8($rp)
+ bne AT,.Lcopy
+ mov 1,v0
+
+.Lexit:
+ .set noreorder
+ mov fp,sp
+ /*ldq ra,0(sp)*/
+ ldq s3,8(sp)
+ ldq s4,16(sp)
+ ldq s5,24(sp)
+ ldq fp,32(sp)
+ lda sp,48(sp)
+ ret (ra)
+.end bn_mul_mont
+.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/armv4-gf2m.pl b/openssl-1.1.0h/crypto/bn/asm/armv4-gf2m.pl
new file mode 100644
index 0000000..0bb5433
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/armv4-gf2m.pl
@@ -0,0 +1,332 @@
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
+# C for the time being... Except that it has two code paths: pure
+# integer code suitable for any ARMv4 and later CPU and NEON code
+# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
+# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
+# faster than compiler-generated code. For ECDH and ECDSA verify (but
+# not for ECDSA sign) it means 25%-45% improvement depending on key
+# length, more for longer keys. Even though NEON 1x1 multiplication
+# runs in even less cycles, ~30, improvement is measurable only on
+# longer keys. One has to optimize code elsewhere to get NEON glow...
+#
+# April 2014
+#
+# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
+# referred below, which improves ECDH and ECDSA verify benchmarks
+# by 18-40%.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+___
+################
+# private interface to mul_1x1_ialu
+#
+$a="r1";
+$b="r0";
+
+($a0,$a1,$a2,$a12,$a4,$a14)=
+($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
+
+$mask="r12";
+
+$code.=<<___;
+.type mul_1x1_ialu,%function
+.align 5
+mul_1x1_ialu:
+ mov $a0,#0
+ bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
+ str $a0,[sp,#0] @ tab[0]=0
+ add $a2,$a1,$a1 @ a2=a1<<1
+ str $a1,[sp,#4] @ tab[1]=a1
+ eor $a12,$a1,$a2 @ a1^a2
+ str $a2,[sp,#8] @ tab[2]=a2
+ mov $a4,$a1,lsl#2 @ a4=a1<<2
+ str $a12,[sp,#12] @ tab[3]=a1^a2
+ eor $a14,$a1,$a4 @ a1^a4
+ str $a4,[sp,#16] @ tab[4]=a4
+ eor $a0,$a2,$a4 @ a2^a4
+ str $a14,[sp,#20] @ tab[5]=a1^a4
+ eor $a12,$a12,$a4 @ a1^a2^a4
+ str $a0,[sp,#24] @ tab[6]=a2^a4
+ and $i0,$mask,$b,lsl#2
+ str $a12,[sp,#28] @ tab[7]=a1^a2^a4
+
+ and $i1,$mask,$b,lsr#1
+ ldr $lo,[sp,$i0] @ tab[b & 0x7]
+ and $i0,$mask,$b,lsr#4
+ ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
+ and $i1,$mask,$b,lsr#7
+ ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
+ eor $lo,$lo,$t1,lsl#3 @ stall
+ mov $hi,$t1,lsr#29
+ ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
+
+ and $i0,$mask,$b,lsr#10
+ eor $lo,$lo,$t0,lsl#6
+ eor $hi,$hi,$t0,lsr#26
+ ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
+
+ and $i1,$mask,$b,lsr#13
+ eor $lo,$lo,$t1,lsl#9
+ eor $hi,$hi,$t1,lsr#23
+ ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
+
+ and $i0,$mask,$b,lsr#16
+ eor $lo,$lo,$t0,lsl#12
+ eor $hi,$hi,$t0,lsr#20
+ ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
+
+ and $i1,$mask,$b,lsr#19
+ eor $lo,$lo,$t1,lsl#15
+ eor $hi,$hi,$t1,lsr#17
+ ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
+
+ and $i0,$mask,$b,lsr#22
+ eor $lo,$lo,$t0,lsl#18
+ eor $hi,$hi,$t0,lsr#14
+ ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
+
+ and $i1,$mask,$b,lsr#25
+ eor $lo,$lo,$t1,lsl#21
+ eor $hi,$hi,$t1,lsr#11
+ ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
+
+ tst $a,#1<<30
+ and $i0,$mask,$b,lsr#28
+ eor $lo,$lo,$t0,lsl#24
+ eor $hi,$hi,$t0,lsr#8
+ ldr $t0,[sp,$i0] @ tab[b >> 30 ]
+
+#ifdef __thumb2__
+ itt ne
+#endif
+ eorne $lo,$lo,$b,lsl#30
+ eorne $hi,$hi,$b,lsr#2
+ tst $a,#1<<31
+ eor $lo,$lo,$t1,lsl#27
+ eor $hi,$hi,$t1,lsr#5
+#ifdef __thumb2__
+ itt ne
+#endif
+ eorne $lo,$lo,$b,lsl#31
+ eorne $hi,$hi,$b,lsr#1
+ eor $lo,$lo,$t0,lsl#30
+ eor $hi,$hi,$t0,lsr#2
+
+ mov pc,lr
+.size mul_1x1_ialu,.-mul_1x1_ialu
+___
+################
+# void bn_GF2m_mul_2x2(BN_ULONG *r,
+# BN_ULONG a1,BN_ULONG a0,
+# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
+{
+$code.=<<___;
+.global bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,%function
+.align 5
+bn_GF2m_mul_2x2:
+#if __ARM_MAX_ARCH__>=7
+ stmdb sp!,{r10,lr}
+ ldr r12,.LOPENSSL_armcap
+ adr r10,.LOPENSSL_armcap
+ ldr r12,[r12,r10]
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
+ tst r12,#ARMV7_NEON
+ itt ne
+ ldrne r10,[sp],#8
+ bne .LNEON
+ stmdb sp!,{r4-r9}
+#else
+ stmdb sp!,{r4-r10,lr}
+#endif
+___
+$ret="r10"; # reassigned 1st argument
+$code.=<<___;
+ mov $ret,r0 @ reassign 1st argument
+ mov $b,r3 @ $b=b1
+ sub r7,sp,#36
+ mov r8,sp
+ and r7,r7,#-32
+ ldr r3,[sp,#32] @ load b0
+ mov $mask,#7<<2
+ mov sp,r7 @ allocate tab[8]
+ str r8,[r7,#32]
+
+ bl mul_1x1_ialu @ a1·b1
+ str $lo,[$ret,#8]
+ str $hi,[$ret,#12]
+
+ eor $b,$b,r3 @ flip b0 and b1
+ eor $a,$a,r2 @ flip a0 and a1
+ eor r3,r3,$b
+ eor r2,r2,$a
+ eor $b,$b,r3
+ eor $a,$a,r2
+ bl mul_1x1_ialu @ a0·b0
+ str $lo,[$ret]
+ str $hi,[$ret,#4]
+
+ eor $a,$a,r2
+ eor $b,$b,r3
+ bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
+___
+@r=map("r$_",(6..9));
+$code.=<<___;
+ ldmia $ret,{@r[0]-@r[3]}
+ eor $lo,$lo,$hi
+ ldr sp,[sp,#32] @ destroy tab[8]
+ eor $hi,$hi,@r[1]
+ eor $lo,$lo,@r[0]
+ eor $hi,$hi,@r[2]
+ eor $lo,$lo,@r[3]
+ eor $hi,$hi,@r[3]
+ str $hi,[$ret,#8]
+ eor $lo,$lo,$hi
+ str $lo,[$ret,#4]
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r10,pc}
+#else
+ ldmia sp!,{r4-r10,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+___
+}
+{
+my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
+my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.align 5
+.LNEON:
+ ldr r12, [sp] @ 5th argument
+ vmov $a, r2, r1
+ vmov $b, r12, r3
+ vmov.i64 $k48, #0x0000ffffffffffff
+ vmov.i64 $k32, #0x00000000ffffffff
+ vmov.i64 $k16, #0x000000000000ffff
+
+ vext.8 $t0#lo, $a, $a, #1 @ A1
+ vmull.p8 $t0, $t0#lo, $b @ F = A1*B
+ vext.8 $r#lo, $b, $b, #1 @ B1
+ vmull.p8 $r, $a, $r#lo @ E = A*B1
+ vext.8 $t1#lo, $a, $a, #2 @ A2
+ vmull.p8 $t1, $t1#lo, $b @ H = A2*B
+ vext.8 $t3#lo, $b, $b, #2 @ B2
+ vmull.p8 $t3, $a, $t3#lo @ G = A*B2
+ vext.8 $t2#lo, $a, $a, #3 @ A3
+ veor $t0, $t0, $r @ L = E + F
+ vmull.p8 $t2, $t2#lo, $b @ J = A3*B
+ vext.8 $r#lo, $b, $b, #3 @ B3
+ veor $t1, $t1, $t3 @ M = G + H
+ vmull.p8 $r, $a, $r#lo @ I = A*B3
+ veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
+ vand $t0#hi, $t0#hi, $k48
+ vext.8 $t3#lo, $b, $b, #4 @ B4
+ veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
+ vand $t1#hi, $t1#hi, $k32
+ vmull.p8 $t3, $a, $t3#lo @ K = A*B4
+ veor $t2, $t2, $r @ N = I + J
+ veor $t0#lo, $t0#lo, $t0#hi
+ veor $t1#lo, $t1#lo, $t1#hi
+ veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
+ vand $t2#hi, $t2#hi, $k16
+ vext.8 $t0, $t0, $t0, #15
+ veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 $t3#hi, #0
+ vext.8 $t1, $t1, $t1, #14
+ veor $t2#lo, $t2#lo, $t2#hi
+ vmull.p8 $r, $a, $b @ D = A*B
+ vext.8 $t3, $t3, $t3, #12
+ vext.8 $t2, $t2, $t2, #13
+ veor $t0, $t0, $t1
+ veor $t2, $t2, $t3
+ veor $r, $r, $t0
+ veor $r, $r, $t2
+
+ vst1.32 {$r}, [r0]
+ ret @ bx lr
+#endif
+___
+}
+$code.=<<___;
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_MAX_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.
+#endif
+.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 5
+
+#if __ARM_MAX_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+close STDOUT; # enforce flush
diff --git a/openssl-1.1.0h/crypto/bn/asm/armv4-mont.pl b/openssl-1.1.0h/crypto/bn/asm/armv4-mont.pl
new file mode 100644
index 0000000..0dc4fe9
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/armv4-mont.pl
@@ -0,0 +1,756 @@
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2007.
+
+# Montgomery multiplication for ARMv4.
+#
+# Performance improvement naturally varies among CPU implementations
+# and compilers. The code was observed to provide +65-35% improvement
+# [depending on key length, less for longer keys] on ARM920T, and
+# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
+# base and compiler generated code with in-lined umull and even umlal
+# instructions. The latter means that this code didn't really have an
+# "advantage" of utilizing some "secret" instruction.
+#
+# The code is interoperable with Thumb ISA and is rather compact, less
+# than 1/2KB. Windows CE port would be trivial, as it's exclusively
+# about decorations, ABI and instruction syntax are identical.
+
+# November 2013
+#
+# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
+# performance improvement on Cortex-A8 is ~45-100% depending on key
+# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
+# On Snapdragon S4 improvement was measured to vary from ~70% to
+# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
+# rather because original integer-only code seems to perform
+# suboptimally on S4. Situation on Cortex-A9 is unfortunately
+# different. It's being looked into, but the trouble is that
+# performance for vectors longer than 256 bits is actually couple
+# of percent worse than for integer-only code. The code is chosen
+# for execution on all NEON-capable processors, because gain on
+# others outweighs the marginal loss on Cortex-A9.
+
+# September 2015
+#
+# Align Cortex-A9 performance with November 2013 improvements, i.e.
+# NEON code is now ~20-105% faster than integer-only one on this
+# processor. But this optimization further improved performance even
+# on other processors: NEON code path is ~45-180% faster than original
+# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
+# Snapdragon S4.
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+$num="r0"; # starts as num argument, but holds &tp[num-1]
+$ap="r1";
+$bp="r2"; $bi="r2"; $rp="r2";
+$np="r3";
+$tp="r4";
+$aj="r5";
+$nj="r6";
+$tj="r7";
+$n0="r8";
+########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
+$alo="r10"; # sl, gcc uses it to keep @GOT
+$ahi="r11"; # fp
+$nlo="r12"; # ip
+########### # r13 is stack pointer
+$nhi="r14"; # lr
+########### # r15 is program counter
+
+#### argument block layout relative to &tp[num-1], a.k.a. $num
+$_rp="$num,#12*4";
+# ap permanently resides in r1
+$_bp="$num,#13*4";
+# np permanently resides in r3
+$_n0="$num,#14*4";
+$_num="$num,#15*4"; $_bpend=$_num;
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.Lbn_mul_mont
+#endif
+
+.global bn_mul_mont
+.type bn_mul_mont,%function
+
+.align 5
+bn_mul_mont:
+.Lbn_mul_mont:
+ ldr ip,[sp,#4] @ load num
+ stmdb sp!,{r0,r2} @ sp points at argument block
+#if __ARM_MAX_ARCH__>=7
+ tst ip,#7
+ bne .Lialu
+ adr r0,.Lbn_mul_mont
+ ldr r2,.LOPENSSL_armcap
+ ldr r0,[r0,r2]
+#ifdef __APPLE__
+ ldr r0,[r0]
+#endif
+ tst r0,#ARMV7_NEON @ NEON available?
+ ldmia sp, {r0,r2}
+ beq .Lialu
+ add sp,sp,#8
+ b bn_mul8x_mont_neon
+.align 4
+.Lialu:
+#endif
+ cmp ip,#2
+ mov $num,ip @ load num
+#ifdef __thumb2__
+ ittt lt
+#endif
+ movlt r0,#0
+ addlt sp,sp,#2*4
+ blt .Labrt
+
+ stmdb sp!,{r4-r12,lr} @ save 10 registers
+
+ mov $num,$num,lsl#2 @ rescale $num for byte count
+ sub sp,sp,$num @ alloca(4*num)
+ sub sp,sp,#4 @ +extra dword
+ sub $num,$num,#4 @ "num=num-1"
+ add $tp,$bp,$num @ &bp[num-1]
+
+ add $num,sp,$num @ $num to point at &tp[num-1]
+ ldr $n0,[$_n0] @ &n0
+ ldr $bi,[$bp] @ bp[0]
+ ldr $aj,[$ap],#4 @ ap[0],ap++
+ ldr $nj,[$np],#4 @ np[0],np++
+ ldr $n0,[$n0] @ *n0
+ str $tp,[$_bpend] @ save &bp[num]
+
+ umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
+ str $n0,[$_n0] @ save n0 value
+ mul $n0,$alo,$n0 @ "tp[0]"*n0
+ mov $nlo,#0
+ umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
+ mov $tp,sp
+
+.L1st:
+ ldr $aj,[$ap],#4 @ ap[j],ap++
+ mov $alo,$ahi
+ ldr $nj,[$np],#4 @ np[j],np++
+ mov $ahi,#0
+ umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
+ mov $nhi,#0
+ umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
+ adds $nlo,$nlo,$alo
+ str $nlo,[$tp],#4 @ tp[j-1]=,tp++
+ adc $nlo,$nhi,#0
+ cmp $tp,$num
+ bne .L1st
+
+ adds $nlo,$nlo,$ahi
+ ldr $tp,[$_bp] @ restore bp
+ mov $nhi,#0
+ ldr $n0,[$_n0] @ restore n0
+ adc $nhi,$nhi,#0
+ str $nlo,[$num] @ tp[num-1]=
+ mov $tj,sp
+ str $nhi,[$num,#4] @ tp[num]=
+
+.Louter:
+ sub $tj,$num,$tj @ "original" $num-1 value
+ sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
+ ldr $bi,[$tp,#4]! @ *(++bp)
+ sub $np,$np,$tj @ "rewind" np to &np[1]
+ ldr $aj,[$ap,#-4] @ ap[0]
+ ldr $alo,[sp] @ tp[0]
+ ldr $nj,[$np,#-4] @ np[0]
+ ldr $tj,[sp,#4] @ tp[1]
+
+ mov $ahi,#0
+ umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
+ str $tp,[$_bp] @ save bp
+ mul $n0,$alo,$n0
+ mov $nlo,#0
+ umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
+ mov $tp,sp
+
+.Linner:
+ ldr $aj,[$ap],#4 @ ap[j],ap++
+ adds $alo,$ahi,$tj @ +=tp[j]
+ ldr $nj,[$np],#4 @ np[j],np++
+ mov $ahi,#0
+ umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
+ mov $nhi,#0
+ umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
+ adc $ahi,$ahi,#0
+ ldr $tj,[$tp,#8] @ tp[j+1]
+ adds $nlo,$nlo,$alo
+ str $nlo,[$tp],#4 @ tp[j-1]=,tp++
+ adc $nlo,$nhi,#0
+ cmp $tp,$num
+ bne .Linner
+
+ adds $nlo,$nlo,$ahi
+ mov $nhi,#0
+ ldr $tp,[$_bp] @ restore bp
+ adc $nhi,$nhi,#0
+ ldr $n0,[$_n0] @ restore n0
+ adds $nlo,$nlo,$tj
+ ldr $tj,[$_bpend] @ restore &bp[num]
+ adc $nhi,$nhi,#0
+ str $nlo,[$num] @ tp[num-1]=
+ str $nhi,[$num,#4] @ tp[num]=
+
+ cmp $tp,$tj
+#ifdef __thumb2__
+ itt ne
+#endif
+ movne $tj,sp
+ bne .Louter
+
+ ldr $rp,[$_rp] @ pull rp
+ mov $aj,sp
+ add $num,$num,#4 @ $num to point at &tp[num]
+ sub $aj,$num,$aj @ "original" num value
+ mov $tp,sp @ "rewind" $tp
+ mov $ap,$tp @ "borrow" $ap
+ sub $np,$np,$aj @ "rewind" $np to &np[0]
+
+ subs $tj,$tj,$tj @ "clear" carry flag
+.Lsub: ldr $tj,[$tp],#4
+ ldr $nj,[$np],#4
+ sbcs $tj,$tj,$nj @ tp[j]-np[j]
+ str $tj,[$rp],#4 @ rp[j]=
+ teq $tp,$num @ preserve carry
+ bne .Lsub
+ sbcs $nhi,$nhi,#0 @ upmost carry
+ mov $tp,sp @ "rewind" $tp
+ sub $rp,$rp,$aj @ "rewind" $rp
+
+ and $ap,$tp,$nhi
+ bic $np,$rp,$nhi
+ orr $ap,$ap,$np @ ap=borrow?tp:rp
+
+.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
+ str sp,[$tp],#4 @ zap tp
+ str $tj,[$rp],#4
+ cmp $tp,$num
+ bne .Lcopy
+
+ mov sp,$num
+ add sp,sp,#4 @ skip over tp[num+1]
+ ldmia sp!,{r4-r12,lr} @ restore registers
+ add sp,sp,#2*4 @ skip over {r0,r2}
+ mov r0,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size bn_mul_mont,.-bn_mul_mont
+___
+{
+my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
+my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
+my ($Z,$Temp)=("q4","q5");
+my @ACC=map("q$_",(6..13));
+my ($Bi,$Ni,$M0)=map("d$_",(28..31));
+my $zero="$Z#lo";
+my $temp="$Temp#lo";
+
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.type bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+ mov ip,sp
+ stmdb sp!,{r4-r11}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldmia ip,{r4-r5} @ load rest of parameter block
+ mov ip,sp
+
+ cmp $num,#8
+ bhi .LNEON_8n
+
+ @ special case for $num==8, everything is in register bank...
+
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
+ veor $zero,$zero,$zero
+ sub $toutptr,sp,$num,lsl#4
+ vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
+ and $toutptr,$toutptr,#-64
+ vld1.32 {${M0}[0]}, [$n0,:32]
+ mov sp,$toutptr @ alloca
+ vzip.16 $Bi,$zero
+
+ vmull.u32 @ACC[0],$Bi,${A0}[0]
+ vmull.u32 @ACC[1],$Bi,${A0}[1]
+ vmull.u32 @ACC[2],$Bi,${A1}[0]
+ vshl.i64 $Ni,@ACC[0]#hi,#16
+ vmull.u32 @ACC[3],$Bi,${A1}[1]
+
+ vadd.u64 $Ni,$Ni,@ACC[0]#lo
+ veor $zero,$zero,$zero
+ vmul.u32 $Ni,$Ni,$M0
+
+ vmull.u32 @ACC[4],$Bi,${A2}[0]
+ vld1.32 {$N0-$N3}, [$nptr]!
+ vmull.u32 @ACC[5],$Bi,${A2}[1]
+ vmull.u32 @ACC[6],$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmull.u32 @ACC[7],$Bi,${A3}[1]
+
+ vmlal.u32 @ACC[0],$Ni,${N0}[0]
+ sub $outer,$num,#1
+ vmlal.u32 @ACC[1],$Ni,${N0}[1]
+ vmlal.u32 @ACC[2],$Ni,${N1}[0]
+ vmlal.u32 @ACC[3],$Ni,${N1}[1]
+
+ vmlal.u32 @ACC[4],$Ni,${N2}[0]
+ vmov $Temp,@ACC[0]
+ vmlal.u32 @ACC[5],$Ni,${N2}[1]
+ vmov @ACC[0],@ACC[1]
+ vmlal.u32 @ACC[6],$Ni,${N3}[0]
+ vmov @ACC[1],@ACC[2]
+ vmlal.u32 @ACC[7],$Ni,${N3}[1]
+ vmov @ACC[2],@ACC[3]
+ vmov @ACC[3],@ACC[4]
+ vshr.u64 $temp,$temp,#16
+ vmov @ACC[4],@ACC[5]
+ vmov @ACC[5],@ACC[6]
+ vadd.u64 $temp,$temp,$Temp#hi
+ vmov @ACC[6],@ACC[7]
+ veor @ACC[7],@ACC[7]
+ vshr.u64 $temp,$temp,#16
+
+ b .LNEON_outer8
+
+.align 4
+.LNEON_outer8:
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
+ veor $zero,$zero,$zero
+ vzip.16 $Bi,$zero
+ vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
+
+ vmlal.u32 @ACC[0],$Bi,${A0}[0]
+ vmlal.u32 @ACC[1],$Bi,${A0}[1]
+ vmlal.u32 @ACC[2],$Bi,${A1}[0]
+ vshl.i64 $Ni,@ACC[0]#hi,#16
+ vmlal.u32 @ACC[3],$Bi,${A1}[1]
+
+ vadd.u64 $Ni,$Ni,@ACC[0]#lo
+ veor $zero,$zero,$zero
+ subs $outer,$outer,#1
+ vmul.u32 $Ni,$Ni,$M0
+
+ vmlal.u32 @ACC[4],$Bi,${A2}[0]
+ vmlal.u32 @ACC[5],$Bi,${A2}[1]
+ vmlal.u32 @ACC[6],$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmlal.u32 @ACC[7],$Bi,${A3}[1]
+
+ vmlal.u32 @ACC[0],$Ni,${N0}[0]
+ vmlal.u32 @ACC[1],$Ni,${N0}[1]
+ vmlal.u32 @ACC[2],$Ni,${N1}[0]
+ vmlal.u32 @ACC[3],$Ni,${N1}[1]
+
+ vmlal.u32 @ACC[4],$Ni,${N2}[0]
+ vmov $Temp,@ACC[0]
+ vmlal.u32 @ACC[5],$Ni,${N2}[1]
+ vmov @ACC[0],@ACC[1]
+ vmlal.u32 @ACC[6],$Ni,${N3}[0]
+ vmov @ACC[1],@ACC[2]
+ vmlal.u32 @ACC[7],$Ni,${N3}[1]
+ vmov @ACC[2],@ACC[3]
+ vmov @ACC[3],@ACC[4]
+ vshr.u64 $temp,$temp,#16
+ vmov @ACC[4],@ACC[5]
+ vmov @ACC[5],@ACC[6]
+ vadd.u64 $temp,$temp,$Temp#hi
+ vmov @ACC[6],@ACC[7]
+ veor @ACC[7],@ACC[7]
+ vshr.u64 $temp,$temp,#16
+
+ bne .LNEON_outer8
+
+ vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
+ mov $toutptr,sp
+ vshr.u64 $temp,@ACC[0]#lo,#16
+ mov $inner,$num
+ vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
+ add $tinptr,sp,#96
+ vshr.u64 $temp,@ACC[0]#hi,#16
+ vzip.16 @ACC[0]#lo,@ACC[0]#hi
+
+ b .LNEON_tail_entry
+
+.align 4
+.LNEON_8n:
+ veor @ACC[0],@ACC[0],@ACC[0]
+ sub $toutptr,sp,#128
+ veor @ACC[1],@ACC[1],@ACC[1]
+ sub $toutptr,$toutptr,$num,lsl#4
+ veor @ACC[2],@ACC[2],@ACC[2]
+ and $toutptr,$toutptr,#-64
+ veor @ACC[3],@ACC[3],@ACC[3]
+ mov sp,$toutptr @ alloca
+ veor @ACC[4],@ACC[4],@ACC[4]
+ add $toutptr,$toutptr,#256
+ veor @ACC[5],@ACC[5],@ACC[5]
+ sub $inner,$num,#8
+ veor @ACC[6],@ACC[6],@ACC[6]
+ veor @ACC[7],@ACC[7],@ACC[7]
+
+.LNEON_8n_init:
+ vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
+ subs $inner,$inner,#8
+ vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
+ vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
+ vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
+ bne .LNEON_8n_init
+
+ add $tinptr,sp,#256
+ vld1.32 {$A0-$A3},[$aptr]!
+ add $bnptr,sp,#8
+ vld1.32 {${M0}[0]},[$n0,:32]
+ mov $outer,$num
+ b .LNEON_8n_outer
+
+.align 4
+.LNEON_8n_outer:
+ vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
+ veor $zero,$zero,$zero
+ vzip.16 $Bi,$zero
+ add $toutptr,sp,#128
+ vld1.32 {$N0-$N3},[$nptr]!
+
+ vmlal.u32 @ACC[0],$Bi,${A0}[0]
+ vmlal.u32 @ACC[1],$Bi,${A0}[1]
+ veor $zero,$zero,$zero
+ vmlal.u32 @ACC[2],$Bi,${A1}[0]
+ vshl.i64 $Ni,@ACC[0]#hi,#16
+ vmlal.u32 @ACC[3],$Bi,${A1}[1]
+ vadd.u64 $Ni,$Ni,@ACC[0]#lo
+ vmlal.u32 @ACC[4],$Bi,${A2}[0]
+ vmul.u32 $Ni,$Ni,$M0
+ vmlal.u32 @ACC[5],$Bi,${A2}[1]
+ vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
+ vmlal.u32 @ACC[6],$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmlal.u32 @ACC[7],$Bi,${A3}[1]
+___
+for ($i=0; $i<7;) {
+$code.=<<___;
+ vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
+ vmlal.u32 @ACC[0],$Ni,${N0}[0]
+ veor $temp,$temp,$temp
+ vmlal.u32 @ACC[1],$Ni,${N0}[1]
+ vzip.16 $Bi,$temp
+ vmlal.u32 @ACC[2],$Ni,${N1}[0]
+ vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
+ vmlal.u32 @ACC[3],$Ni,${N1}[1]
+ vmlal.u32 @ACC[4],$Ni,${N2}[0]
+ vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
+ vmlal.u32 @ACC[5],$Ni,${N2}[1]
+ vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
+ vmlal.u32 @ACC[6],$Ni,${N3}[0]
+ vmlal.u32 @ACC[7],$Ni,${N3}[1]
+ vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
+ vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
+___
+ push(@ACC,shift(@ACC)); $i++;
+$code.=<<___;
+ vmlal.u32 @ACC[0],$Bi,${A0}[0]
+ vld1.64 {@ACC[7]},[$tinptr,:128]!
+ vmlal.u32 @ACC[1],$Bi,${A0}[1]
+ veor $zero,$zero,$zero
+ vmlal.u32 @ACC[2],$Bi,${A1}[0]
+ vshl.i64 $Ni,@ACC[0]#hi,#16
+ vmlal.u32 @ACC[3],$Bi,${A1}[1]
+ vadd.u64 $Ni,$Ni,@ACC[0]#lo
+ vmlal.u32 @ACC[4],$Bi,${A2}[0]
+ vmul.u32 $Ni,$Ni,$M0
+ vmlal.u32 @ACC[5],$Bi,${A2}[1]
+ vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
+ vmlal.u32 @ACC[6],$Bi,${A3}[0]
+ vzip.16 $Ni,$zero
+ vmlal.u32 @ACC[7],$Bi,${A3}[1]
+___
+}
+$code.=<<___;
+ vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
+ vmlal.u32 @ACC[0],$Ni,${N0}[0]
+ vld1.32 {$A0-$A3},[$aptr]!
+ vmlal.u32 @ACC[1],$Ni,${N0}[1]
+ vmlal.u32 @ACC[2],$Ni,${N1}[0]
+ vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
+ vmlal.u32 @ACC[3],$Ni,${N1}[1]
+ vmlal.u32 @ACC[4],$Ni,${N2}[0]
+ vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
+ vmlal.u32 @ACC[5],$Ni,${N2}[1]
+ vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
+ vmlal.u32 @ACC[6],$Ni,${N3}[0]
+ vmlal.u32 @ACC[7],$Ni,${N3}[1]
+ vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
+ vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
+ add $bnptr,sp,#8 @ rewind
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ sub $inner,$num,#8
+ b .LNEON_8n_inner
+
+.align 4
+.LNEON_8n_inner:
+ subs $inner,$inner,#8
+ vmlal.u32 @ACC[0],$Bi,${A0}[0]
+ vld1.64 {@ACC[7]},[$tinptr,:128]
+ vmlal.u32 @ACC[1],$Bi,${A0}[1]
+ vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
+ vmlal.u32 @ACC[2],$Bi,${A1}[0]
+ vld1.32 {$N0-$N3},[$nptr]!
+ vmlal.u32 @ACC[3],$Bi,${A1}[1]
+ it ne
+ addne $tinptr,$tinptr,#16 @ don't advance in last iteration
+ vmlal.u32 @ACC[4],$Bi,${A2}[0]
+ vmlal.u32 @ACC[5],$Bi,${A2}[1]
+ vmlal.u32 @ACC[6],$Bi,${A3}[0]
+ vmlal.u32 @ACC[7],$Bi,${A3}[1]
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+ vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
+ vmlal.u32 @ACC[0],$Ni,${N0}[0]
+ vmlal.u32 @ACC[1],$Ni,${N0}[1]
+ vmlal.u32 @ACC[2],$Ni,${N1}[0]
+ vmlal.u32 @ACC[3],$Ni,${N1}[1]
+ vmlal.u32 @ACC[4],$Ni,${N2}[0]
+ vmlal.u32 @ACC[5],$Ni,${N2}[1]
+ vmlal.u32 @ACC[6],$Ni,${N3}[0]
+ vmlal.u32 @ACC[7],$Ni,${N3}[1]
+ vst1.64 {@ACC[0]},[$toutptr,:128]!
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ vmlal.u32 @ACC[0],$Bi,${A0}[0]
+ vld1.64 {@ACC[7]},[$tinptr,:128]
+ vmlal.u32 @ACC[1],$Bi,${A0}[1]
+ vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
+ vmlal.u32 @ACC[2],$Bi,${A1}[0]
+ it ne
+ addne $tinptr,$tinptr,#16 @ don't advance in last iteration
+ vmlal.u32 @ACC[3],$Bi,${A1}[1]
+ vmlal.u32 @ACC[4],$Bi,${A2}[0]
+ vmlal.u32 @ACC[5],$Bi,${A2}[1]
+ vmlal.u32 @ACC[6],$Bi,${A3}[0]
+ vmlal.u32 @ACC[7],$Bi,${A3}[1]
+___
+}
+$code.=<<___;
+ it eq
+ subeq $aptr,$aptr,$num,lsl#2 @ rewind
+ vmlal.u32 @ACC[0],$Ni,${N0}[0]
+ vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
+ vmlal.u32 @ACC[1],$Ni,${N0}[1]
+ vld1.32 {$A0-$A3},[$aptr]!
+ vmlal.u32 @ACC[2],$Ni,${N1}[0]
+ add $bnptr,sp,#8 @ rewind
+ vmlal.u32 @ACC[3],$Ni,${N1}[1]
+ vmlal.u32 @ACC[4],$Ni,${N2}[0]
+ vmlal.u32 @ACC[5],$Ni,${N2}[1]
+ vmlal.u32 @ACC[6],$Ni,${N3}[0]
+ vst1.64 {@ACC[0]},[$toutptr,:128]!
+ vmlal.u32 @ACC[7],$Ni,${N3}[1]
+
+ bne .LNEON_8n_inner
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ add $tinptr,sp,#128
+ vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
+ veor q2,q2,q2 @ $N0-$N1
+ vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
+ veor q3,q3,q3 @ $N2-$N3
+ vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
+ vst1.64 {@ACC[6]},[$toutptr,:128]
+
+ subs $outer,$outer,#8
+ vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
+ vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
+ vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
+ vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!
+
+ itt ne
+ subne $nptr,$nptr,$num,lsl#2 @ rewind
+ bne .LNEON_8n_outer
+
+ add $toutptr,sp,#128
+ vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
+ vshr.u64 $temp,@ACC[0]#lo,#16
+ vst1.64 {q2-q3},[sp,:256]!
+ vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
+ vst1.64 {q2-q3}, [sp,:256]!
+ vshr.u64 $temp,@ACC[0]#hi,#16
+ vst1.64 {q2-q3}, [sp,:256]!
+ vzip.16 @ACC[0]#lo,@ACC[0]#hi
+
+ mov $inner,$num
+ b .LNEON_tail_entry
+
+.align 4
+.LNEON_tail:
+ vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
+ vshr.u64 $temp,@ACC[0]#lo,#16
+ vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
+ vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
+ vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
+ vshr.u64 $temp,@ACC[0]#hi,#16
+ vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
+ vzip.16 @ACC[0]#lo,@ACC[0]#hi
+
+.LNEON_tail_entry:
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+ vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
+ vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
+ vshr.u64 $temp,@ACC[1]#lo,#16
+ vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
+ vshr.u64 $temp,@ACC[1]#hi,#16
+ vzip.16 @ACC[1]#lo,@ACC[1]#hi
+___
+ push(@ACC,shift(@ACC));
+}
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
+ subs $inner,$inner,#8
+ vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
+ bne .LNEON_tail
+
+ vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
+ sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
+ subs $aptr,sp,#0 @ clear carry flag
+ add $bptr,sp,$num,lsl#2
+
+.LNEON_sub:
+ ldmia $aptr!, {r4-r7}
+ ldmia $nptr!, {r8-r11}
+ sbcs r8, r4,r8
+ sbcs r9, r5,r9
+ sbcs r10,r6,r10
+ sbcs r11,r7,r11
+ teq $aptr,$bptr @ preserves carry
+ stmia $rptr!, {r8-r11}
+ bne .LNEON_sub
+
+ ldr r10, [$aptr] @ load top-most bit
+ mov r11,sp
+ veor q0,q0,q0
+ sub r11,$bptr,r11 @ this is num*4
+ veor q1,q1,q1
+ mov $aptr,sp
+ sub $rptr,$rptr,r11 @ rewind $rptr
+ mov $nptr,$bptr @ second 3/4th of frame
+ sbcs r10,r10,#0 @ result is carry flag
+
+.LNEON_copy_n_zap:
+ ldmia $aptr!, {r4-r7}
+ ldmia $rptr, {r8-r11}
+ it cc
+ movcc r8, r4
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ itt cc
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ it cc
+ movcc r11,r7
+ ldmia $aptr, {r4-r7}
+ stmia $rptr!, {r8-r11}
+ sub $aptr,$aptr,#16
+ ldmia $rptr, {r8-r11}
+ it cc
+ movcc r8, r4
+ vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
+ itt cc
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ it cc
+ movcc r11,r7
+ teq $aptr,$bptr @ preserves carry
+ stmia $rptr!, {r8-r11}
+ bne .LNEON_copy_n_zap
+
+ mov sp,ip
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r11}
+ ret @ bx lr
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
+ s/\bret\b/bx lr/g or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/armv8-mont.pl b/openssl-1.1.0h/crypto/bn/asm/armv8-mont.pl
new file mode 100755
index 0000000..5d5af1b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/armv8-mont.pl
@@ -0,0 +1,1510 @@
+#! /usr/bin/env perl
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# March 2015
+#
+# "Teaser" Montgomery multiplication module for ARMv8. Needs more
+# work. While it does improve RSA sign performance by 20-30% (less for
+# longer keys) on most processors, for some reason RSA2048 is not
+# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
+# instruction issue rate is limited on processor in question, meaning
+# that dedicated squaring procedure is a must. Well, actually all
+# contemporary AArch64 processors seem to have limited multiplication
+# issue rate, i.e. they can't issue multiplication every cycle, which
+# explains moderate improvement coefficients in comparison to
+# compiler-generated code. Recall that compiler is instructed to use
+# umulh and therefore uses same amount of multiplication instructions
+# to do the job. Assembly's edge is to minimize number of "collateral"
+# instructions and of course instruction scheduling.
+#
+# April 2015
+#
+# Squaring procedure that handles lengths divisible by 8 improves
+# RSA/DSA performance by 25-40-60% depending on processor and key
+# length. Overall improvement coefficients are always positive in
+# comparison to compiler-generated code. On Cortex-A57 improvement
+# is still modest on longest key lengths, while others exhibit e.g.
+# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
+# on Cortex-A57 and ~60-100% faster on others.
+
+$flavour = shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+($lo0,$hi0,$aj,$m0,$alo,$ahi,
+ $lo1,$hi1,$nj,$m1,$nlo,$nhi,
+ $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
+
+# int bn_mul_mont(
+$rp="x0"; # BN_ULONG *rp,
+$ap="x1"; # const BN_ULONG *ap,
+$bp="x2"; # const BN_ULONG *bp,
+$np="x3"; # const BN_ULONG *np,
+$n0="x4"; # const BN_ULONG *n0,
+$num="x5"; # int num);
+
+$code.=<<___;
+.text
+
+.globl bn_mul_mont
+.type bn_mul_mont,%function
+.align 5
+bn_mul_mont:
+ tst $num,#7
+ b.eq __bn_sqr8x_mont
+ tst $num,#3
+ b.eq __bn_mul4x_mont
+.Lmul_mont:
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ ldr $m0,[$bp],#8 // bp[0]
+ sub $tp,sp,$num,lsl#3
+ ldp $hi0,$aj,[$ap],#16 // ap[0..1]
+ lsl $num,$num,#3
+ ldr $n0,[$n0] // *n0
+ and $tp,$tp,#-16 // ABI says so
+ ldp $hi1,$nj,[$np],#16 // np[0..1]
+
+ mul $lo0,$hi0,$m0 // ap[0]*bp[0]
+ sub $j,$num,#16 // j=num-2
+ umulh $hi0,$hi0,$m0
+ mul $alo,$aj,$m0 // ap[1]*bp[0]
+ umulh $ahi,$aj,$m0
+
+ mul $m1,$lo0,$n0 // "tp[0]"*n0
+ mov sp,$tp // alloca
+
+ // (*) mul $lo1,$hi1,$m1 // np[0]*m1
+ umulh $hi1,$hi1,$m1
+ mul $nlo,$nj,$m1 // np[1]*m1
+ // (*) adds $lo1,$lo1,$lo0 // discarded
+ // (*) As for removal of first multiplication and addition
+ // instructions. The outcome of first addition is
+ // guaranteed to be zero, which leaves two computationally
+ // significant outcomes: it either carries or not. Then
+ // question is when does it carry? Is there alternative
+ // way to deduce it? If you follow operations, you can
+ // observe that condition for carry is quite simple:
+ // $lo0 being non-zero. So that carry can be calculated
+ // by adding -1 to $lo0. That's what next instruction does.
+ subs xzr,$lo0,#1 // (*)
+ umulh $nhi,$nj,$m1
+ adc $hi1,$hi1,xzr
+ cbz $j,.L1st_skip
+
+.L1st:
+ ldr $aj,[$ap],#8
+ adds $lo0,$alo,$hi0
+ sub $j,$j,#8 // j--
+ adc $hi0,$ahi,xzr
+
+ ldr $nj,[$np],#8
+ adds $lo1,$nlo,$hi1
+ mul $alo,$aj,$m0 // ap[j]*bp[0]
+ adc $hi1,$nhi,xzr
+ umulh $ahi,$aj,$m0
+
+ adds $lo1,$lo1,$lo0
+ mul $nlo,$nj,$m1 // np[j]*m1
+ adc $hi1,$hi1,xzr
+ umulh $nhi,$nj,$m1
+ str $lo1,[$tp],#8 // tp[j-1]
+ cbnz $j,.L1st
+
+.L1st_skip:
+ adds $lo0,$alo,$hi0
+ sub $ap,$ap,$num // rewind $ap
+ adc $hi0,$ahi,xzr
+
+ adds $lo1,$nlo,$hi1
+ sub $np,$np,$num // rewind $np
+ adc $hi1,$nhi,xzr
+
+ adds $lo1,$lo1,$lo0
+ sub $i,$num,#8 // i=num-1
+ adcs $hi1,$hi1,$hi0
+
+ adc $ovf,xzr,xzr // upmost overflow bit
+ stp $lo1,$hi1,[$tp]
+
+.Louter:
+ ldr $m0,[$bp],#8 // bp[i]
+ ldp $hi0,$aj,[$ap],#16
+ ldr $tj,[sp] // tp[0]
+ add $tp,sp,#8
+
+ mul $lo0,$hi0,$m0 // ap[0]*bp[i]
+ sub $j,$num,#16 // j=num-2
+ umulh $hi0,$hi0,$m0
+ ldp $hi1,$nj,[$np],#16
+ mul $alo,$aj,$m0 // ap[1]*bp[i]
+ adds $lo0,$lo0,$tj
+ umulh $ahi,$aj,$m0
+ adc $hi0,$hi0,xzr
+
+ mul $m1,$lo0,$n0
+ sub $i,$i,#8 // i--
+
+ // (*) mul $lo1,$hi1,$m1 // np[0]*m1
+ umulh $hi1,$hi1,$m1
+ mul $nlo,$nj,$m1 // np[1]*m1
+ // (*) adds $lo1,$lo1,$lo0
+ subs xzr,$lo0,#1 // (*)
+ umulh $nhi,$nj,$m1
+ cbz $j,.Linner_skip
+
+.Linner:
+ ldr $aj,[$ap],#8
+ adc $hi1,$hi1,xzr
+ ldr $tj,[$tp],#8 // tp[j]
+ adds $lo0,$alo,$hi0
+ sub $j,$j,#8 // j--
+ adc $hi0,$ahi,xzr
+
+ adds $lo1,$nlo,$hi1
+ ldr $nj,[$np],#8
+ adc $hi1,$nhi,xzr
+
+ mul $alo,$aj,$m0 // ap[j]*bp[i]
+ adds $lo0,$lo0,$tj
+ umulh $ahi,$aj,$m0
+ adc $hi0,$hi0,xzr
+
+ mul $nlo,$nj,$m1 // np[j]*m1
+ adds $lo1,$lo1,$lo0
+ umulh $nhi,$nj,$m1
+ str $lo1,[$tp,#-16] // tp[j-1]
+ cbnz $j,.Linner
+
+.Linner_skip:
+ ldr $tj,[$tp],#8 // tp[j]
+ adc $hi1,$hi1,xzr
+ adds $lo0,$alo,$hi0
+ sub $ap,$ap,$num // rewind $ap
+ adc $hi0,$ahi,xzr
+
+ adds $lo1,$nlo,$hi1
+ sub $np,$np,$num // rewind $np
+ adcs $hi1,$nhi,$ovf
+ adc $ovf,xzr,xzr
+
+ adds $lo0,$lo0,$tj
+ adc $hi0,$hi0,xzr
+
+ adds $lo1,$lo1,$lo0
+ adcs $hi1,$hi1,$hi0
+ adc $ovf,$ovf,xzr // upmost overflow bit
+ stp $lo1,$hi1,[$tp,#-16]
+
+ cbnz $i,.Louter
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr $tj,[sp] // tp[0]
+ add $tp,sp,#8
+ ldr $nj,[$np],#8 // np[0]
+ subs $j,$num,#8 // j=num-1 and clear borrow
+ mov $ap,$rp
+.Lsub:
+ sbcs $aj,$tj,$nj // tp[j]-np[j]
+ ldr $tj,[$tp],#8
+ sub $j,$j,#8 // j--
+ ldr $nj,[$np],#8
+ str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
+ cbnz $j,.Lsub
+
+ sbcs $aj,$tj,$nj
+ sbcs $ovf,$ovf,xzr // did it borrow?
+ str $aj,[$ap],#8 // rp[num-1]
+
+ ldr $tj,[sp] // tp[0]
+ add $tp,sp,#8
+ ldr $aj,[$rp],#8 // rp[0]
+ sub $num,$num,#8 // num--
+ nop
+.Lcond_copy:
+ sub $num,$num,#8 // num--
+ csel $nj,$tj,$aj,lo // did it borrow?
+ ldr $tj,[$tp],#8
+ ldr $aj,[$rp],#8
+ str xzr,[$tp,#-16] // wipe tp
+ str $nj,[$rp,#-16]
+ cbnz $num,.Lcond_copy
+
+ csel $nj,$tj,$aj,lo
+ str xzr,[$tp,#-8] // wipe tp
+ str $nj,[$rp,#-8]
+
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldr x29,[sp],#64
+ ret
+.size bn_mul_mont,.-bn_mul_mont
+___
+{
+########################################################################
+# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
+
+my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
+my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
+my ($cnt,$carry,$topmost)=("x27","x28","x30");
+my ($tp,$ap_end,$na0)=($bp,$np,$carry);
+
+$code.=<<___;
+.type __bn_sqr8x_mont,%function
+.align 5
+__bn_sqr8x_mont:
+ cmp $ap,$bp
+ b.ne __bn_mul4x_mont
+.Lsqr8x_mont:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp $rp,$np,[sp,#96] // offload rp and np
+
+ ldp $a0,$a1,[$ap,#8*0]
+ ldp $a2,$a3,[$ap,#8*2]
+ ldp $a4,$a5,[$ap,#8*4]
+ ldp $a6,$a7,[$ap,#8*6]
+
+ sub $tp,sp,$num,lsl#4
+ lsl $num,$num,#3
+ ldr $n0,[$n0] // *n0
+ mov sp,$tp // alloca
+ sub $cnt,$num,#8*8
+ b .Lsqr8x_zero_start
+
+.Lsqr8x_zero:
+ sub $cnt,$cnt,#8*8
+ stp xzr,xzr,[$tp,#8*0]
+ stp xzr,xzr,[$tp,#8*2]
+ stp xzr,xzr,[$tp,#8*4]
+ stp xzr,xzr,[$tp,#8*6]
+.Lsqr8x_zero_start:
+ stp xzr,xzr,[$tp,#8*8]
+ stp xzr,xzr,[$tp,#8*10]
+ stp xzr,xzr,[$tp,#8*12]
+ stp xzr,xzr,[$tp,#8*14]
+ add $tp,$tp,#8*16
+ cbnz $cnt,.Lsqr8x_zero
+
+ add $ap_end,$ap,$num
+ add $ap,$ap,#8*8
+ mov $acc0,xzr
+ mov $acc1,xzr
+ mov $acc2,xzr
+ mov $acc3,xzr
+ mov $acc4,xzr
+ mov $acc5,xzr
+ mov $acc6,xzr
+ mov $acc7,xzr
+ mov $tp,sp
+ str $n0,[x29,#112] // offload n0
+
+ // Multiply everything but a[i]*a[i]
+.align 4
+.Lsqr8x_outer_loop:
+ // a[1]a[0] (i)
+ // a[2]a[0]
+ // a[3]a[0]
+ // a[4]a[0]
+ // a[5]a[0]
+ // a[6]a[0]
+ // a[7]a[0]
+ // a[2]a[1] (ii)
+ // a[3]a[1]
+ // a[4]a[1]
+ // a[5]a[1]
+ // a[6]a[1]
+ // a[7]a[1]
+ // a[3]a[2] (iii)
+ // a[4]a[2]
+ // a[5]a[2]
+ // a[6]a[2]
+ // a[7]a[2]
+ // a[4]a[3] (iv)
+ // a[5]a[3]
+ // a[6]a[3]
+ // a[7]a[3]
+ // a[5]a[4] (v)
+ // a[6]a[4]
+ // a[7]a[4]
+ // a[6]a[5] (vi)
+ // a[7]a[5]
+ // a[7]a[6] (vii)
+
+ mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
+ mul $t1,$a2,$a0
+ mul $t2,$a3,$a0
+ mul $t3,$a4,$a0
+ adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
+ mul $t0,$a5,$a0
+ adcs $acc2,$acc2,$t1
+ mul $t1,$a6,$a0
+ adcs $acc3,$acc3,$t2
+ mul $t2,$a7,$a0
+ adcs $acc4,$acc4,$t3
+ umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
+ adcs $acc5,$acc5,$t0
+ umulh $t0,$a2,$a0
+ adcs $acc6,$acc6,$t1
+ umulh $t1,$a3,$a0
+ adcs $acc7,$acc7,$t2
+ umulh $t2,$a4,$a0
+ stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
+ adc $acc0,xzr,xzr // t[8]
+ adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
+ umulh $t3,$a5,$a0
+ adcs $acc3,$acc3,$t0
+ umulh $t0,$a6,$a0
+ adcs $acc4,$acc4,$t1
+ umulh $t1,$a7,$a0
+ adcs $acc5,$acc5,$t2
+ mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
+ adcs $acc6,$acc6,$t3
+ mul $t3,$a3,$a1
+ adcs $acc7,$acc7,$t0
+ mul $t0,$a4,$a1
+ adc $acc0,$acc0,$t1
+
+ mul $t1,$a5,$a1
+ adds $acc3,$acc3,$t2
+ mul $t2,$a6,$a1
+ adcs $acc4,$acc4,$t3
+ mul $t3,$a7,$a1
+ adcs $acc5,$acc5,$t0
+ umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
+ adcs $acc6,$acc6,$t1
+ umulh $t1,$a3,$a1
+ adcs $acc7,$acc7,$t2
+ umulh $t2,$a4,$a1
+ adcs $acc0,$acc0,$t3
+ umulh $t3,$a5,$a1
+ stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
+ adc $acc1,xzr,xzr // t[9]
+ adds $acc4,$acc4,$t0
+ umulh $t0,$a6,$a1
+ adcs $acc5,$acc5,$t1
+ umulh $t1,$a7,$a1
+ adcs $acc6,$acc6,$t2
+ mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
+ adcs $acc7,$acc7,$t3
+ mul $t3,$a4,$a2
+ adcs $acc0,$acc0,$t0
+ mul $t0,$a5,$a2
+ adc $acc1,$acc1,$t1
+
+ mul $t1,$a6,$a2
+ adds $acc5,$acc5,$t2
+ mul $t2,$a7,$a2
+ adcs $acc6,$acc6,$t3
+ umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
+ adcs $acc7,$acc7,$t0
+ umulh $t0,$a4,$a2
+ adcs $acc0,$acc0,$t1
+ umulh $t1,$a5,$a2
+ adcs $acc1,$acc1,$t2
+ umulh $t2,$a6,$a2
+ stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
+ adc $acc2,xzr,xzr // t[10]
+ adds $acc6,$acc6,$t3
+ umulh $t3,$a7,$a2
+ adcs $acc7,$acc7,$t0
+ mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
+ adcs $acc0,$acc0,$t1
+ mul $t1,$a5,$a3
+ adcs $acc1,$acc1,$t2
+ mul $t2,$a6,$a3
+ adc $acc2,$acc2,$t3
+
+ mul $t3,$a7,$a3
+ adds $acc7,$acc7,$t0
+ umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
+ adcs $acc0,$acc0,$t1
+ umulh $t1,$a5,$a3
+ adcs $acc1,$acc1,$t2
+ umulh $t2,$a6,$a3
+ adcs $acc2,$acc2,$t3
+ umulh $t3,$a7,$a3
+ stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
+ adc $acc3,xzr,xzr // t[11]
+ adds $acc0,$acc0,$t0
+ mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
+ adcs $acc1,$acc1,$t1
+ mul $t1,$a6,$a4
+ adcs $acc2,$acc2,$t2
+ mul $t2,$a7,$a4
+ adc $acc3,$acc3,$t3
+
+ umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
+ adds $acc1,$acc1,$t0
+ umulh $t0,$a6,$a4
+ adcs $acc2,$acc2,$t1
+ umulh $t1,$a7,$a4
+ adcs $acc3,$acc3,$t2
+ mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
+ adc $acc4,xzr,xzr // t[12]
+ adds $acc2,$acc2,$t3
+ mul $t3,$a7,$a5
+ adcs $acc3,$acc3,$t0
+ umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
+ adc $acc4,$acc4,$t1
+
+ umulh $t1,$a7,$a5
+ adds $acc3,$acc3,$t2
+ mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
+ adcs $acc4,$acc4,$t3
+ umulh $t3,$a7,$a6 // hi(a[7]*a[6])
+ adc $acc5,xzr,xzr // t[13]
+ adds $acc4,$acc4,$t0
+ sub $cnt,$ap_end,$ap // done yet?
+ adc $acc5,$acc5,$t1
+
+ adds $acc5,$acc5,$t2
+ sub $t0,$ap_end,$num // rewinded ap
+ adc $acc6,xzr,xzr // t[14]
+ add $acc6,$acc6,$t3
+
+ cbz $cnt,.Lsqr8x_outer_break
+
+ mov $n0,$a0
+ ldp $a0,$a1,[$tp,#8*0]
+ ldp $a2,$a3,[$tp,#8*2]
+ ldp $a4,$a5,[$tp,#8*4]
+ ldp $a6,$a7,[$tp,#8*6]
+ adds $acc0,$acc0,$a0
+ adcs $acc1,$acc1,$a1
+ ldp $a0,$a1,[$ap,#8*0]
+ adcs $acc2,$acc2,$a2
+ adcs $acc3,$acc3,$a3
+ ldp $a2,$a3,[$ap,#8*2]
+ adcs $acc4,$acc4,$a4
+ adcs $acc5,$acc5,$a5
+ ldp $a4,$a5,[$ap,#8*4]
+ adcs $acc6,$acc6,$a6
+ mov $rp,$ap
+ adcs $acc7,xzr,$a7
+ ldp $a6,$a7,[$ap,#8*6]
+ add $ap,$ap,#8*8
+ //adc $carry,xzr,xzr // moved below
+ mov $cnt,#-8*8
+
+ // a[8]a[0]
+ // a[9]a[0]
+ // a[a]a[0]
+ // a[b]a[0]
+ // a[c]a[0]
+ // a[d]a[0]
+ // a[e]a[0]
+ // a[f]a[0]
+ // a[8]a[1]
+ // a[f]a[1]........................
+ // a[8]a[2]
+ // a[f]a[2]........................
+ // a[8]a[3]
+ // a[f]a[3]........................
+ // a[8]a[4]
+ // a[f]a[4]........................
+ // a[8]a[5]
+ // a[f]a[5]........................
+ // a[8]a[6]
+ // a[f]a[6]........................
+ // a[8]a[7]
+ // a[f]a[7]........................
+.Lsqr8x_mul:
+ mul $t0,$a0,$n0
+ adc $carry,xzr,xzr // carry bit, modulo-scheduled
+ mul $t1,$a1,$n0
+ add $cnt,$cnt,#8
+ mul $t2,$a2,$n0
+ mul $t3,$a3,$n0
+ adds $acc0,$acc0,$t0
+ mul $t0,$a4,$n0
+ adcs $acc1,$acc1,$t1
+ mul $t1,$a5,$n0
+ adcs $acc2,$acc2,$t2
+ mul $t2,$a6,$n0
+ adcs $acc3,$acc3,$t3
+ mul $t3,$a7,$n0
+ adcs $acc4,$acc4,$t0
+ umulh $t0,$a0,$n0
+ adcs $acc5,$acc5,$t1
+ umulh $t1,$a1,$n0
+ adcs $acc6,$acc6,$t2
+ umulh $t2,$a2,$n0
+ adcs $acc7,$acc7,$t3
+ umulh $t3,$a3,$n0
+ adc $carry,$carry,xzr
+ str $acc0,[$tp],#8
+ adds $acc0,$acc1,$t0
+ umulh $t0,$a4,$n0
+ adcs $acc1,$acc2,$t1
+ umulh $t1,$a5,$n0
+ adcs $acc2,$acc3,$t2
+ umulh $t2,$a6,$n0
+ adcs $acc3,$acc4,$t3
+ umulh $t3,$a7,$n0
+ ldr $n0,[$rp,$cnt]
+ adcs $acc4,$acc5,$t0
+ adcs $acc5,$acc6,$t1
+ adcs $acc6,$acc7,$t2
+ adcs $acc7,$carry,$t3
+ //adc $carry,xzr,xzr // moved above
+ cbnz $cnt,.Lsqr8x_mul
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ cmp $ap,$ap_end // done yet?
+ b.eq .Lsqr8x_break
+
+ ldp $a0,$a1,[$tp,#8*0]
+ ldp $a2,$a3,[$tp,#8*2]
+ ldp $a4,$a5,[$tp,#8*4]
+ ldp $a6,$a7,[$tp,#8*6]
+ adds $acc0,$acc0,$a0
+ ldr $n0,[$rp,#-8*8]
+ adcs $acc1,$acc1,$a1
+ ldp $a0,$a1,[$ap,#8*0]
+ adcs $acc2,$acc2,$a2
+ adcs $acc3,$acc3,$a3
+ ldp $a2,$a3,[$ap,#8*2]
+ adcs $acc4,$acc4,$a4
+ adcs $acc5,$acc5,$a5
+ ldp $a4,$a5,[$ap,#8*4]
+ adcs $acc6,$acc6,$a6
+ mov $cnt,#-8*8
+ adcs $acc7,$acc7,$a7
+ ldp $a6,$a7,[$ap,#8*6]
+ add $ap,$ap,#8*8
+ //adc $carry,xzr,xzr // moved above
+ b .Lsqr8x_mul
+
+.align 4
+.Lsqr8x_break:
+ ldp $a0,$a1,[$rp,#8*0]
+ add $ap,$rp,#8*8
+ ldp $a2,$a3,[$rp,#8*2]
+ sub $t0,$ap_end,$ap // is it last iteration?
+ ldp $a4,$a5,[$rp,#8*4]
+ sub $t1,$tp,$t0
+ ldp $a6,$a7,[$rp,#8*6]
+ cbz $t0,.Lsqr8x_outer_loop
+
+ stp $acc0,$acc1,[$tp,#8*0]
+ ldp $acc0,$acc1,[$t1,#8*0]
+ stp $acc2,$acc3,[$tp,#8*2]
+ ldp $acc2,$acc3,[$t1,#8*2]
+ stp $acc4,$acc5,[$tp,#8*4]
+ ldp $acc4,$acc5,[$t1,#8*4]
+ stp $acc6,$acc7,[$tp,#8*6]
+ mov $tp,$t1
+ ldp $acc6,$acc7,[$t1,#8*6]
+ b .Lsqr8x_outer_loop
+
+.align 4
+.Lsqr8x_outer_break:
+ // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+ ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
+ ldp $t1,$t2,[sp,#8*1]
+ ldp $a5,$a7,[$t0,#8*2]
+ add $ap,$t0,#8*4
+ ldp $t3,$t0,[sp,#8*3]
+
+ stp $acc0,$acc1,[$tp,#8*0]
+ mul $acc0,$a1,$a1
+ stp $acc2,$acc3,[$tp,#8*2]
+ umulh $a1,$a1,$a1
+ stp $acc4,$acc5,[$tp,#8*4]
+ mul $a2,$a3,$a3
+ stp $acc6,$acc7,[$tp,#8*6]
+ mov $tp,sp
+ umulh $a3,$a3,$a3
+ adds $acc1,$a1,$t1,lsl#1
+ extr $t1,$t2,$t1,#63
+ sub $cnt,$num,#8*4
+
+.Lsqr4x_shift_n_add:
+ adcs $acc2,$a2,$t1
+ extr $t2,$t3,$t2,#63
+ sub $cnt,$cnt,#8*4
+ adcs $acc3,$a3,$t2
+ ldp $t1,$t2,[$tp,#8*5]
+ mul $a4,$a5,$a5
+ ldp $a1,$a3,[$ap],#8*2
+ umulh $a5,$a5,$a5
+ mul $a6,$a7,$a7
+ umulh $a7,$a7,$a7
+ extr $t3,$t0,$t3,#63
+ stp $acc0,$acc1,[$tp,#8*0]
+ adcs $acc4,$a4,$t3
+ extr $t0,$t1,$t0,#63
+ stp $acc2,$acc3,[$tp,#8*2]
+ adcs $acc5,$a5,$t0
+ ldp $t3,$t0,[$tp,#8*7]
+ extr $t1,$t2,$t1,#63
+ adcs $acc6,$a6,$t1
+ extr $t2,$t3,$t2,#63
+ adcs $acc7,$a7,$t2
+ ldp $t1,$t2,[$tp,#8*9]
+ mul $a0,$a1,$a1
+ ldp $a5,$a7,[$ap],#8*2
+ umulh $a1,$a1,$a1
+ mul $a2,$a3,$a3
+ umulh $a3,$a3,$a3
+ stp $acc4,$acc5,[$tp,#8*4]
+ extr $t3,$t0,$t3,#63
+ stp $acc6,$acc7,[$tp,#8*6]
+ add $tp,$tp,#8*8
+ adcs $acc0,$a0,$t3
+ extr $t0,$t1,$t0,#63
+ adcs $acc1,$a1,$t0
+ ldp $t3,$t0,[$tp,#8*3]
+ extr $t1,$t2,$t1,#63
+ cbnz $cnt,.Lsqr4x_shift_n_add
+___
+my ($np,$np_end)=($ap,$ap_end);
+$code.=<<___;
+ ldp $np,$n0,[x29,#104] // pull np and n0
+
+ adcs $acc2,$a2,$t1
+ extr $t2,$t3,$t2,#63
+ adcs $acc3,$a3,$t2
+ ldp $t1,$t2,[$tp,#8*5]
+ mul $a4,$a5,$a5
+ umulh $a5,$a5,$a5
+ stp $acc0,$acc1,[$tp,#8*0]
+ mul $a6,$a7,$a7
+ umulh $a7,$a7,$a7
+ stp $acc2,$acc3,[$tp,#8*2]
+ extr $t3,$t0,$t3,#63
+ adcs $acc4,$a4,$t3
+ extr $t0,$t1,$t0,#63
+ ldp $acc0,$acc1,[sp,#8*0]
+ adcs $acc5,$a5,$t0
+ extr $t1,$t2,$t1,#63
+ ldp $a0,$a1,[$np,#8*0]
+ adcs $acc6,$a6,$t1
+ extr $t2,xzr,$t2,#63
+ ldp $a2,$a3,[$np,#8*2]
+ adc $acc7,$a7,$t2
+ ldp $a4,$a5,[$np,#8*4]
+
+ // Reduce by 512 bits per iteration
+ mul $na0,$n0,$acc0 // t[0]*n0
+ ldp $a6,$a7,[$np,#8*6]
+ add $np_end,$np,$num
+ ldp $acc2,$acc3,[sp,#8*2]
+ stp $acc4,$acc5,[$tp,#8*4]
+ ldp $acc4,$acc5,[sp,#8*4]
+ stp $acc6,$acc7,[$tp,#8*6]
+ ldp $acc6,$acc7,[sp,#8*6]
+ add $np,$np,#8*8
+ mov $topmost,xzr // initial top-most carry
+ mov $tp,sp
+ mov $cnt,#8
+
+.Lsqr8x_reduction:
+ // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
+ mul $t1,$a1,$na0
+ sub $cnt,$cnt,#1
+ mul $t2,$a2,$na0
+ str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
+ mul $t3,$a3,$na0
+ // (*) adds xzr,$acc0,$t0
+ subs xzr,$acc0,#1 // (*)
+ mul $t0,$a4,$na0
+ adcs $acc0,$acc1,$t1
+ mul $t1,$a5,$na0
+ adcs $acc1,$acc2,$t2
+ mul $t2,$a6,$na0
+ adcs $acc2,$acc3,$t3
+ mul $t3,$a7,$na0
+ adcs $acc3,$acc4,$t0
+ umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
+ adcs $acc4,$acc5,$t1
+ umulh $t1,$a1,$na0
+ adcs $acc5,$acc6,$t2
+ umulh $t2,$a2,$na0
+ adcs $acc6,$acc7,$t3
+ umulh $t3,$a3,$na0
+ adc $acc7,xzr,xzr
+ adds $acc0,$acc0,$t0
+ umulh $t0,$a4,$na0
+ adcs $acc1,$acc1,$t1
+ umulh $t1,$a5,$na0
+ adcs $acc2,$acc2,$t2
+ umulh $t2,$a6,$na0
+ adcs $acc3,$acc3,$t3
+ umulh $t3,$a7,$na0
+ mul $na0,$n0,$acc0 // next t[0]*n0
+ adcs $acc4,$acc4,$t0
+ adcs $acc5,$acc5,$t1
+ adcs $acc6,$acc6,$t2
+ adc $acc7,$acc7,$t3
+ cbnz $cnt,.Lsqr8x_reduction
+
+ ldp $t0,$t1,[$tp,#8*0]
+ ldp $t2,$t3,[$tp,#8*2]
+ mov $rp,$tp
+ sub $cnt,$np_end,$np // done yet?
+ adds $acc0,$acc0,$t0
+ adcs $acc1,$acc1,$t1
+ ldp $t0,$t1,[$tp,#8*4]
+ adcs $acc2,$acc2,$t2
+ adcs $acc3,$acc3,$t3
+ ldp $t2,$t3,[$tp,#8*6]
+ adcs $acc4,$acc4,$t0
+ adcs $acc5,$acc5,$t1
+ adcs $acc6,$acc6,$t2
+ adcs $acc7,$acc7,$t3
+ //adc $carry,xzr,xzr // moved below
+ cbz $cnt,.Lsqr8x8_post_condition
+
+ ldr $n0,[$tp,#-8*8]
+ ldp $a0,$a1,[$np,#8*0]
+ ldp $a2,$a3,[$np,#8*2]
+ ldp $a4,$a5,[$np,#8*4]
+ mov $cnt,#-8*8
+ ldp $a6,$a7,[$np,#8*6]
+ add $np,$np,#8*8
+
+.Lsqr8x_tail:
+ mul $t0,$a0,$n0
+ adc $carry,xzr,xzr // carry bit, modulo-scheduled
+ mul $t1,$a1,$n0
+ add $cnt,$cnt,#8
+ mul $t2,$a2,$n0
+ mul $t3,$a3,$n0
+ adds $acc0,$acc0,$t0
+ mul $t0,$a4,$n0
+ adcs $acc1,$acc1,$t1
+ mul $t1,$a5,$n0
+ adcs $acc2,$acc2,$t2
+ mul $t2,$a6,$n0
+ adcs $acc3,$acc3,$t3
+ mul $t3,$a7,$n0
+ adcs $acc4,$acc4,$t0
+ umulh $t0,$a0,$n0
+ adcs $acc5,$acc5,$t1
+ umulh $t1,$a1,$n0
+ adcs $acc6,$acc6,$t2
+ umulh $t2,$a2,$n0
+ adcs $acc7,$acc7,$t3
+ umulh $t3,$a3,$n0
+ adc $carry,$carry,xzr
+ str $acc0,[$tp],#8
+ adds $acc0,$acc1,$t0
+ umulh $t0,$a4,$n0
+ adcs $acc1,$acc2,$t1
+ umulh $t1,$a5,$n0
+ adcs $acc2,$acc3,$t2
+ umulh $t2,$a6,$n0
+ adcs $acc3,$acc4,$t3
+ umulh $t3,$a7,$n0
+ ldr $n0,[$rp,$cnt]
+ adcs $acc4,$acc5,$t0
+ adcs $acc5,$acc6,$t1
+ adcs $acc6,$acc7,$t2
+ adcs $acc7,$carry,$t3
+ //adc $carry,xzr,xzr // moved above
+ cbnz $cnt,.Lsqr8x_tail
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ ldp $a0,$a1,[$tp,#8*0]
+ sub $cnt,$np_end,$np // done yet?
+ sub $t2,$np_end,$num // rewinded np
+ ldp $a2,$a3,[$tp,#8*2]
+ ldp $a4,$a5,[$tp,#8*4]
+ ldp $a6,$a7,[$tp,#8*6]
+ cbz $cnt,.Lsqr8x_tail_break
+
+ ldr $n0,[$rp,#-8*8]
+ adds $acc0,$acc0,$a0
+ adcs $acc1,$acc1,$a1
+ ldp $a0,$a1,[$np,#8*0]
+ adcs $acc2,$acc2,$a2
+ adcs $acc3,$acc3,$a3
+ ldp $a2,$a3,[$np,#8*2]
+ adcs $acc4,$acc4,$a4
+ adcs $acc5,$acc5,$a5
+ ldp $a4,$a5,[$np,#8*4]
+ adcs $acc6,$acc6,$a6
+ mov $cnt,#-8*8
+ adcs $acc7,$acc7,$a7
+ ldp $a6,$a7,[$np,#8*6]
+ add $np,$np,#8*8
+ //adc $carry,xzr,xzr // moved above
+ b .Lsqr8x_tail
+
+.align 4
+.Lsqr8x_tail_break:
+ ldr $n0,[x29,#112] // pull n0
+ add $cnt,$tp,#8*8 // end of current t[num] window
+
+ subs xzr,$topmost,#1 // "move" top-most carry to carry bit
+ adcs $t0,$acc0,$a0
+ adcs $t1,$acc1,$a1
+ ldp $acc0,$acc1,[$rp,#8*0]
+ adcs $acc2,$acc2,$a2
+ ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
+ adcs $acc3,$acc3,$a3
+ ldp $a2,$a3,[$t2,#8*2]
+ adcs $acc4,$acc4,$a4
+ adcs $acc5,$acc5,$a5
+ ldp $a4,$a5,[$t2,#8*4]
+ adcs $acc6,$acc6,$a6
+ adcs $acc7,$acc7,$a7
+ ldp $a6,$a7,[$t2,#8*6]
+ add $np,$t2,#8*8
+ adc $topmost,xzr,xzr // top-most carry
+ mul $na0,$n0,$acc0
+ stp $t0,$t1,[$tp,#8*0]
+ stp $acc2,$acc3,[$tp,#8*2]
+ ldp $acc2,$acc3,[$rp,#8*2]
+ stp $acc4,$acc5,[$tp,#8*4]
+ ldp $acc4,$acc5,[$rp,#8*4]
+ cmp $cnt,x29 // did we hit the bottom?
+ stp $acc6,$acc7,[$tp,#8*6]
+ mov $tp,$rp // slide the window
+ ldp $acc6,$acc7,[$rp,#8*6]
+ mov $cnt,#8
+ b.ne .Lsqr8x_reduction
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr $rp,[x29,#96] // pull rp
+ add $tp,$tp,#8*8
+ subs $t0,$acc0,$a0
+ sbcs $t1,$acc1,$a1
+ sub $cnt,$num,#8*8
+ mov $ap_end,$rp // $rp copy
+
+.Lsqr8x_sub:
+ sbcs $t2,$acc2,$a2
+ ldp $a0,$a1,[$np,#8*0]
+ sbcs $t3,$acc3,$a3
+ stp $t0,$t1,[$rp,#8*0]
+ sbcs $t0,$acc4,$a4
+ ldp $a2,$a3,[$np,#8*2]
+ sbcs $t1,$acc5,$a5
+ stp $t2,$t3,[$rp,#8*2]
+ sbcs $t2,$acc6,$a6
+ ldp $a4,$a5,[$np,#8*4]
+ sbcs $t3,$acc7,$a7
+ ldp $a6,$a7,[$np,#8*6]
+ add $np,$np,#8*8
+ ldp $acc0,$acc1,[$tp,#8*0]
+ sub $cnt,$cnt,#8*8
+ ldp $acc2,$acc3,[$tp,#8*2]
+ ldp $acc4,$acc5,[$tp,#8*4]
+ ldp $acc6,$acc7,[$tp,#8*6]
+ add $tp,$tp,#8*8
+ stp $t0,$t1,[$rp,#8*4]
+ sbcs $t0,$acc0,$a0
+ stp $t2,$t3,[$rp,#8*6]
+ add $rp,$rp,#8*8
+ sbcs $t1,$acc1,$a1
+ cbnz $cnt,.Lsqr8x_sub
+
+ sbcs $t2,$acc2,$a2
+ mov $tp,sp
+ add $ap,sp,$num
+ ldp $a0,$a1,[$ap_end,#8*0]
+ sbcs $t3,$acc3,$a3
+ stp $t0,$t1,[$rp,#8*0]
+ sbcs $t0,$acc4,$a4
+ ldp $a2,$a3,[$ap_end,#8*2]
+ sbcs $t1,$acc5,$a5
+ stp $t2,$t3,[$rp,#8*2]
+ sbcs $t2,$acc6,$a6
+ ldp $acc0,$acc1,[$ap,#8*0]
+ sbcs $t3,$acc7,$a7
+ ldp $acc2,$acc3,[$ap,#8*2]
+ sbcs xzr,$topmost,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+ stp $t0,$t1,[$rp,#8*4]
+ stp $t2,$t3,[$rp,#8*6]
+
+ sub $cnt,$num,#8*4
+.Lsqr4x_cond_copy:
+ sub $cnt,$cnt,#8*4
+ csel $t0,$acc0,$a0,lo
+ stp xzr,xzr,[$tp,#8*0]
+ csel $t1,$acc1,$a1,lo
+ ldp $a0,$a1,[$ap_end,#8*4]
+ ldp $acc0,$acc1,[$ap,#8*4]
+ csel $t2,$acc2,$a2,lo
+ stp xzr,xzr,[$tp,#8*2]
+ add $tp,$tp,#8*4
+ csel $t3,$acc3,$a3,lo
+ ldp $a2,$a3,[$ap_end,#8*6]
+ ldp $acc2,$acc3,[$ap,#8*6]
+ add $ap,$ap,#8*4
+ stp $t0,$t1,[$ap_end,#8*0]
+ stp $t2,$t3,[$ap_end,#8*2]
+ add $ap_end,$ap_end,#8*4
+ stp xzr,xzr,[$ap,#8*0]
+ stp xzr,xzr,[$ap,#8*2]
+ cbnz $cnt,.Lsqr4x_cond_copy
+
+ csel $t0,$acc0,$a0,lo
+ stp xzr,xzr,[$tp,#8*0]
+ csel $t1,$acc1,$a1,lo
+ stp xzr,xzr,[$tp,#8*2]
+ csel $t2,$acc2,$a2,lo
+ csel $t3,$acc3,$a3,lo
+ stp $t0,$t1,[$ap_end,#8*0]
+ stp $t2,$t3,[$ap_end,#8*2]
+
+ b .Lsqr8x_done
+
+.align 4
+.Lsqr8x8_post_condition:
+ adc $carry,xzr,xzr
+ ldr x30,[x29,#8] // pull return address
+ // $acc0-7,$carry hold result, $a0-7 hold modulus
+ subs $a0,$acc0,$a0
+ ldr $ap,[x29,#96] // pull rp
+ sbcs $a1,$acc1,$a1
+ stp xzr,xzr,[sp,#8*0]
+ sbcs $a2,$acc2,$a2
+ stp xzr,xzr,[sp,#8*2]
+ sbcs $a3,$acc3,$a3
+ stp xzr,xzr,[sp,#8*4]
+ sbcs $a4,$acc4,$a4
+ stp xzr,xzr,[sp,#8*6]
+ sbcs $a5,$acc5,$a5
+ stp xzr,xzr,[sp,#8*8]
+ sbcs $a6,$acc6,$a6
+ stp xzr,xzr,[sp,#8*10]
+ sbcs $a7,$acc7,$a7
+ stp xzr,xzr,[sp,#8*12]
+ sbcs $carry,$carry,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*14]
+
+ // $a0-7 hold result-modulus
+ csel $a0,$acc0,$a0,lo
+ csel $a1,$acc1,$a1,lo
+ csel $a2,$acc2,$a2,lo
+ csel $a3,$acc3,$a3,lo
+ stp $a0,$a1,[$ap,#8*0]
+ csel $a4,$acc4,$a4,lo
+ csel $a5,$acc5,$a5,lo
+ stp $a2,$a3,[$ap,#8*2]
+ csel $a6,$acc6,$a6,lo
+ csel $a7,$acc7,$a7,lo
+ stp $a4,$a5,[$ap,#8*4]
+ stp $a6,$a7,[$ap,#8*6]
+
+.Lsqr8x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ ret
+.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
+___
+}
+
+{
+########################################################################
+# Even though this might look as ARMv8 adaptation of mulx4x_mont from
+# x86_64-mont5 module, it's different in sense that it performs
+# reduction 256 bits at a time.
+
+my ($a0,$a1,$a2,$a3,
+ $t0,$t1,$t2,$t3,
+ $m0,$m1,$m2,$m3,
+ $acc0,$acc1,$acc2,$acc3,$acc4,
+ $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
+my $bp_end=$rp;
+my ($carry,$topmost) = ($rp,"x30");
+
+$code.=<<___;
+.type __bn_mul4x_mont,%function
+.align 5
+__bn_mul4x_mont:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ sub $tp,sp,$num,lsl#3
+ lsl $num,$num,#3
+ ldr $n0,[$n0] // *n0
+ sub sp,$tp,#8*4 // alloca
+
+ add $t0,$bp,$num
+ add $ap_end,$ap,$num
+ stp $rp,$t0,[x29,#96] // offload rp and &b[num]
+
+ ldr $bi,[$bp,#8*0] // b[0]
+ ldp $a0,$a1,[$ap,#8*0] // a[0..3]
+ ldp $a2,$a3,[$ap,#8*2]
+ add $ap,$ap,#8*4
+ mov $acc0,xzr
+ mov $acc1,xzr
+ mov $acc2,xzr
+ mov $acc3,xzr
+ ldp $m0,$m1,[$np,#8*0] // n[0..3]
+ ldp $m2,$m3,[$np,#8*2]
+ adds $np,$np,#8*4 // clear carry bit
+ mov $carry,xzr
+ mov $cnt,#0
+ mov $tp,sp
+
+.Loop_mul4x_1st_reduction:
+ mul $t0,$a0,$bi // lo(a[0..3]*b[0])
+ adc $carry,$carry,xzr // modulo-scheduled
+ mul $t1,$a1,$bi
+ add $cnt,$cnt,#8
+ mul $t2,$a2,$bi
+ and $cnt,$cnt,#31
+ mul $t3,$a3,$bi
+ adds $acc0,$acc0,$t0
+ umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
+ adcs $acc1,$acc1,$t1
+ mul $mi,$acc0,$n0 // t[0]*n0
+ adcs $acc2,$acc2,$t2
+ umulh $t1,$a1,$bi
+ adcs $acc3,$acc3,$t3
+ umulh $t2,$a2,$bi
+ adc $acc4,xzr,xzr
+ umulh $t3,$a3,$bi
+ ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
+ adds $acc1,$acc1,$t0
+ // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
+ str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
+ adcs $acc2,$acc2,$t1
+ mul $t1,$m1,$mi
+ adcs $acc3,$acc3,$t2
+ mul $t2,$m2,$mi
+ adc $acc4,$acc4,$t3 // can't overflow
+ mul $t3,$m3,$mi
+ // (*) adds xzr,$acc0,$t0
+ subs xzr,$acc0,#1 // (*)
+ umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
+ adcs $acc0,$acc1,$t1
+ umulh $t1,$m1,$mi
+ adcs $acc1,$acc2,$t2
+ umulh $t2,$m2,$mi
+ adcs $acc2,$acc3,$t3
+ umulh $t3,$m3,$mi
+ adcs $acc3,$acc4,$carry
+ adc $carry,xzr,xzr
+ adds $acc0,$acc0,$t0
+ sub $t0,$ap_end,$ap
+ adcs $acc1,$acc1,$t1
+ adcs $acc2,$acc2,$t2
+ adcs $acc3,$acc3,$t3
+ //adc $carry,$carry,xzr
+ cbnz $cnt,.Loop_mul4x_1st_reduction
+
+ cbz $t0,.Lmul4x4_post_condition
+
+ ldp $a0,$a1,[$ap,#8*0] // a[4..7]
+ ldp $a2,$a3,[$ap,#8*2]
+ add $ap,$ap,#8*4
+ ldr $mi,[sp] // a[0]*n0
+ ldp $m0,$m1,[$np,#8*0] // n[4..7]
+ ldp $m2,$m3,[$np,#8*2]
+ add $np,$np,#8*4
+
+.Loop_mul4x_1st_tail:
+ mul $t0,$a0,$bi // lo(a[4..7]*b[i])
+ adc $carry,$carry,xzr // modulo-scheduled
+ mul $t1,$a1,$bi
+ add $cnt,$cnt,#8
+ mul $t2,$a2,$bi
+ and $cnt,$cnt,#31
+ mul $t3,$a3,$bi
+ adds $acc0,$acc0,$t0
+ umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
+ adcs $acc1,$acc1,$t1
+ umulh $t1,$a1,$bi
+ adcs $acc2,$acc2,$t2
+ umulh $t2,$a2,$bi
+ adcs $acc3,$acc3,$t3
+ umulh $t3,$a3,$bi
+ adc $acc4,xzr,xzr
+ ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
+ adds $acc1,$acc1,$t0
+ mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
+ adcs $acc2,$acc2,$t1
+ mul $t1,$m1,$mi
+ adcs $acc3,$acc3,$t2
+ mul $t2,$m2,$mi
+ adc $acc4,$acc4,$t3 // can't overflow
+ mul $t3,$m3,$mi
+ adds $acc0,$acc0,$t0
+ umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
+ adcs $acc1,$acc1,$t1
+ umulh $t1,$m1,$mi
+ adcs $acc2,$acc2,$t2
+ umulh $t2,$m2,$mi
+ adcs $acc3,$acc3,$t3
+ adcs $acc4,$acc4,$carry
+ umulh $t3,$m3,$mi
+ adc $carry,xzr,xzr
+ ldr $mi,[sp,$cnt] // next t[0]*n0
+ str $acc0,[$tp],#8 // result!!!
+ adds $acc0,$acc1,$t0
+ sub $t0,$ap_end,$ap // done yet?
+ adcs $acc1,$acc2,$t1
+ adcs $acc2,$acc3,$t2
+ adcs $acc3,$acc4,$t3
+ //adc $carry,$carry,xzr
+ cbnz $cnt,.Loop_mul4x_1st_tail
+
+ sub $t1,$ap_end,$num // rewinded $ap
+ cbz $t0,.Lmul4x_proceed
+
+ ldp $a0,$a1,[$ap,#8*0]
+ ldp $a2,$a3,[$ap,#8*2]
+ add $ap,$ap,#8*4
+ ldp $m0,$m1,[$np,#8*0]
+ ldp $m2,$m3,[$np,#8*2]
+ add $np,$np,#8*4
+ b .Loop_mul4x_1st_tail
+
+.align 5
+.Lmul4x_proceed:
+ ldr $bi,[$bp,#8*4]! // *++b
+ adc $topmost,$carry,xzr
+ ldp $a0,$a1,[$t1,#8*0] // a[0..3]
+ sub $np,$np,$num // rewind np
+ ldp $a2,$a3,[$t1,#8*2]
+ add $ap,$t1,#8*4
+
+ stp $acc0,$acc1,[$tp,#8*0] // result!!!
+ ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
+ stp $acc2,$acc3,[$tp,#8*2] // result!!!
+ ldp $acc2,$acc3,[sp,#8*6]
+
+ ldp $m0,$m1,[$np,#8*0] // n[0..3]
+ mov $tp,sp
+ ldp $m2,$m3,[$np,#8*2]
+ adds $np,$np,#8*4 // clear carry bit
+ mov $carry,xzr
+
+.align 4
+.Loop_mul4x_reduction:
+ mul $t0,$a0,$bi // lo(a[0..3]*b[4])
+ adc $carry,$carry,xzr // modulo-scheduled
+ mul $t1,$a1,$bi
+ add $cnt,$cnt,#8
+ mul $t2,$a2,$bi
+ and $cnt,$cnt,#31
+ mul $t3,$a3,$bi
+ adds $acc0,$acc0,$t0
+ umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
+ adcs $acc1,$acc1,$t1
+ mul $mi,$acc0,$n0 // t[0]*n0
+ adcs $acc2,$acc2,$t2
+ umulh $t1,$a1,$bi
+ adcs $acc3,$acc3,$t3
+ umulh $t2,$a2,$bi
+ adc $acc4,xzr,xzr
+ umulh $t3,$a3,$bi
+ ldr $bi,[$bp,$cnt] // next b[i]
+ adds $acc1,$acc1,$t0
+ // (*) mul $t0,$m0,$mi
+ str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
+ adcs $acc2,$acc2,$t1
+ mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0
+ adcs $acc3,$acc3,$t2
+ mul $t2,$m2,$mi
+ adc $acc4,$acc4,$t3 // can't overflow
+ mul $t3,$m3,$mi
+ // (*) adds xzr,$acc0,$t0
+ subs xzr,$acc0,#1 // (*)
+ umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0
+ adcs $acc0,$acc1,$t1
+ umulh $t1,$m1,$mi
+ adcs $acc1,$acc2,$t2
+ umulh $t2,$m2,$mi
+ adcs $acc2,$acc3,$t3
+ umulh $t3,$m3,$mi
+ adcs $acc3,$acc4,$carry
+ adc $carry,xzr,xzr
+ adds $acc0,$acc0,$t0
+ adcs $acc1,$acc1,$t1
+ adcs $acc2,$acc2,$t2
+ adcs $acc3,$acc3,$t3
+ //adc $carry,$carry,xzr
+ cbnz $cnt,.Loop_mul4x_reduction
+
+ adc $carry,$carry,xzr
+ ldp $t0,$t1,[$tp,#8*4] // t[4..7]
+ ldp $t2,$t3,[$tp,#8*6]
+ ldp $a0,$a1,[$ap,#8*0] // a[4..7]
+ ldp $a2,$a3,[$ap,#8*2]
+ add $ap,$ap,#8*4
+ adds $acc0,$acc0,$t0
+ adcs $acc1,$acc1,$t1
+ adcs $acc2,$acc2,$t2
+ adcs $acc3,$acc3,$t3
+ //adc $carry,$carry,xzr
+
+ ldr $mi,[sp] // t[0]*n0
+ ldp $m0,$m1,[$np,#8*0] // n[4..7]
+ ldp $m2,$m3,[$np,#8*2]
+ add $np,$np,#8*4
+
+.align 4
+.Loop_mul4x_tail:
+ mul $t0,$a0,$bi // lo(a[4..7]*b[4])
+ adc $carry,$carry,xzr // modulo-scheduled
+ mul $t1,$a1,$bi
+ add $cnt,$cnt,#8
+ mul $t2,$a2,$bi
+ and $cnt,$cnt,#31
+ mul $t3,$a3,$bi
+ adds $acc0,$acc0,$t0
+ umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
+ adcs $acc1,$acc1,$t1
+ umulh $t1,$a1,$bi
+ adcs $acc2,$acc2,$t2
+ umulh $t2,$a2,$bi
+ adcs $acc3,$acc3,$t3
+ umulh $t3,$a3,$bi
+ adc $acc4,xzr,xzr
+ ldr $bi,[$bp,$cnt] // next b[i]
+ adds $acc1,$acc1,$t0
+ mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
+ adcs $acc2,$acc2,$t1
+ mul $t1,$m1,$mi
+ adcs $acc3,$acc3,$t2
+ mul $t2,$m2,$mi
+ adc $acc4,$acc4,$t3 // can't overflow
+ mul $t3,$m3,$mi
+ adds $acc0,$acc0,$t0
+ umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
+ adcs $acc1,$acc1,$t1
+ umulh $t1,$m1,$mi
+ adcs $acc2,$acc2,$t2
+ umulh $t2,$m2,$mi
+ adcs $acc3,$acc3,$t3
+ umulh $t3,$m3,$mi
+ adcs $acc4,$acc4,$carry
+ ldr $mi,[sp,$cnt] // next a[0]*n0
+ adc $carry,xzr,xzr
+ str $acc0,[$tp],#8 // result!!!
+ adds $acc0,$acc1,$t0
+ sub $t0,$ap_end,$ap // done yet?
+ adcs $acc1,$acc2,$t1
+ adcs $acc2,$acc3,$t2
+ adcs $acc3,$acc4,$t3
+ //adc $carry,$carry,xzr
+ cbnz $cnt,.Loop_mul4x_tail
+
+ sub $t1,$np,$num // rewinded np?
+ adc $carry,$carry,xzr
+ cbz $t0,.Loop_mul4x_break
+
+ ldp $t0,$t1,[$tp,#8*4]
+ ldp $t2,$t3,[$tp,#8*6]
+ ldp $a0,$a1,[$ap,#8*0]
+ ldp $a2,$a3,[$ap,#8*2]
+ add $ap,$ap,#8*4
+ adds $acc0,$acc0,$t0
+ adcs $acc1,$acc1,$t1
+ adcs $acc2,$acc2,$t2
+ adcs $acc3,$acc3,$t3
+ //adc $carry,$carry,xzr
+ ldp $m0,$m1,[$np,#8*0]
+ ldp $m2,$m3,[$np,#8*2]
+ add $np,$np,#8*4
+ b .Loop_mul4x_tail
+
+.align 4
+.Loop_mul4x_break:
+ ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
+ adds $acc0,$acc0,$topmost
+ add $bp,$bp,#8*4 // bp++
+ adcs $acc1,$acc1,xzr
+ sub $ap,$ap,$num // rewind ap
+ adcs $acc2,$acc2,xzr
+ stp $acc0,$acc1,[$tp,#8*0] // result!!!
+ adcs $acc3,$acc3,xzr
+ ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
+ adc $topmost,$carry,xzr
+ stp $acc2,$acc3,[$tp,#8*2] // result!!!
+ cmp $bp,$t3 // done yet?
+ ldp $acc2,$acc3,[sp,#8*6]
+ ldp $m0,$m1,[$t1,#8*0] // n[0..3]
+ ldp $m2,$m3,[$t1,#8*2]
+ add $np,$t1,#8*4
+ b.eq .Lmul4x_post
+
+ ldr $bi,[$bp]
+ ldp $a0,$a1,[$ap,#8*0] // a[0..3]
+ ldp $a2,$a3,[$ap,#8*2]
+ adds $ap,$ap,#8*4 // clear carry bit
+ mov $carry,xzr
+ mov $tp,sp
+ b .Loop_mul4x_reduction
+
+.align 4
+.Lmul4x_post:
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ mov $rp,$t2
+ mov $ap_end,$t2 // $rp copy
+ subs $t0,$acc0,$m0
+ add $tp,sp,#8*8
+ sbcs $t1,$acc1,$m1
+ sub $cnt,$num,#8*4
+
+.Lmul4x_sub:
+ sbcs $t2,$acc2,$m2
+ ldp $m0,$m1,[$np,#8*0]
+ sub $cnt,$cnt,#8*4
+ ldp $acc0,$acc1,[$tp,#8*0]
+ sbcs $t3,$acc3,$m3
+ ldp $m2,$m3,[$np,#8*2]
+ add $np,$np,#8*4
+ ldp $acc2,$acc3,[$tp,#8*2]
+ add $tp,$tp,#8*4
+ stp $t0,$t1,[$rp,#8*0]
+ sbcs $t0,$acc0,$m0
+ stp $t2,$t3,[$rp,#8*2]
+ add $rp,$rp,#8*4
+ sbcs $t1,$acc1,$m1
+ cbnz $cnt,.Lmul4x_sub
+
+ sbcs $t2,$acc2,$m2
+ mov $tp,sp
+ add $ap,sp,#8*4
+ ldp $a0,$a1,[$ap_end,#8*0]
+ sbcs $t3,$acc3,$m3
+ stp $t0,$t1,[$rp,#8*0]
+ ldp $a2,$a3,[$ap_end,#8*2]
+ stp $t2,$t3,[$rp,#8*2]
+ ldp $acc0,$acc1,[$ap,#8*0]
+ ldp $acc2,$acc3,[$ap,#8*2]
+ sbcs xzr,$topmost,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+
+ sub $cnt,$num,#8*4
+.Lmul4x_cond_copy:
+ sub $cnt,$cnt,#8*4
+ csel $t0,$acc0,$a0,lo
+ stp xzr,xzr,[$tp,#8*0]
+ csel $t1,$acc1,$a1,lo
+ ldp $a0,$a1,[$ap_end,#8*4]
+ ldp $acc0,$acc1,[$ap,#8*4]
+ csel $t2,$acc2,$a2,lo
+ stp xzr,xzr,[$tp,#8*2]
+ add $tp,$tp,#8*4
+ csel $t3,$acc3,$a3,lo
+ ldp $a2,$a3,[$ap_end,#8*6]
+ ldp $acc2,$acc3,[$ap,#8*6]
+ add $ap,$ap,#8*4
+ stp $t0,$t1,[$ap_end,#8*0]
+ stp $t2,$t3,[$ap_end,#8*2]
+ add $ap_end,$ap_end,#8*4
+ cbnz $cnt,.Lmul4x_cond_copy
+
+ csel $t0,$acc0,$a0,lo
+ stp xzr,xzr,[$tp,#8*0]
+ csel $t1,$acc1,$a1,lo
+ stp xzr,xzr,[$tp,#8*2]
+ csel $t2,$acc2,$a2,lo
+ stp xzr,xzr,[$tp,#8*3]
+ csel $t3,$acc3,$a3,lo
+ stp xzr,xzr,[$tp,#8*4]
+ stp $t0,$t1,[$ap_end,#8*0]
+ stp $t2,$t3,[$ap_end,#8*2]
+
+ b .Lmul4x_done
+
+.align 4
+.Lmul4x4_post_condition:
+ adc $carry,$carry,xzr
+ ldr $ap,[x29,#96] // pull rp
+ // $acc0-3,$carry hold result, $m0-7 hold modulus
+ subs $a0,$acc0,$m0
+ ldr x30,[x29,#8] // pull return address
+ sbcs $a1,$acc1,$m1
+ stp xzr,xzr,[sp,#8*0]
+ sbcs $a2,$acc2,$m2
+ stp xzr,xzr,[sp,#8*2]
+ sbcs $a3,$acc3,$m3
+ stp xzr,xzr,[sp,#8*4]
+ sbcs xzr,$carry,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*6]
+
+ // $a0-3 hold result-modulus
+ csel $a0,$acc0,$a0,lo
+ csel $a1,$acc1,$a1,lo
+ csel $a2,$acc2,$a2,lo
+ csel $a3,$acc3,$a3,lo
+ stp $a0,$a1,[$ap,#8*0]
+ stp $a2,$a3,[$ap,#8*2]
+
+.Lmul4x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ ret
+.size __bn_mul4x_mont,.-__bn_mul4x_mont
+___
+}
+$code.=<<___;
+.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+print $code;
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/bn-586.pl b/openssl-1.1.0h/crypto/bn/asm/bn-586.pl
new file mode 100644
index 0000000..1ca1bbf
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/bn-586.pl
@@ -0,0 +1,785 @@
+#! /usr/bin/env perl
+# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+$output = pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&bn_mul_add_words("bn_mul_add_words");
+&bn_mul_words("bn_mul_words");
+&bn_sqr_words("bn_sqr_words");
+&bn_div_words("bn_div_words");
+&bn_add_words("bn_add_words");
+&bn_sub_words("bn_sub_words");
+&bn_sub_part_words("bn_sub_part_words");
+
+&asm_finish();
+
+close STDOUT;
+
+sub bn_mul_add_words
+ {
+ local($name)=@_;
+
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+ $r="eax";
+ $a="edx";
+ $c="ecx";
+
+ if ($sse2) {
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt(&DWP(0,"eax"),26);
+ &jnc(&label("maw_non_sse2"));
+
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+ &movd("mm0",&wparam(3)); # mm0 = w
+ &pxor("mm1","mm1"); # mm1 = carry_in
+ &jmp(&label("maw_sse2_entry"));
+
+ &set_label("maw_sse2_unrolled",16);
+ &movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
+ &paddq("mm1","mm3"); # mm1 = carry_in + r[0]
+ &movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
+ &pmuludq("mm2","mm0"); # mm2 = w*a[0]
+ &movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
+ &pmuludq("mm4","mm0"); # mm4 = w*a[1]
+ &movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
+ &pmuludq("mm6","mm0"); # mm6 = w*a[2]
+ &movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
+ &pmuludq("mm7","mm0"); # mm7 = w*a[3]
+ &paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
+ &movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
+ &paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
+ &movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
+ &paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
+ &movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
+ &paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
+ &movd(&DWP(0,$r,"",0),"mm1");
+ &movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
+ &pmuludq("mm2","mm0"); # mm2 = w*a[4]
+ &psrlq("mm1",32); # mm1 = carry0
+ &movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
+ &pmuludq("mm4","mm0"); # mm4 = w*a[5]
+ &paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
+ &movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
+ &pmuludq("mm6","mm0"); # mm6 = w*a[6]
+ &movd(&DWP(4,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry1
+ &movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
+ &add($a,32);
+ &pmuludq("mm3","mm0"); # mm3 = w*a[7]
+ &paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
+ &movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
+ &paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
+ &movd(&DWP(8,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry2
+ &paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
+ &movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
+ &paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
+ &movd(&DWP(12,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry3
+ &paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
+ &movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
+ &paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
+ &movd(&DWP(16,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry4
+ &paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
+ &movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
+ &paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
+ &movd(&DWP(20,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry5
+ &paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
+ &movd(&DWP(24,$r,"",0),"mm1");
+ &psrlq("mm1",32); # mm1 = carry6
+ &paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
+ &movd(&DWP(28,$r,"",0),"mm1");
+ &lea($r,&DWP(32,$r));
+ &psrlq("mm1",32); # mm1 = carry_out
+
+ &sub($c,8);
+ &jz(&label("maw_sse2_exit"));
+ &set_label("maw_sse2_entry");
+ &test($c,0xfffffff8);
+ &jnz(&label("maw_sse2_unrolled"));
+
+ &set_label("maw_sse2_loop",4);
+ &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
+ &movd("mm3",&DWP(0,$r)); # mm3 = r[i]
+ &pmuludq("mm2","mm0"); # a[i] *= w
+ &lea($a,&DWP(4,$a));
+ &paddq("mm1","mm3"); # carry += r[i]
+ &paddq("mm1","mm2"); # carry += a[i]*w
+ &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
+ &sub($c,1);
+ &psrlq("mm1",32); # carry = carry_high
+ &lea($r,&DWP(4,$r));
+ &jnz(&label("maw_sse2_loop"));
+ &set_label("maw_sse2_exit");
+ &movd("eax","mm1"); # c = carry_out
+ &emms();
+ &ret();
+
+ &set_label("maw_non_sse2",16);
+ }
+
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
+
+ &comment("");
+ $Low="eax";
+ $High="edx";
+ $a="ebx";
+ $w="ebp";
+ $r="edi";
+ $c="esi";
+
+ &xor($c,$c); # clear carry
+ &mov($r,&wparam(0)); #
+
+ &mov("ecx",&wparam(2)); #
+ &mov($a,&wparam(1)); #
+
+ &and("ecx",0xfffffff8); # num / 8
+ &mov($w,&wparam(3)); #
+
+ &push("ecx"); # Up the stack for a tmp variable
+
+ &jz(&label("maw_finish"));
+
+ &set_label("maw_loop",16);
+
+ for ($i=0; $i<32; $i+=4)
+ {
+ &comment("Round $i");
+
+ &mov("eax",&DWP($i,$a)); # *a
+ &mul($w); # *a * w
+ &add("eax",$c); # L(t)+= c
+ &adc("edx",0); # H(t)+=carry
+ &add("eax",&DWP($i,$r)); # L(t)+= *r
+ &adc("edx",0); # H(t)+=carry
+ &mov(&DWP($i,$r),"eax"); # *r= L(t);
+ &mov($c,"edx"); # c= H(t);
+ }
+
+ &comment("");
+ &sub("ecx",8);
+ &lea($a,&DWP(32,$a));
+ &lea($r,&DWP(32,$r));
+ &jnz(&label("maw_loop"));
+
+ &set_label("maw_finish",0);
+ &mov("ecx",&wparam(2)); # get num
+ &and("ecx",7);
+ &jnz(&label("maw_finish2")); # helps branch prediction
+ &jmp(&label("maw_end"));
+
+ &set_label("maw_finish2",1);
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov("eax",&DWP($i*4,$a)); # *a
+ &mul($w); # *a * w
+ &add("eax",$c); # L(t)+=c
+ &adc("edx",0); # H(t)+=carry
+ &add("eax",&DWP($i*4,$r)); # L(t)+= *r
+ &adc("edx",0); # H(t)+=carry
+ &dec("ecx") if ($i != 7-1);
+ &mov(&DWP($i*4,$r),"eax"); # *r= L(t);
+ &mov($c,"edx"); # c= H(t);
+ &jz(&label("maw_end")) if ($i != 7-1);
+ }
+ &set_label("maw_end",0);
+ &mov("eax",$c);
+
+ &pop("ecx"); # clear variable from
+
+ &function_end($name);
+ }
+
+sub bn_mul_words
+ {
+ local($name)=@_;
+
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+ $r="eax";
+ $a="edx";
+ $c="ecx";
+
+ if ($sse2) {
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt(&DWP(0,"eax"),26);
+ &jnc(&label("mw_non_sse2"));
+
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+ &movd("mm0",&wparam(3)); # mm0 = w
+ &pxor("mm1","mm1"); # mm1 = carry = 0
+
+ &set_label("mw_sse2_loop",16);
+ &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
+ &pmuludq("mm2","mm0"); # a[i] *= w
+ &lea($a,&DWP(4,$a));
+ &paddq("mm1","mm2"); # carry += a[i]*w
+ &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
+ &sub($c,1);
+ &psrlq("mm1",32); # carry = carry_high
+ &lea($r,&DWP(4,$r));
+ &jnz(&label("mw_sse2_loop"));
+
+ &movd("eax","mm1"); # return carry
+ &emms();
+ &ret();
+ &set_label("mw_non_sse2",16);
+ }
+
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
+
+ &comment("");
+ $Low="eax";
+ $High="edx";
+ $a="ebx";
+ $w="ecx";
+ $r="edi";
+ $c="esi";
+ $num="ebp";
+
+ &xor($c,$c); # clear carry
+ &mov($r,&wparam(0)); #
+ &mov($a,&wparam(1)); #
+ &mov($num,&wparam(2)); #
+ &mov($w,&wparam(3)); #
+
+ &and($num,0xfffffff8); # num / 8
+ &jz(&label("mw_finish"));
+
+ &set_label("mw_loop",0);
+ for ($i=0; $i<32; $i+=4)
+ {
+ &comment("Round $i");
+
+ &mov("eax",&DWP($i,$a,"",0)); # *a
+ &mul($w); # *a * w
+ &add("eax",$c); # L(t)+=c
+ # XXX
+
+ &adc("edx",0); # H(t)+=carry
+ &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
+
+ &mov($c,"edx"); # c= H(t);
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($r,32);
+ &sub($num,8);
+ &jz(&label("mw_finish"));
+ &jmp(&label("mw_loop"));
+
+ &set_label("mw_finish",0);
+ &mov($num,&wparam(2)); # get num
+ &and($num,7);
+ &jnz(&label("mw_finish2"));
+ &jmp(&label("mw_end"));
+
+ &set_label("mw_finish2",1);
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov("eax",&DWP($i*4,$a,"",0));# *a
+ &mul($w); # *a * w
+ &add("eax",$c); # L(t)+=c
+ # XXX
+ &adc("edx",0); # H(t)+=carry
+ &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
+ &mov($c,"edx"); # c= H(t);
+ &dec($num) if ($i != 7-1);
+ &jz(&label("mw_end")) if ($i != 7-1);
+ }
+ &set_label("mw_end",0);
+ &mov("eax",$c);
+
+ &function_end($name);
+ }
+
+sub bn_sqr_words
+ {
+ local($name)=@_;
+
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+ $r="eax";
+ $a="edx";
+ $c="ecx";
+
+ if ($sse2) {
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt(&DWP(0,"eax"),26);
+ &jnc(&label("sqr_non_sse2"));
+
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+
+ &set_label("sqr_sse2_loop",16);
+ &movd("mm0",&DWP(0,$a)); # mm0 = a[i]
+ &pmuludq("mm0","mm0"); # a[i] *= a[i]
+ &lea($a,&DWP(4,$a)); # a++
+ &movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
+ &sub($c,1);
+ &lea($r,&DWP(8,$r)); # r += 2
+ &jnz(&label("sqr_sse2_loop"));
+
+ &emms();
+ &ret();
+ &set_label("sqr_non_sse2",16);
+ }
+
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
+
+ &comment("");
+ $r="esi";
+ $a="edi";
+ $num="ebx";
+
+ &mov($r,&wparam(0)); #
+ &mov($a,&wparam(1)); #
+ &mov($num,&wparam(2)); #
+
+ &and($num,0xfffffff8); # num / 8
+ &jz(&label("sw_finish"));
+
+ &set_label("sw_loop",0);
+ for ($i=0; $i<32; $i+=4)
+ {
+ &comment("Round $i");
+ &mov("eax",&DWP($i,$a,"",0)); # *a
+ # XXX
+ &mul("eax"); # *a * *a
+ &mov(&DWP($i*2,$r,"",0),"eax"); #
+ &mov(&DWP($i*2+4,$r,"",0),"edx");#
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($r,64);
+ &sub($num,8);
+ &jnz(&label("sw_loop"));
+
+ &set_label("sw_finish",0);
+ &mov($num,&wparam(2)); # get num
+ &and($num,7);
+ &jz(&label("sw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov("eax",&DWP($i*4,$a,"",0)); # *a
+ # XXX
+ &mul("eax"); # *a * *a
+ &mov(&DWP($i*8,$r,"",0),"eax"); #
+ &dec($num) if ($i != 7-1);
+ &mov(&DWP($i*8+4,$r,"",0),"edx");
+ &jz(&label("sw_end")) if ($i != 7-1);
+ }
+ &set_label("sw_end",0);
+
+ &function_end($name);
+ }
+
+sub bn_div_words
+ {
+ local($name)=@_;
+
+ &function_begin_B($name,"");
+ &mov("edx",&wparam(0)); #
+ &mov("eax",&wparam(1)); #
+ &mov("ecx",&wparam(2)); #
+ &div("ecx");
+ &ret();
+ &function_end_B($name);
+ }
+
+sub bn_add_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $a="esi";
+ $b="edi";
+ $c="eax";
+ $r="ebx";
+ $tmp1="ecx";
+ $tmp2="edx";
+ $num="ebp";
+
+ &mov($r,&wparam(0)); # get r
+ &mov($a,&wparam(1)); # get a
+ &mov($b,&wparam(2)); # get b
+ &mov($num,&wparam(3)); # get num
+ &xor($c,$c); # clear carry
+ &and($num,0xfffffff8); # num / 8
+
+ &jz(&label("aw_finish"));
+
+ &set_label("aw_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &add($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &add($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("aw_loop"));
+
+ &set_label("aw_finish",0);
+ &mov($num,&wparam(3)); # get num
+ &and($num,7);
+ &jz(&label("aw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+ &add($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &add($tmp1,$tmp2);
+ &adc($c,0);
+ &dec($num) if ($i != 6);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &jz(&label("aw_end")) if ($i != 6);
+ }
+ &set_label("aw_end",0);
+
+# &mov("eax",$c); # $c is "eax"
+
+ &function_end($name);
+ }
+
+sub bn_sub_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $a="esi";
+ $b="edi";
+ $c="eax";
+ $r="ebx";
+ $tmp1="ecx";
+ $tmp2="edx";
+ $num="ebp";
+
+ &mov($r,&wparam(0)); # get r
+ &mov($a,&wparam(1)); # get a
+ &mov($b,&wparam(2)); # get b
+ &mov($num,&wparam(3)); # get num
+ &xor($c,$c); # clear carry
+ &and($num,0xfffffff8); # num / 8
+
+ &jz(&label("aw_finish"));
+
+ &set_label("aw_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("aw_loop"));
+
+ &set_label("aw_finish",0);
+ &mov($num,&wparam(3)); # get num
+ &and($num,7);
+ &jz(&label("aw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &dec($num) if ($i != 6);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &jz(&label("aw_end")) if ($i != 6);
+ }
+ &set_label("aw_end",0);
+
+# &mov("eax",$c); # $c is "eax"
+
+ &function_end($name);
+ }
+
+sub bn_sub_part_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $a="esi";
+ $b="edi";
+ $c="eax";
+ $r="ebx";
+ $tmp1="ecx";
+ $tmp2="edx";
+ $num="ebp";
+
+ &mov($r,&wparam(0)); # get r
+ &mov($a,&wparam(1)); # get a
+ &mov($b,&wparam(2)); # get b
+ &mov($num,&wparam(3)); # get num
+ &xor($c,$c); # clear carry
+ &and($num,0xfffffff8); # num / 8
+
+ &jz(&label("aw_finish"));
+
+ &set_label("aw_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("aw_loop"));
+
+ &set_label("aw_finish",0);
+ &mov($num,&wparam(3)); # get num
+ &and($num,7);
+ &jz(&label("aw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov($tmp1,&DWP(0,$a,"",0)); # *a
+ &mov($tmp2,&DWP(0,$b,"",0));# *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP(0,$r,"",0),$tmp1); # *r
+ &add($a, 4);
+ &add($b, 4);
+ &add($r, 4);
+ &dec($num) if ($i != 6);
+ &jz(&label("aw_end")) if ($i != 6);
+ }
+ &set_label("aw_end",0);
+
+ &cmp(&wparam(4),0);
+ &je(&label("pw_end"));
+
+ &mov($num,&wparam(4)); # get dl
+ &cmp($num,0);
+ &je(&label("pw_end"));
+ &jge(&label("pw_pos"));
+
+ &comment("pw_neg");
+ &mov($tmp2,0);
+ &sub($tmp2,$num);
+ &mov($num,$tmp2);
+ &and($num,0xfffffff8); # num / 8
+ &jz(&label("pw_neg_finish"));
+
+ &set_label("pw_neg_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("dl<0 Round $i");
+
+ &mov($tmp1,0);
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("pw_neg_loop"));
+
+ &set_label("pw_neg_finish",0);
+ &mov($tmp2,&wparam(4)); # get dl
+ &mov($num,0);
+ &sub($num,$tmp2);
+ &and($num,7);
+ &jz(&label("pw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("dl<0 Tail Round $i");
+ &mov($tmp1,0);
+ &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &dec($num) if ($i != 6);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &jz(&label("pw_end")) if ($i != 6);
+ }
+
+ &jmp(&label("pw_end"));
+
+ &set_label("pw_pos",0);
+
+ &and($num,0xfffffff8); # num / 8
+ &jz(&label("pw_pos_finish"));
+
+ &set_label("pw_pos_loop",0);
+
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("dl>0 Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &sub($tmp1,$c);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &jnc(&label("pw_nc".$i));
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("pw_pos_loop"));
+
+ &set_label("pw_pos_finish",0);
+ &mov($num,&wparam(4)); # get dl
+ &and($num,7);
+ &jz(&label("pw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("dl>0 Tail Round $i");
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &sub($tmp1,$c);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &jnc(&label("pw_tail_nc".$i));
+ &dec($num) if ($i != 6);
+ &jz(&label("pw_end")) if ($i != 6);
+ }
+ &mov($c,1);
+ &jmp(&label("pw_end"));
+
+ &set_label("pw_nc_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &set_label("pw_nc".$i,0);
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("pw_nc_loop"));
+
+ &mov($num,&wparam(4)); # get dl
+ &and($num,7);
+ &jz(&label("pw_nc_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &set_label("pw_tail_nc".$i,0);
+ &dec($num) if ($i != 6);
+ &jz(&label("pw_nc_end")) if ($i != 6);
+ }
+
+ &set_label("pw_nc_end",0);
+ &mov($c,0);
+
+ &set_label("pw_end",0);
+
+# &mov("eax",$c); # $c is "eax"
+
+ &function_end($name);
+ }
diff --git a/openssl-1.1.0h/crypto/bn/asm/bn-c64xplus.asm b/openssl-1.1.0h/crypto/bn/asm/bn-c64xplus.asm
new file mode 100644
index 0000000..de6d377
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/bn-c64xplus.asm
@@ -0,0 +1,382 @@
+;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+;;
+;; Licensed under the OpenSSL license (the "License"). You may not use
+;; this file except in compliance with the License. You can obtain a copy
+;; in the file LICENSE in the source distribution or at
+;; https://www.openssl.org/source/license.html
+;;
+;;====================================================================
+;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+;; project.
+;;
+;; Rights for redistribution and usage in source and binary forms are
+;; granted according to the OpenSSL license. Warranty of any kind is
+;; disclaimed.
+;;====================================================================
+;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
+;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
+;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
+;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
+;;====================================================================
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .asg bn_mul_add_words,_bn_mul_add_words
+ .asg bn_mul_words,_bn_mul_words
+ .asg bn_sqr_words,_bn_sqr_words
+ .asg bn_add_words,_bn_add_words
+ .asg bn_sub_words,_bn_sub_words
+ .asg bn_div_words,_bn_div_words
+ .asg bn_sqr_comba8,_bn_sqr_comba8
+ .asg bn_mul_comba8,_bn_mul_comba8
+ .asg bn_sqr_comba4,_bn_sqr_comba4
+ .asg bn_mul_comba4,_bn_mul_comba4
+ .endif
+
+ .asg B3,RA
+ .asg A4,ARG0
+ .asg B4,ARG1
+ .asg A6,ARG2
+ .asg B6,ARG3
+ .asg A8,ARG4
+ .asg B8,ARG5
+ .asg A4,RET
+ .asg A15,FP
+ .asg B14,DP
+ .asg B15,SP
+
+ .global _bn_mul_add_words
+_bn_mul_add_words:
+ .asmfunc
+ MV ARG2,B0
+ [!B0] BNOP RA
+||[!B0] MVK 0,RET
+ [B0] MVC B0,ILC
+ [B0] ZERO A19 ; high part of accumulator
+|| [B0] MV ARG0,A2
+|| [B0] MV ARG3,A3
+ NOP 3
+
+ SPLOOP 2 ; 2*n+10
+;;====================================================================
+ LDW *ARG1++,B7 ; ap[i]
+ NOP 3
+ LDW *ARG0++,A7 ; rp[i]
+ MPY32U B7,A3,A17:A16
+ NOP 3 ; [2,0] in epilogue
+ ADDU A16,A7,A21:A20
+ ADDU A19,A21:A20,A19:A18
+|| MV.S A17,A23
+ SPKERNEL 2,1 ; leave slot for "return value"
+|| STW A18,*A2++ ; rp[i]
+|| ADD A19,A23,A19
+;;====================================================================
+ BNOP RA,4
+ MV A19,RET ; return value
+ .endasmfunc
+
+ .global _bn_mul_words
+_bn_mul_words:
+ .asmfunc
+ MV ARG2,B0
+ [!B0] BNOP RA
+||[!B0] MVK 0,RET
+ [B0] MVC B0,ILC
+ [B0] ZERO A19 ; high part of accumulator
+ NOP 3
+
+ SPLOOP 2 ; 2*n+10
+;;====================================================================
+ LDW *ARG1++,A7 ; ap[i]
+ NOP 4
+ MPY32U A7,ARG3,A17:A16
+ NOP 4 ; [2,0] in epiloque
+ ADDU A19,A16,A19:A18
+|| MV.S A17,A21
+ SPKERNEL 2,1 ; leave slot for "return value"
+|| STW A18,*ARG0++ ; rp[i]
+|| ADD.L A19,A21,A19
+;;====================================================================
+ BNOP RA,4
+ MV A19,RET ; return value
+ .endasmfunc
+
+ .global _bn_sqr_words
+_bn_sqr_words:
+ .asmfunc
+ MV ARG2,B0
+ [!B0] BNOP RA
+||[!B0] MVK 0,RET
+ [B0] MVC B0,ILC
+ [B0] MV ARG0,B2
+|| [B0] ADD 4,ARG0,ARG0
+ NOP 3
+
+ SPLOOP 2 ; 2*n+10
+;;====================================================================
+ LDW *ARG1++,B7 ; ap[i]
+ NOP 4
+ MPY32U B7,B7,B1:B0
+ NOP 3 ; [2,0] in epilogue
+ STW B0,*B2++(8) ; rp[2*i]
+ MV B1,A1
+ SPKERNEL 2,0 ; fully overlap BNOP RA,5
+|| STW A1,*ARG0++(8) ; rp[2*i+1]
+;;====================================================================
+ BNOP RA,5
+ .endasmfunc
+
+ .global _bn_add_words
+_bn_add_words:
+ .asmfunc
+ MV ARG3,B0
+ [!B0] BNOP RA
+||[!B0] MVK 0,RET
+ [B0] MVC B0,ILC
+ [B0] ZERO A1 ; carry flag
+|| [B0] MV ARG0,A3
+ NOP 3
+
+ SPLOOP 2 ; 2*n+6
+;;====================================================================
+ LDW *ARG2++,A7 ; bp[i]
+|| LDW *ARG1++,B7 ; ap[i]
+ NOP 4
+ ADDU A7,B7,A9:A8
+ ADDU A1,A9:A8,A1:A0
+ SPKERNEL 0,0 ; fully overlap BNOP RA,5
+|| STW A0,*A3++ ; write result
+|| MV A1,RET ; keep carry flag in RET
+;;====================================================================
+ BNOP RA,5
+ .endasmfunc
+
+ .global _bn_sub_words
+_bn_sub_words:
+ .asmfunc
+ MV ARG3,B0
+ [!B0] BNOP RA
+||[!B0] MVK 0,RET
+ [B0] MVC B0,ILC
+ [B0] ZERO A2 ; borrow flag
+|| [B0] MV ARG0,A3
+ NOP 3
+
+ SPLOOP 2 ; 2*n+6
+;;====================================================================
+ LDW *ARG2++,A7 ; bp[i]
+|| LDW *ARG1++,B7 ; ap[i]
+ NOP 4
+ SUBU B7,A7,A1:A0
+ [A2] SUB A1:A0,1,A1:A0
+ SPKERNEL 0,1 ; leave slot for "return borrow flag"
+|| STW A0,*A3++ ; write result
+|| AND 1,A1,A2 ; pass on borrow flag
+;;====================================================================
+ BNOP RA,4
+ AND 1,A1,RET ; return borrow flag
+ .endasmfunc
+
+ .global _bn_div_words
+_bn_div_words:
+ .asmfunc
+ LMBD 1,A6,A0 ; leading zero bits in dv
+ LMBD 1,A4,A1 ; leading zero bits in hi
+|| MVK 32,B0
+ CMPLTU A1,A0,A2
+|| ADD A0,B0,B0
+ [ A2] BNOP RA
+||[ A2] MVK -1,A4 ; return overflow
+||[!A2] MV A4,A3 ; reassign hi
+ [!A2] MV B4,A4 ; reassign lo, will be quotient
+||[!A2] MVC B0,ILC
+ [!A2] SHL A6,A0,A6 ; normalize dv
+|| MVK 1,A1
+
+ [!A2] CMPLTU A3,A6,A1 ; hi<dv?
+||[!A2] SHL A4,1,A5:A4 ; lo<<1
+ [!A1] SUB A3,A6,A3 ; hi-=dv
+||[!A1] OR 1,A4,A4
+ [!A2] SHRU A3,31,A1 ; upper bit
+||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
+
+ SPLOOP 3
+ [!A1] CMPLTU A3,A6,A1 ; hi<dv?
+||[ A1] ZERO A1
+|| SHL A4,1,A5:A4 ; lo<<1
+ [!A1] SUB A3,A6,A3 ; hi-=dv
+||[!A1] OR 1,A4,A4 ; quotient
+ SHRU A3,31,A1 ; upper bit
+|| ADDAH A5,A3,A3 ; hi<<1|lo>>31
+ SPKERNEL
+
+ BNOP RA,5
+ .endasmfunc
+
+;;====================================================================
+;; Not really Comba algorithm, just straightforward NxM... Dedicated
+;; fully unrolled real Comba implementations are asymptotically 2x
+;; faster, but naturally larger undertaking. Purpose of this exercise
+;; was rather to learn to master nested SPLOOPs...
+;;====================================================================
+ .global _bn_sqr_comba8
+ .global _bn_mul_comba8
+_bn_sqr_comba8:
+ MV ARG1,ARG2
+_bn_mul_comba8:
+ .asmfunc
+ MVK 8,B0 ; N, RILC
+|| MVK 8,A0 ; M, outer loop counter
+|| MV ARG1,A5 ; copy ap
+|| MV ARG0,B4 ; copy rp
+|| ZERO B19 ; high part of accumulator
+ MVC B0,RILC
+|| SUB B0,2,B1 ; N-2, initial ILC
+|| SUB B0,1,B2 ; const B2=N-1
+|| LDW *A5++,B6 ; ap[0]
+|| MV A0,A3 ; const A3=M
+sploopNxM?: ; for best performance arrange M<=N
+ [A0] SPLOOPD 2 ; 2*n+10
+|| MVC B1,ILC
+|| ADDAW B4,B0,B5
+|| ZERO B7
+|| LDW *A5++,A9 ; pre-fetch ap[1]
+|| ZERO A1
+|| SUB A0,1,A0
+;;====================================================================
+;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
+;; This is because of Advisory 15 from TI publication SPRZ247I.
+ LDW *ARG2++,A7 ; bp[i]
+ NOP 3
+ [A1] LDW *B5++,B7 ; rp[i]
+ MPY32U A7,B6,B17:B16
+ NOP 3
+ ADDU B16,B7,B21:B20
+ ADDU B19,B21:B20,B19:B18
+|| MV.S B17,B23
+ SPKERNEL
+|| STW B18,*B4++ ; rp[i]
+|| ADD.S B19,B23,B19
+;;====================================================================
+outer?: ; m*2*(n+1)+10
+ SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
+ SPMASKR
+|| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
+ MVD A9,B6 ; move through .M unit(*)
+ [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
+ SUBAW B5,B2,B5 ; rewind rp to rp[1]
+ MVK 1,A1
+ [A0] BNOP.S1 outer?,4
+|| [A0] SUB.L A0,1,A0
+ STW B19,*B4--[B2] ; rewind rp tp rp[1]
+|| ZERO.S B19 ; high part of accumulator
+;; end of outer?
+ BNOP RA,5 ; return
+ .endasmfunc
+;; (*) It should be noted that B6 is used as input to MPY32U in
+;; chronologically next cycle in *preceding* SPLOOP iteration.
+;; Normally such arrangement would require DINT, but at this
+;; point SPLOOP is draining and interrupts are disabled
+;; implicitly.
+
+ .global _bn_sqr_comba4
+ .global _bn_mul_comba4
+_bn_sqr_comba4:
+ MV ARG1,ARG2
+_bn_mul_comba4:
+ .asmfunc
+ .if 0
+ BNOP sploopNxM?,3
+ ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
+ ;; because of low-counter effect, when prologue phase finishes
+ ;; before SPKERNEL instruction is reached. As result it's 25%
+ ;; slower than expected...
+ MVK 4,B0 ; N, RILC
+|| MVK 4,A0 ; M, outer loop counter
+|| MV ARG1,A5 ; copy ap
+|| MV ARG0,B4 ; copy rp
+|| ZERO B19 ; high part of accumulator
+ MVC B0,RILC
+|| SUB B0,2,B1 ; first ILC
+|| SUB B0,1,B2 ; const B2=N-1
+|| LDW *A5++,B6 ; ap[0]
+|| MV A0,A3 ; const A3=M
+ .else
+ ;; This alternative is an exercise in fully unrolled Comba
+ ;; algorithm implementation that operates at n*(n+1)+12, or
+ ;; as little as 32 cycles...
+ LDW *ARG1[0],B16 ; a[0]
+|| LDW *ARG2[0],A16 ; b[0]
+ LDW *ARG1[1],B17 ; a[1]
+|| LDW *ARG2[1],A17 ; b[1]
+ LDW *ARG1[2],B18 ; a[2]
+|| LDW *ARG2[2],A18 ; b[2]
+ LDW *ARG1[3],B19 ; a[3]
+|| LDW *ARG2[3],A19 ; b[3]
+ NOP
+ MPY32U A16,B16,A1:A0 ; a[0]*b[0]
+ MPY32U A17,B16,A23:A22 ; a[0]*b[1]
+ MPY32U A16,B17,A25:A24 ; a[1]*b[0]
+ MPY32U A16,B18,A27:A26 ; a[2]*b[0]
+ STW A0,*ARG0[0]
+|| MPY32U A17,B17,A29:A28 ; a[1]*b[1]
+ MPY32U A18,B16,A31:A30 ; a[0]*b[2]
+|| ADDU A22,A1,A1:A0
+ MV A23,B0
+|| MPY32U A19,B16,A21:A20 ; a[3]*b[0]
+|| ADDU A24,A1:A0,A1:A0
+ ADDU A25,B0,B1:B0
+|| STW A0,*ARG0[1]
+|| MPY32U A18,B17,A23:A22 ; a[2]*b[1]
+|| ADDU A26,A1,A9:A8
+ ADDU A27,B1,B9:B8
+|| MPY32U A17,B18,A25:A24 ; a[1]*b[2]
+|| ADDU A28,A9:A8,A9:A8
+ ADDU A29,B9:B8,B9:B8
+|| MPY32U A16,B19,A27:A26 ; a[0]*b[3]
+|| ADDU A30,A9:A8,A9:A8
+ ADDU A31,B9:B8,B9:B8
+|| ADDU B0,A9:A8,A9:A8
+ STW A8,*ARG0[2]
+|| ADDU A20,A9,A1:A0
+ ADDU A21,B9,B1:B0
+|| MPY32U A19,B17,A21:A20 ; a[3]*b[1]
+|| ADDU A22,A1:A0,A1:A0
+ ADDU A23,B1:B0,B1:B0
+|| MPY32U A18,B18,A23:A22 ; a[2]*b[2]
+|| ADDU A24,A1:A0,A1:A0
+ ADDU A25,B1:B0,B1:B0
+|| MPY32U A17,B19,A25:A24 ; a[1]*b[3]
+|| ADDU A26,A1:A0,A1:A0
+ ADDU A27,B1:B0,B1:B0
+|| ADDU B8,A1:A0,A1:A0
+ STW A0,*ARG0[3]
+|| MPY32U A19,B18,A27:A26 ; a[3]*b[2]
+|| ADDU A20,A1,A9:A8
+ ADDU A21,B1,B9:B8
+|| MPY32U A18,B19,A29:A28 ; a[2]*b[3]
+|| ADDU A22,A9:A8,A9:A8
+ ADDU A23,B9:B8,B9:B8
+|| MPY32U A19,B19,A31:A30 ; a[3]*b[3]
+|| ADDU A24,A9:A8,A9:A8
+ ADDU A25,B9:B8,B9:B8
+|| ADDU B0,A9:A8,A9:A8
+ STW A8,*ARG0[4]
+|| ADDU A26,A9,A1:A0
+ ADDU A27,B9,B1:B0
+|| ADDU A28,A1:A0,A1:A0
+ ADDU A29,B1:B0,B1:B0
+|| BNOP RA
+|| ADDU B8,A1:A0,A1:A0
+ STW A0,*ARG0[5]
+|| ADDU A30,A1,A9:A8
+ ADD A31,B1,B8
+ ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below
+ ADD B8,A9,A9
+|| STW A8,*ARG0[6]
+ STW A9,*ARG0[7]
+ .endif
+ .endasmfunc
diff --git a/openssl-1.1.0h/crypto/bn/asm/c64xplus-gf2m.pl b/openssl-1.1.0h/crypto/bn/asm/c64xplus-gf2m.pl
new file mode 100644
index 0000000..c0e5400
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/c64xplus-gf2m.pl
@@ -0,0 +1,160 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# February 2012
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
+# C for the time being... The subroutine runs in 37 cycles, which is
+# 4.5x faster than compiler-generated code. Though comparison is
+# totally unfair, because this module utilizes Galois Field Multiply
+# instruction.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8"); # argument vector
+
+($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
+($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
+($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
+($A,$B)=($Alo,$B_1);
+$xFF="B1";
+
+sub mul_1x1_upper {
+my ($A,$B)=@_;
+$code.=<<___;
+ EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
+|| AND $B,$xFF,$B_0
+|| SHRU $B,24,$B_3
+ SHRU $A,16, $Ahi ; smash $A to two halfwords
+|| EXTU $A,16,16,$Alo
+
+ XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits muliplication
+|| XORMPY $Ahi,$B_2,$Ahix2
+|| EXTU $B,16,24,$B_1
+ XORMPY $Alo,$B_0,$Alox0
+|| XORMPY $Ahi,$B_0,$Ahix0
+ XORMPY $Alo,$B_3,$Alox3
+|| XORMPY $Ahi,$B_3,$Ahix3
+ XORMPY $Alo,$B_1,$Alox1
+|| XORMPY $Ahi,$B_1,$Ahix1
+___
+}
+sub mul_1x1_merged {
+my ($OUTlo,$OUThi,$A,$B)=@_;
+$code.=<<___;
+ EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
+|| AND $B,$xFF,$B_0
+|| SHRU $B,24,$B_3
+ SHRU $A,16, $Ahi ; smash $A to two halfwords
+|| EXTU $A,16,16,$Alo
+
+ XOR $Ahix0,$Alox2,$Ahix0
+|| MV $Ahix2,$OUThi
+|| XORMPY $Alo,$B_2,$Alox2
+ XORMPY $Ahi,$B_2,$Ahix2
+|| EXTU $B,16,24,$B_1
+|| XORMPY $Alo,$B_0,A1 ; $Alox0
+ XOR $Ahix1,$Alox3,$Ahix1
+|| SHL $Ahix0,16,$OUTlo
+|| SHRU $Ahix0,16,$Ahix0
+ XOR $Alox0,$OUTlo,$OUTlo
+|| XOR $Ahix0,$OUThi,$OUThi
+|| XORMPY $Ahi,$B_0,$Ahix0
+|| XORMPY $Alo,$B_3,$Alox3
+|| SHL $Alox1,8,$Alox1
+|| SHL $Ahix3,8,$Ahix3
+ XOR $Alox1,$OUTlo,$OUTlo
+|| XOR $Ahix3,$OUThi,$OUThi
+|| XORMPY $Ahi,$B_3,$Ahix3
+|| SHL $Ahix1,24,$Alox1
+|| SHRU $Ahix1,8, $Ahix1
+ XOR $Alox1,$OUTlo,$OUTlo
+|| XOR $Ahix1,$OUThi,$OUThi
+|| XORMPY $Alo,$B_1,$Alox1
+|| XORMPY $Ahi,$B_1,$Ahix1
+|| MV A1,$Alox0
+___
+}
+sub mul_1x1_lower {
+my ($OUTlo,$OUThi)=@_;
+$code.=<<___;
+ ;NOP
+ XOR $Ahix0,$Alox2,$Ahix0
+|| MV $Ahix2,$OUThi
+ NOP
+ XOR $Ahix1,$Alox3,$Ahix1
+|| SHL $Ahix0,16,$OUTlo
+|| SHRU $Ahix0,16,$Ahix0
+ XOR $Alox0,$OUTlo,$OUTlo
+|| XOR $Ahix0,$OUThi,$OUThi
+|| SHL $Alox1,8,$Alox1
+|| SHL $Ahix3,8,$Ahix3
+ XOR $Alox1,$OUTlo,$OUTlo
+|| XOR $Ahix3,$OUThi,$OUThi
+|| SHL $Ahix1,24,$Alox1
+|| SHRU $Ahix1,8, $Ahix1
+ XOR $Alox1,$OUTlo,$OUTlo
+|| XOR $Ahix1,$OUThi,$OUThi
+___
+}
+$code.=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .asg bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
+ .endif
+
+ .global _bn_GF2m_mul_2x2
+_bn_GF2m_mul_2x2:
+ .asmfunc
+ MVK 0xFF,$xFF
+___
+ &mul_1x1_upper($a0,$b0); # a0·b0
+$code.=<<___;
+|| MV $b1,$B
+ MV $a1,$A
+___
+ &mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1
+$code.=<<___;
+|| XOR $b0,$b1,$B
+ XOR $a0,$a1,$A
+___
+ &mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1)
+$code.=<<___;
+ XOR A28,A31,A29
+|| XOR B28,B31,B29 ; a0·b0+a1·b1
+___
+ &mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1)
+$code.=<<___;
+|| BNOP B3
+ XOR A29,A30,A30
+|| XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ XOR B28,A30,A30
+|| STW A28,*${rp}[0]
+ XOR B30,A31,A31
+|| STW A30,*${rp}[1]
+ STW A31,*${rp}[2]
+ STW B31,*${rp}[3]
+ .endasmfunc
+___
+
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/co-586.pl b/openssl-1.1.0h/crypto/bn/asm/co-586.pl
new file mode 100644
index 0000000..60d0363
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/co-586.pl
@@ -0,0 +1,298 @@
+#! /usr/bin/env perl
+# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+$output = pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],$0);
+
+&bn_mul_comba("bn_mul_comba8",8);
+&bn_mul_comba("bn_mul_comba4",4);
+&bn_sqr_comba("bn_sqr_comba8",8);
+&bn_sqr_comba("bn_sqr_comba4",4);
+
+&asm_finish();
+
+close STDOUT;
+
+sub mul_add_c
+ {
+ local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+ # pos == -1 if eax and edx are pre-loaded, 0 to load from next
+ # words, and 1 if load return value
+
+ &comment("mul a[$ai]*b[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$b,"",0));
+
+ &mul("edx");
+ &add($c0,"eax");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
+ &mov("eax",&wparam(0)) if $pos > 0; # load r[]
+ ###
+ &adc($c1,"edx");
+ &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
+ &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
+ ###
+ &adc($c2,0);
+ # is pos > 1, it means it is the last loop
+ &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
+ }
+
+sub sqr_add_c
+ {
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+ # pos == -1 if eax and edx are pre-loaded, 0 to load from next
+ # words, and 1 if load return value
+
+ &comment("sqr a[$ai]*a[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$b,"",0));
+
+ if ($ai == $bi)
+ { &mul("eax");}
+ else
+ { &mul("edx");}
+ &add($c0,"eax");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
+ ###
+ &adc($c1,"edx");
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
+ ###
+ &adc($c2,0);
+ # is pos > 1, it means it is the last loop
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
+ }
+
+sub sqr_add_c2
+ {
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+ # pos == -1 if eax and edx are pre-loaded, 0 to load from next
+ # words, and 1 if load return value
+
+ &comment("sqr a[$ai]*a[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$a,"",0));
+
+ if ($ai == $bi)
+ { &mul("eax");}
+ else
+ { &mul("edx");}
+ &add("eax","eax");
+ ###
+ &adc("edx","edx");
+ ###
+ &adc($c2,0);
+ &add($c0,"eax");
+ &adc($c1,"edx");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
+ &adc($c2,0);
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
+ ###
+ }
+
+sub bn_mul_comba
+ {
+ local($name,$num)=@_;
+ local($a,$b,$c0,$c1,$c2);
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
+ local($tot,$end);
+
+ &function_begin_B($name,"");
+
+ $c0="ebx";
+ $c1="ecx";
+ $c2="ebp";
+ $a="esi";
+ $b="edi";
+
+ $as=0;
+ $ae=0;
+ $bs=0;
+ $be=0;
+ $tot=$num+$num-1;
+
+ &push("esi");
+ &mov($a,&wparam(1));
+ &push("edi");
+ &mov($b,&wparam(2));
+ &push("ebp");
+ &push("ebx");
+
+ &xor($c0,$c0);
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
+ &xor($c1,$c1);
+ &mov("edx",&DWP(0,$b,"",0)); # load the first second
+
+ for ($i=0; $i<$tot; $i++)
+ {
+ $ai=$as;
+ $bi=$bs;
+ $end=$be+1;
+
+ &comment("################## Calculate word $i");
+
+ for ($j=$bs; $j<$end; $j++)
+ {
+ &xor($c2,$c2) if ($j == $bs);
+ if (($j+1) == $end)
+ {
+ $v=1;
+ $v=2 if (($i+1) == $tot);
+ }
+ else
+ { $v=0; }
+ if (($j+1) != $end)
+ {
+ $na=($ai-1);
+ $nb=($bi+1);
+ }
+ else
+ {
+ $na=$as+($i < ($num-1));
+ $nb=$bs+($i >= ($num-1));
+ }
+#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
+ &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
+ if ($v)
+ {
+ &comment("saved r[$i]");
+ # &mov("eax",&wparam(0));
+ # &mov(&DWP($i*4,"eax","",0),$c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ }
+ $ai--;
+ $bi++;
+ }
+ $as++ if ($i < ($num-1));
+ $ae++ if ($i >= ($num-1));
+
+ $bs++ if ($i >= ($num-1));
+ $be++ if ($i < ($num-1));
+ }
+ &comment("save r[$i]");
+ # &mov("eax",&wparam(0));
+ &mov(&DWP($i*4,"eax","",0),$c0);
+
+ &pop("ebx");
+ &pop("ebp");
+ &pop("edi");
+ &pop("esi");
+ &ret();
+ &function_end_B($name);
+ }
+
+sub bn_sqr_comba
+ {
+ local($name,$num)=@_;
+ local($r,$a,$c0,$c1,$c2)=@_;
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
+ local($b,$tot,$end,$half);
+
+ &function_begin_B($name,"");
+
+ $c0="ebx";
+ $c1="ecx";
+ $c2="ebp";
+ $a="esi";
+ $r="edi";
+
+ &push("esi");
+ &push("edi");
+ &push("ebp");
+ &push("ebx");
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &xor($c0,$c0);
+ &xor($c1,$c1);
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
+
+ $as=0;
+ $ae=0;
+ $bs=0;
+ $be=0;
+ $tot=$num+$num-1;
+
+ for ($i=0; $i<$tot; $i++)
+ {
+ $ai=$as;
+ $bi=$bs;
+ $end=$be+1;
+
+ &comment("############### Calculate word $i");
+ for ($j=$bs; $j<$end; $j++)
+ {
+ &xor($c2,$c2) if ($j == $bs);
+ if (($ai-1) < ($bi+1))
+ {
+ $v=1;
+ $v=2 if ($i+1) == $tot;
+ }
+ else
+ { $v=0; }
+ if (!$v)
+ {
+ $na=$ai-1;
+ $nb=$bi+1;
+ }
+ else
+ {
+ $na=$as+($i < ($num-1));
+ $nb=$bs+($i >= ($num-1));
+ }
+ if ($ai == $bi)
+ {
+ &sqr_add_c($r,$a,$ai,$bi,
+ $c0,$c1,$c2,$v,$i,$na,$nb);
+ }
+ else
+ {
+ &sqr_add_c2($r,$a,$ai,$bi,
+ $c0,$c1,$c2,$v,$i,$na,$nb);
+ }
+ if ($v)
+ {
+ &comment("saved r[$i]");
+ #&mov(&DWP($i*4,$r,"",0),$c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ last;
+ }
+ $ai--;
+ $bi++;
+ }
+ $as++ if ($i < ($num-1));
+ $ae++ if ($i >= ($num-1));
+
+ $bs++ if ($i >= ($num-1));
+ $be++ if ($i < ($num-1));
+ }
+ &mov(&DWP($i*4,$r,"",0),$c0);
+ &pop("ebx");
+ &pop("ebp");
+ &pop("edi");
+ &pop("esi");
+ &ret();
+ &function_end_B($name);
+ }
diff --git a/openssl-1.1.0h/crypto/bn/asm/ia64-mont.pl b/openssl-1.1.0h/crypto/bn/asm/ia64-mont.pl
new file mode 100644
index 0000000..5cc5c59
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/ia64-mont.pl
@@ -0,0 +1,860 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2010
+#
+# "Teaser" Montgomery multiplication module for IA-64. There are
+# several possibilities for improvement:
+#
+# - modulo-scheduling outer loop would eliminate quite a number of
+# stalls after ldf8, xma and getf.sig outside inner loop and
+# improve shorter key performance;
+# - shorter vector support [with input vectors being fetched only
+# once] should be added;
+# - 2x unroll with help of n0[1] would make the code scalable on
+# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
+# acute interest, because upcoming Tukwila's individual cores are
+# reportedly based on Itanium 2 design;
+# - dedicated squaring procedure(?);
+#
+# January 2010
+#
+# Shorter vector support is implemented by zero-padding ap and np
+# vectors up to 8 elements, or 512 bits. This means that 256-bit
+# inputs will be processed only 2 times faster than 512-bit inputs,
+# not 4 [as one would expect, because algorithm complexity is n^2].
+# The reason for padding is that inputs shorter than 512 bits won't
+# be processed faster anyway, because minimal critical path of the
+# core loop happens to match 512-bit timing. Either way, it resulted
+# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
+# 1024-bit one [in comparison to original version of *this* module].
+#
+# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
+# this module is:
+# sign verify sign/s verify/s
+# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
+# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
+# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
+# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
+# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
+# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
+# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
+#
+# ... and *without* (but still with ia64.S):
+#
+# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
+# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
+# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
+# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
+# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
+# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
+# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
+#
+# As it can be seen, RSA sign performance improves by 130-30%,
+# hereafter less for longer keys, while verify - by 74-13%.
+# DSA performance improves by 115-30%.
+
+$output=pop;
+
+if ($^O eq "hpux") {
+ $ADDP="addp4";
+ for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+
+$code=<<___;
+.explicit
+.text
+
+// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
+// const BN_ULONG *bp,const BN_ULONG *np,
+// const BN_ULONG *n0p,int num);
+.align 64
+.global bn_mul_mont#
+.proc bn_mul_mont#
+bn_mul_mont:
+ .prologue
+ .body
+{ .mmi; cmp4.le p6,p7=2,r37;;
+(p6) cmp4.lt.unc p8,p9=8,r37
+ mov ret0=r0 };;
+{ .bbb;
+(p9) br.cond.dptk.many bn_mul_mont_8
+(p8) br.cond.dpnt.many bn_mul_mont_general
+(p7) br.ret.spnt.many b0 };;
+.endp bn_mul_mont#
+
+prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
+
+rptr=r8; aptr=r9; bptr=r14; nptr=r15;
+tptr=r16; // &tp[0]
+tp_1=r17; // &tp[-1]
+num=r18; len=r19; lc=r20;
+topbit=r21; // carry bit from tmp[num]
+
+n0=f6;
+m0=f7;
+bi=f8;
+
+.align 64
+.local bn_mul_mont_general#
+.proc bn_mul_mont_general#
+bn_mul_mont_general:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ $ADDP aptr=0,in1
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; .vframe prevsp
+ mov prevsp=sp
+ $ADDP bptr=0,in2
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+ .body
+ .rotf alo[6],nlo[4],ahi[8],nhi[6]
+ .rotr a[3],n[3],t[2]
+
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 alo[4]=[aptr],16 // ap[0]
+ $ADDP r30=8,in1 };;
+{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
+ ldf8 alo[2]=[aptr],16 // ap[2]
+ $ADDP in4=0,in4 };;
+{ .mmi; ldf8 alo[1]=[r30] // ap[3]
+ ldf8 n0=[in4] // n0
+ $ADDP rptr=0,in0 }
+{ .mmi; $ADDP nptr=0,in3
+ mov r31=16
+ zxt4 num=in5 };;
+{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
+ shladd len=num,3,r0
+ shladd r31=num,3,r31 };;
+{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
+ add lc=-5,num
+ sub r31=sp,r31 };;
+{ .mfb; and sp=-16,r31 // alloca
+ xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
+ nop.b 0 }
+{ .mfb; nop.m 0
+ xmpy.lu alo[4]=alo[4],bi
+ brp.loop.imp .L1st_ctop,.L1st_cend-16
+ };;
+{ .mfi; nop.m 0
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
+ add tp_1=8,sp }
+{ .mfi; nop.m 0
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20001f<<16
+ // ------^----- (p40) at first (p23)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; nop.m 0
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
+ mov ar.lc=lc }
+{ .mfi; nop.m 0
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+.align 32
+.L1st_ctop:
+.pred.rel "mutex",p40,p42
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23) }
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p23) st8 [tp_1]=n[2],8
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mmb; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ br.ctop.sptk .L1st_ctop };;
+.L1st_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ add num=-1,num };; // num--
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ sub aptr=aptr,len };; // rewind
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+ sub nptr=nptr,len };;
+{ .mmi; .pred.rel "mutex",p39,p41
+(p39) add topbit=r0,r0
+(p41) add topbit=r0,r0,1
+ nop.i 0 }
+{ .mmi; st8 [tp_1]=n[0]
+ add tptr=16,sp
+ add tp_1=8,sp };;
+
+.Louter:
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 ahi[3]=[tptr] // tp[0]
+ add r30=8,aptr };;
+{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
+ ldf8 alo[3]=[r30],16 // ap[1]
+ add r31=8,nptr };;
+{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
+ xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
+ brp.loop.imp .Linner_ctop,.Linner_cend-16
+ }
+{ .mfb; ldf8 alo[1]=[r30] // ap[3]
+ xma.lu alo[4]=alo[4],bi,ahi[3]
+ clrrrb.pr };;
+{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
+ nop.i 0 }
+{ .mfi; ldf8 nlo[1]=[r31] // np[1]
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20101f<<16
+ // ------^----- (p40) at first (p23)
+ // --------^--- (p30) at first (p22)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
+ mov ar.lc=lc }
+{ .mfi;
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
+// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
+// in latter case accounts for two-tick pipeline stall, which means
+// that its performance would be ~20% lower than optimal one. No
+// attempt was made to address this, because original Itanium is
+// hardly represented out in the wild...
+.align 32
+.Linner_ctop:
+.pred.rel "mutex",p40,p42
+.pred.rel "mutex",p30,p32
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23)
+{ .mfi; (p16) nop.m 0
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p16) nop.f 0
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p21) ld8 t[0]=[tptr],8
+ (p16) nop.f 0
+ (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p30) add a[1]=a[1],t[1] } // (p22)
+{ .mfi; (p16) nop.m 0
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p32) add a[1]=a[1],t[1],1 };; // (p22)
+{ .mmi; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
+{ .mmb; (p23) st8 [tp_1]=n[2],8
+ (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
+ br.ctop.sptk .Linner_ctop };;
+.Linner_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ nop.i 0 };;
+
+{ .mmi; .pred.rel "mutex",p31,p33
+(p31) add a[0]=a[0],topbit
+(p33) add a[0]=a[0],topbit,1
+ mov topbit=r0 };;
+{ .mfi; .pred.rel "mutex",p31,p33
+(p31) cmp.ltu p32,p30=a[0],topbit
+(p33) cmp.leu p32,p30=a[0],topbit
+ }
+{ .mfi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ };;
+{ .mmi; .pred.rel "mutex",p44,p46
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+(p32) add topbit=r0,r0,1 }
+
+{ .mmi; st8 [tp_1]=n[0],8
+ cmp4.ne p6,p0=1,num
+ sub aptr=aptr,len };; // rewind
+{ .mmi; sub nptr=nptr,len
+(p41) add topbit=r0,r0,1
+ add tptr=16,sp }
+{ .mmb; add tp_1=8,sp
+ add num=-1,num // num--
+(p6) br.cond.sptk.many .Louter };;
+
+{ .mbb; add lc=4,lc
+ brp.loop.imp .Lsub_ctop,.Lsub_cend-16
+ clrrrb.pr };;
+{ .mii; nop.m 0
+ mov pr.rot=0x10001<<16
+ // ------^---- (p33) at first (p17)
+ mov ar.lc=lc }
+{ .mii; nop.m 0
+ mov ar.ec=3
+ nop.i 0 };;
+
+.Lsub_ctop:
+.pred.rel "mutex",p33,p35
+{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
+ (p16) nop.f 0
+ (p33) sub n[1]=t[1],n[1] } // (p17)
+{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
+ (p16) nop.f 0
+ (p35) sub n[1]=t[1],n[1],1 };; // (p17)
+{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
+ (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
+ (p18) nop.b 0 }
+{ .mib; (p18) nop.m 0
+ (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
+ br.ctop.sptk .Lsub_ctop };;
+.Lsub_cend:
+
+{ .mmb; .pred.rel "mutex",p34,p36
+(p34) sub topbit=topbit,r0 // (p19)
+(p36) sub topbit=topbit,r0,1
+ brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
+ }
+{ .mmb; sub rptr=rptr,len // rewind
+ sub tptr=tptr,len
+ clrrrb.pr };;
+{ .mmi; and aptr=tptr,topbit
+ andcm bptr=rptr,topbit
+ mov pr.rot=1<<16 };;
+{ .mii; or nptr=aptr,bptr
+ mov ar.lc=lc
+ mov ar.ec=3 };;
+
+.Lcopy_ctop:
+{ .mmb; (p16) ld8 n[0]=[nptr],8
+ (p18) st8 [tptr]=r0,8
+ (p16) nop.b 0 }
+{ .mmb; (p16) nop.m 0
+ (p18) st8 [rptr]=n[2],8
+ br.ctop.sptk .Lcopy_ctop };;
+.Lcopy_cend:
+
+{ .mmi; mov ret0=1 // signal "handled"
+ rum 1<<5 // clear um.mfh
+ mov ar.lc=prevlc }
+{ .mib; .restore sp
+ mov sp=prevsp
+ mov pr=prevpr,0x1ffff
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_general#
+
+a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
+n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
+t0=r15;
+
+ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
+ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
+
+.align 64
+.skip 48 // aligns loop body
+.local bn_mul_mont_8#
+.proc bn_mul_mont_8#
+bn_mul_mont_8:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ .vframe prevsp
+ mov prevsp=sp
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; add r17=-6*16,sp
+ add sp=-7*16,sp
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+{ .mmi; .save.gf 0,0x10
+ stf.spill [sp]=f16,-16
+ .save.gf 0,0x20
+ stf.spill [r17]=f17,32
+ add r16=-5*16,prevsp};;
+{ .mmi; .save.gf 0,0x40
+ stf.spill [r16]=f18,32
+ .save.gf 0,0x80
+ stf.spill [r17]=f19,32
+ $ADDP aptr=0,in1 };;
+{ .mmi; .save.gf 0,0x100
+ stf.spill [r16]=f20,32
+ .save.gf 0,0x200
+ stf.spill [r17]=f21,32
+ $ADDP r29=8,in1 };;
+{ .mmi; .save.gf 0,0x400
+ stf.spill [r16]=f22
+ .save.gf 0,0x800
+ stf.spill [r17]=f23
+ $ADDP rptr=0,in0 };;
+
+ .body
+ .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
+ .rotr t[8]
+
+// load input vectors padding them to 8 elements
+{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
+ ldf8 ai1=[r29],16 // ap[1]
+ $ADDP bptr=0,in2 }
+{ .mmi; $ADDP r30=8,in2
+ $ADDP nptr=0,in3
+ $ADDP r31=8,in3 };;
+{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
+ ldf8 bj[6]=[r30],16 // bp[1]
+ cmp4.le p4,p5=3,in5 }
+{ .mmi; ldf8 ni0=[nptr],16 // np[0]
+ ldf8 ni1=[r31],16 // np[1]
+ cmp4.le p6,p7=4,in5 };;
+
+{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
+ (p5)fcvt.fxu ai2=f0
+ cmp4.le p8,p9=5,in5 }
+{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
+ (p7)fcvt.fxu ai3=f0
+ cmp4.le p10,p11=6,in5 }
+{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
+ (p5)fcvt.fxu bj[5]=f0
+ cmp4.le p12,p13=7,in5 }
+{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
+ (p7)fcvt.fxu bj[4]=f0
+ cmp4.le p14,p15=8,in5 }
+{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
+ (p5)fcvt.fxu ni2=f0
+ addp4 r28=-1,in5 }
+{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
+ (p7)fcvt.fxu ni3=f0
+ $ADDP in4=0,in4 };;
+
+{ .mfi; ldf8 n0=[in4]
+ fcvt.fxu tf[1]=f0
+ nop.i 0 }
+
+{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
+ (p9)fcvt.fxu ai4=f0
+ mov t[0]=r0 }
+{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
+ (p11)fcvt.fxu ai5=f0
+ mov t[1]=r0 }
+{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
+ (p9)fcvt.fxu bj[3]=f0
+ mov t[2]=r0 }
+{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
+ (p11)fcvt.fxu bj[2]=f0
+ mov t[3]=r0 }
+{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
+ (p9)fcvt.fxu ni4=f0
+ mov t[4]=r0 }
+{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
+ (p11)fcvt.fxu ni5=f0
+ mov t[5]=r0 };;
+
+{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
+ (p13)fcvt.fxu ai6=f0
+ mov t[6]=r0 }
+{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
+ (p15)fcvt.fxu ai7=f0
+ mov t[7]=r0 }
+{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
+ (p13)fcvt.fxu bj[1]=f0
+ mov ar.lc=r28 }
+{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
+ (p15)fcvt.fxu bj[0]=f0
+ mov ar.ec=1 }
+{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
+ (p13)fcvt.fxu ni6=f0
+ mov pr.rot=1<<16 }
+{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
+ (p15)fcvt.fxu ni7=f0
+ brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
+ };;
+
+// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
+// to measure with help of Interval Time Counter indicated that the
+// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
+// addressing the issue is problematic, because I don't have access
+// to platform-specific instruction-level profiler. On Itanium it
+// should run in 56*n ticks, because of higher xma latency...
+.Louter_8_ctop:
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 0:
+ (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
+ (p40) add a3=a3,n3 } // (p17) a3+=n3
+{ .mfi; (p42) add a3=a3,n3,1
+ (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig a7=alo[8] // 1:
+ (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
+ (p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
+ (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
+ (p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+ (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6] // 3:
+ (p48) cmp.ltu p51,p49=t[6],a3
+ (p50) cmp.leu p51,p49=t[6],a3 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 4:
+ (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
+ (p41) add a4=a4,n4 } // (p17) a4+=n4
+{ .mfi; (p43) add a4=a4,n4,1
+ (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
+ (p16) nop.i 0 };;
+{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
+ (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
+ (p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p16) nop.m 0 // 6:
+ (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
+ (p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+ (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7] // 7:
+ (p49) cmp.ltu p50,p48=t[5],a4
+ (p51) cmp.leu p50,p48=t[5],a4 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 8:
+ (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
+ (p40) add a5=a5,n5 } // (p17) a5+=n5
+{ .mfi; (p42) add a5=a5,n5,1
+ (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a1=alo[1] // 9:
+ (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
+ (p50) add t[4]=t[4],a5,1 };;
+{ .mfi; (p16) nop.m 0 // 10:
+ (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
+ (p40) cmp.ltu p43,p41=a5,n5 }
+{ .mfi; (p42) cmp.leu p43,p41=a5,n5
+ (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n7=nlo[8] // 11:
+ (p48) cmp.ltu p51,p49=t[4],a5
+ (p50) cmp.leu p51,p49=t[4],a5 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
+ (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
+ (p41) add a6=a6,n6 } // (p17) a6+=n6
+{ .mfi; (p43) add a6=a6,n6,1
+ (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a2=alo[2] // 13:
+ (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
+ (p51) add t[3]=t[3],a6,1 };;
+{ .mfi; (p16) nop.m 0 // 14:
+ (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
+ (p41) cmp.ltu p42,p40=a6,n6 }
+{ .mfi; (p43) cmp.leu p42,p40=a6,n6
+ (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
+ (p16) nop.i 0 };;
+{ .mii; (p16) nop.m 0 // 15:
+ (p49) cmp.ltu p50,p48=t[3],a6
+ (p51) cmp.leu p50,p48=t[3],a6 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 16:
+ (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
+ (p40) add a7=a7,n7 } // (p17) a7+=n7
+{ .mfi; (p42) add a7=a7,n7,1
+ (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a3=alo[3] // 17:
+ (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
+ (p50) add t[2]=t[2],a7,1 };;
+{ .mfi; (p16) nop.m 0 // 18:
+ (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
+ (p40) cmp.ltu p43,p41=a7,n7 }
+{ .mfi; (p42) cmp.leu p43,p41=a7,n7
+ (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n1=nlo[1] // 19:
+ (p48) cmp.ltu p51,p49=t[2],a7
+ (p50) cmp.leu p51,p49=t[2],a7 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 20:
+ (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
+ (p41) add a8=a8,n8 } // (p17) a8+=n8
+{ .mfi; (p43) add a8=a8,n8,1
+ (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a4=alo[4] // 21:
+ (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
+ (p51) add t[1]=t[1],a8,1 };;
+{ .mfi; (p16) nop.m 0 // 22:
+ (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
+ (p41) cmp.ltu p42,p40=a8,n8 }
+{ .mfi; (p43) cmp.leu p42,p40=a8,n8
+ (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n2=nlo[2] // 23:
+ (p49) cmp.ltu p50,p48=t[1],a8
+ (p51) cmp.leu p50,p48=t[1],a8 };;
+{ .mfi; (p16) nop.m 0 // 24:
+ (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
+ (p16) add a1=a1,n1 } // (p16) a1+=n1
+{ .mfi; (p16) nop.m 0
+ (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
+ (p17) mov t[0]=r0 };;
+{ .mii; (p16) getf.sig a5=alo[5] // 25:
+ (p16) add t0=t[7],a1 // (p16) t[7]+=a1
+ (p42) add t[0]=t[0],r0,1 };;
+{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
+ (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
+ (p50) add t[0]=t[0],r0,1 }
+{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
+ (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n3=nlo[3] // 27:
+ (p16) cmp.ltu.unc p50,p48=t0,a1
+ (p16) nop.i 0 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 28:
+ (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
+ (p40) add a2=a2,n2 } // (p16) a2+=n2
+{ .mfi; (p42) add a2=a2,n2,1
+ (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a6=alo[6] // 29:
+ (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
+ (p50) add t[6]=t[6],a2,1 };;
+{ .mfi; (p16) nop.m 0 // 30:
+ (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
+ (p40) cmp.ltu p41,p39=a2,n2 }
+{ .mfi; (p42) cmp.leu p41,p39=a2,n2
+ (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
+ (p16) nop.i 0 };;
+{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
+ (p16) nop.f 0
+ (p48) cmp.ltu p49,p47=t[6],a2 }
+{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
+ (p16) nop.f 0
+ br.ctop.sptk.many .Louter_8_ctop };;
+.Louter_8_cend:
+
+// above loop has to execute one more time, without (p16), which is
+// replaced with merged move of np[8] to GPR bank
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mmi; (p0) getf.sig n1=ni0 // 0:
+ (p40) add a3=a3,n3 // (p17) a3+=n3
+ (p42) add a3=a3,n3,1 };;
+{ .mii; (p17) getf.sig a7=alo[8] // 1:
+ (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
+ (p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
+ (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
+ (p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+ (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+ (p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6] // 3:
+ (p48) cmp.ltu p51,p49=t[6],a3
+ (p50) cmp.leu p51,p49=t[6],a3 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mmi; (p0) getf.sig n2=ni1 // 4:
+ (p41) add a4=a4,n4 // (p17) a4+=n4
+ (p43) add a4=a4,n4,1 };;
+{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
+ (p0) nop.f 0
+ (p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p0) getf.sig n3=ni2 // 6:
+ (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
+ (p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+ (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+ (p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7] // 7:
+ (p49) cmp.ltu p50,p48=t[5],a4
+ (p51) cmp.leu p50,p48=t[5],a4 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mii; (p0) getf.sig n4=ni3 // 8:
+ (p40) add a5=a5,n5 // (p17) a5+=n5
+ (p42) add a5=a5,n5,1 };;
+{ .mii; (p0) nop.m 0 // 9:
+ (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
+ (p50) add t[4]=t[4],a5,1 };;
+{ .mii; (p0) nop.m 0 // 10:
+ (p40) cmp.ltu p43,p41=a5,n5
+ (p42) cmp.leu p43,p41=a5,n5 };;
+{ .mii; (p17) getf.sig n7=nlo[8] // 11:
+ (p48) cmp.ltu p51,p49=t[4],a5
+ (p50) cmp.leu p51,p49=t[4],a5 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mii; (p17) getf.sig n8=nhi[8] // 12:
+ (p41) add a6=a6,n6 // (p17) a6+=n6
+ (p43) add a6=a6,n6,1 };;
+{ .mii; (p0) getf.sig n5=ni4 // 13:
+ (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
+ (p51) add t[3]=t[3],a6,1 };;
+{ .mii; (p0) nop.m 0 // 14:
+ (p41) cmp.ltu p42,p40=a6,n6
+ (p43) cmp.leu p42,p40=a6,n6 };;
+{ .mii; (p0) getf.sig n6=ni5 // 15:
+ (p49) cmp.ltu p50,p48=t[3],a6
+ (p51) cmp.leu p50,p48=t[3],a6 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mii; (p0) nop.m 0 // 16:
+ (p40) add a7=a7,n7 // (p17) a7+=n7
+ (p42) add a7=a7,n7,1 };;
+{ .mii; (p0) nop.m 0 // 17:
+ (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
+ (p50) add t[2]=t[2],a7,1 };;
+{ .mii; (p0) nop.m 0 // 18:
+ (p40) cmp.ltu p43,p41=a7,n7
+ (p42) cmp.leu p43,p41=a7,n7 };;
+{ .mii; (p0) getf.sig n7=ni6 // 19:
+ (p48) cmp.ltu p51,p49=t[2],a7
+ (p50) cmp.leu p51,p49=t[2],a7 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mii; (p0) nop.m 0 // 20:
+ (p41) add a8=a8,n8 // (p17) a8+=n8
+ (p43) add a8=a8,n8,1 };;
+{ .mmi; (p0) nop.m 0 // 21:
+ (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
+ (p51) add t[1]=t[1],a8,1 }
+{ .mmi; (p17) mov t[0]=r0
+ (p41) cmp.ltu p42,p40=a8,n8
+ (p43) cmp.leu p42,p40=a8,n8 };;
+{ .mmi; (p0) getf.sig n8=ni7 // 22:
+ (p49) cmp.ltu p50,p48=t[1],a8
+ (p51) cmp.leu p50,p48=t[1],a8 }
+{ .mmi; (p42) add t[0]=t[0],r0,1
+ (p0) add r16=-7*16,prevsp
+ (p0) add r17=-6*16,prevsp };;
+
+// subtract np[8] from carrybit|tmp[8]
+// carrybit|tmp[8] layout upon exit from above loop is:
+// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
+{ .mmi; (p50)add t[0]=t[0],r0,1
+ add r18=-5*16,prevsp
+ sub n1=t0,n1 };;
+{ .mmi; cmp.gtu p34,p32=n1,t0;;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n2=t[7],n2
+ (p34)sub n2=t[7],n2,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
+ (p34)cmp.geu p35,p33=n2,t[7];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n3=t[6],n3 }
+{ .mmi; (p35)sub n3=t[6],n3,1;;
+ (p33)cmp.gtu p34,p32=n3,t[6]
+ (p35)cmp.geu p34,p32=n3,t[6] };;
+ .pred.rel "mutex",p32,p34
+{ .mii; (p32)sub n4=t[5],n4
+ (p34)sub n4=t[5],n4,1;;
+ (p32)cmp.gtu p35,p33=n4,t[5] }
+{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n5=t[4],n5
+ (p35)sub n5=t[4],n5,1 };;
+{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
+ (p35)cmp.geu p34,p32=n5,t[4];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n6=t[3],n6 }
+{ .mmi; (p34)sub n6=t[3],n6,1;;
+ (p32)cmp.gtu p35,p33=n6,t[3]
+ (p34)cmp.geu p35,p33=n6,t[3] };;
+ .pred.rel "mutex",p33,p35
+{ .mii; (p33)sub n7=t[2],n7
+ (p35)sub n7=t[2],n7,1;;
+ (p33)cmp.gtu p34,p32=n7,t[2] }
+{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n8=t[1],n8
+ (p34)sub n8=t[1],n8,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
+ (p34)cmp.geu p35,p33=n8,t[1];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub a8=t[0],r0 }
+{ .mmi; (p35)sub a8=t[0],r0,1;;
+ (p33)cmp.gtu p34,p32=a8,t[0]
+ (p35)cmp.geu p34,p32=a8,t[0] };;
+
+// save the result, either tmp[num] or tmp[num]-np[num]
+ .pred.rel "mutex",p32,p34
+{ .mmi; (p32)st8 [rptr]=n1,8
+ (p34)st8 [rptr]=t0,8
+ add r19=-4*16,prevsp};;
+{ .mmb; (p32)st8 [rptr]=n2,8
+ (p34)st8 [rptr]=t[7],8
+ (p5)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n3,8
+ (p34)st8 [rptr]=t[6],8
+ (p7)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n4,8
+ (p34)st8 [rptr]=t[5],8
+ (p9)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n5,8
+ (p34)st8 [rptr]=t[4],8
+ (p11)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n6,8
+ (p34)st8 [rptr]=t[3],8
+ (p13)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n7,8
+ (p34)st8 [rptr]=t[2],8
+ (p15)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n8,8
+ (p34)st8 [rptr]=t[1],8
+ nop.b 0 };;
+.Ldone: // epilogue
+{ .mmi; ldf.fill f16=[r16],64
+ ldf.fill f17=[r17],64
+ nop.i 0 }
+{ .mmi; ldf.fill f18=[r18],64
+ ldf.fill f19=[r19],64
+ mov pr=prevpr,0x1ffff };;
+{ .mmi; ldf.fill f20=[r16]
+ ldf.fill f21=[r17]
+ mov ar.lc=prevlc }
+{ .mmi; ldf.fill f22=[r18]
+ ldf.fill f23=[r19]
+ mov ret0=1 } // signal "handled"
+{ .mib; rum 1<<5
+ .restore sp
+ mov sp=prevsp
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_8#
+
+.type copyright#,\@object
+copyright:
+stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+open STDOUT,">$output" if $output;
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/ia64.S b/openssl-1.1.0h/crypto/bn/asm/ia64.S
new file mode 100644
index 0000000..f2404a3
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/ia64.S
@@ -0,0 +1,1562 @@
+.explicit
+.text
+.ident "ia64.S, Version 2.1"
+.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
+
+// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+//
+// ====================================================================
+// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+// project.
+//
+// Rights for redistribution and usage in source and binary forms are
+// granted according to the OpenSSL license. Warranty of any kind is
+// disclaimed.
+// ====================================================================
+//
+// Version 2.x is Itanium2 re-tune. Few words about how Itanum2 is
+// different from Itanium to this module viewpoint. Most notably, is it
+// "wider" than Itanium? Can you experience loop scalability as
+// discussed in commentary sections? Not really:-( Itanium2 has 6
+// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
+// spin twice as fast, as I need 8 IALU ports. Amount of floating point
+// ports is the same, i.e. 2, while I need 4. In other words, to this
+// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
+// essentially different in respect to this module, and a re-tune was
+// required. Well, because some instruction latencies has changed. Most
+// noticeably those intensively used:
+//
+// Itanium Itanium2
+// ldf8 9 6 L2 hit
+// ld8 2 1 L1 hit
+// getf 2 5
+// xma[->getf] 7[+1] 4[+0]
+// add[->st8] 1[+1] 1[+0]
+//
+// What does it mean? You might ratiocinate that the original code
+// should run just faster... Because sum of latencies is smaller...
+// Wrong! Note that getf latency increased. This means that if a loop is
+// scheduled for lower latency (as they were), then it will suffer from
+// stall condition and the code will therefore turn anti-scalable, e.g.
+// original bn_mul_words spun at 5*n or 2.5 times slower than expected
+// on Itanium2! What to do? Reschedule loops for Itanium2? But then
+// Itanium would exhibit anti-scalability. So I've chosen to reschedule
+// for worst latency for every instruction aiming for best *all-round*
+// performance.
+
+// Q. How much faster does it get?
+// A. Here is the output from 'openssl speed rsa dsa' for vanilla
+// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
+// Linux 7.1 2.96-81):
+//
+// sign verify sign/s verify/s
+// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2
+// rsa 1024 bits 0.0203s 0.0011s 49.3 894.1
+// rsa 2048 bits 0.1331s 0.0040s 7.5 250.9
+// rsa 4096 bits 0.9270s 0.0147s 1.1 68.1
+// sign verify sign/s verify/s
+// dsa 512 bits 0.0035s 0.0043s 288.3 234.8
+// dsa 1024 bits 0.0111s 0.0135s 90.0 74.2
+//
+// And here is similar output but for this assembler
+// implementation:-)
+//
+// sign verify sign/s verify/s
+// rsa 512 bits 0.0021s 0.0001s 549.4 9638.5
+// rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1
+// rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3
+// rsa 4096 bits 0.1295s 0.0018s 7.7 561.5
+// sign verify sign/s verify/s
+// dsa 512 bits 0.0012s 0.0013s 891.9 756.6
+// dsa 1024 bits 0.0023s 0.0028s 440.4 376.2
+//
+// Yes, you may argue that it's not fair comparison as it's
+// possible to craft the C implementation with BN_UMULT_HIGH
+// inline assembler macro. But of course! Here is the output
+// with the macro:
+//
+// sign verify sign/s verify/s
+// rsa 512 bits 0.0020s 0.0002s 495.0 6561.0
+// rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7
+// rsa 2048 bits 0.0519s 0.0015s 19.3 667.3
+// rsa 4096 bits 0.3464s 0.0053s 2.9 187.7
+// sign verify sign/s verify/s
+// dsa 512 bits 0.0016s 0.0020s 613.1 510.5
+// dsa 1024 bits 0.0045s 0.0054s 221.0 183.9
+//
+// My code is still way faster, huh:-) And I believe that even
+// higher performance can be achieved. Note that as keys get
+// longer, performance gain is larger. Why? According to the
+// profiler there is another player in the field, namely
+// BN_from_montgomery consuming larger and larger portion of CPU
+// time as keysize decreases. I therefore consider putting effort
+// to assembler implementation of the following routine:
+//
+// void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
+// {
+// int i,j;
+// BN_ULONG v;
+//
+// for (i=0; i<nl; i++)
+// {
+// v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
+// nrp++;
+// rp++;
+// if (((nrp[-1]+=v)&BN_MASK2) < v)
+// for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
+// }
+// }
+//
+// It might as well be beneficial to implement even combaX
+// variants, as it appears as it can literally unleash the
+// performance (see comment section to bn_mul_comba8 below).
+//
+// And finally for your reference the output for 0.9.6a compiled
+// with SGIcc version 0.01.0-12 (keep in mind that for the moment
+// of this writing it's not possible to convince SGIcc to use
+// BN_UMULT_HIGH inline assembler macro, yet the code is fast,
+// i.e. for a compiler generated one:-):
+//
+// sign verify sign/s verify/s
+// rsa 512 bits 0.0022s 0.0002s 452.7 5894.3
+// rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9
+// rsa 2048 bits 0.0578s 0.0017s 17.3 600.2
+// rsa 4096 bits 0.3838s 0.0061s 2.6 164.5
+// sign verify sign/s verify/s
+// dsa 512 bits 0.0018s 0.0022s 547.3 459.6
+// dsa 1024 bits 0.0051s 0.0062s 196.6 161.3
+//
+// Oh! Benchmarks were performed on 733MHz Lion-class Itanium
+// system running Redhat Linux 7.1 (very special thanks to Ray
+// McCaffity of Williams Communications for providing an account).
+//
+// Q. What's the heck with 'rum 1<<5' at the end of every function?
+// A. Well, by clearing the "upper FP registers written" bit of the
+// User Mask I want to excuse the kernel from preserving upper
+// (f32-f128) FP register bank over process context switch, thus
+// minimizing bus bandwidth consumption during the switch (i.e.
+// after PKI opration completes and the program is off doing
+// something else like bulk symmetric encryption). Having said
+// this, I also want to point out that it might be good idea
+// to compile the whole toolkit (as well as majority of the
+// programs for that matter) with -mfixed-range=f32-f127 command
+// line option. No, it doesn't prevent the compiler from writing
+// to upper bank, but at least discourages to do so. If you don't
+// like the idea you have the option to compile the module with
+// -Drum=nop.m in command line.
+//
+
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+#define ADDP addp4
+#else
+#define ADDP add
+#endif
+
+#if 1
+//
+// bn_[add|sub]_words routines.
+//
+// Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the
+// data reside in L1 cache, i.e. 2 ticks away). It's possible to
+// compress the epilogue and get down to 2*n+6, but at the cost of
+// scalability (the neat feature of this implementation is that it
+// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
+// I consider that the epilogue is short enough as it is to trade tiny
+// performance loss on Itanium for scalability.
+//
+// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
+//
+.global bn_add_words#
+.proc bn_add_words#
+.align 64
+.skip 32 // makes the loop body aligned at 64-byte boundary
+bn_add_words:
+ .prologue
+ .save ar.pfs,r2
+{ .mii; alloc r2=ar.pfs,4,12,0,16
+ cmp4.le p6,p0=r35,r0 };;
+{ .mfb; mov r8=r0 // return value
+(p6) br.ret.spnt.many b0 };;
+
+{ .mib; sub r10=r35,r0,1
+ .save ar.lc,r3
+ mov r3=ar.lc
+ brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
+ }
+{ .mib; ADDP r14=0,r32 // rp
+ .save pr,r9
+ mov r9=pr };;
+ .body
+{ .mii; ADDP r15=0,r33 // ap
+ mov ar.lc=r10
+ mov ar.ec=6 }
+{ .mib; ADDP r16=0,r34 // bp
+ mov pr.rot=1<<16 };;
+
+.L_bn_add_words_ctop:
+{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
+ (p18) add r39=r37,r34
+ (p19) cmp.ltu.unc p56,p0=r40,r38 }
+{ .mfb; (p0) nop.m 0x0
+ (p0) nop.f 0x0
+ (p0) nop.b 0x0 }
+{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
+ (p58) cmp.eq.or p57,p0=-1,r41 // (p20)
+ (p58) add r41=1,r41 } // (p20)
+{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r
+ (p0) nop.f 0x0
+ br.ctop.sptk .L_bn_add_words_ctop };;
+.L_bn_add_words_cend:
+
+{ .mii;
+(p59) add r8=1,r8 // return value
+ mov pr=r9,0x1ffff
+ mov ar.lc=r3 }
+{ .mbb; nop.b 0x0
+ br.ret.sptk.many b0 };;
+.endp bn_add_words#
+
+//
+// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
+//
+.global bn_sub_words#
+.proc bn_sub_words#
+.align 64
+.skip 32 // makes the loop body aligned at 64-byte boundary
+bn_sub_words:
+ .prologue
+ .save ar.pfs,r2
+{ .mii; alloc r2=ar.pfs,4,12,0,16
+ cmp4.le p6,p0=r35,r0 };;
+{ .mfb; mov r8=r0 // return value
+(p6) br.ret.spnt.many b0 };;
+
+{ .mib; sub r10=r35,r0,1
+ .save ar.lc,r3
+ mov r3=ar.lc
+ brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
+ }
+{ .mib; ADDP r14=0,r32 // rp
+ .save pr,r9
+ mov r9=pr };;
+ .body
+{ .mii; ADDP r15=0,r33 // ap
+ mov ar.lc=r10
+ mov ar.ec=6 }
+{ .mib; ADDP r16=0,r34 // bp
+ mov pr.rot=1<<16 };;
+
+.L_bn_sub_words_ctop:
+{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
+ (p18) sub r39=r37,r34
+ (p19) cmp.gtu.unc p56,p0=r40,r38 }
+{ .mfb; (p0) nop.m 0x0
+ (p0) nop.f 0x0
+ (p0) nop.b 0x0 }
+{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
+ (p58) cmp.eq.or p57,p0=0,r41 // (p20)
+ (p58) add r41=-1,r41 } // (p20)
+{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r
+ (p0) nop.b 0x0
+ br.ctop.sptk .L_bn_sub_words_ctop };;
+.L_bn_sub_words_cend:
+
+{ .mii;
+(p59) add r8=1,r8 // return value
+ mov pr=r9,0x1ffff
+ mov ar.lc=r3 }
+{ .mbb; nop.b 0x0
+ br.ret.sptk.many b0 };;
+.endp bn_sub_words#
+#endif
+
+#if 0
+#define XMA_TEMPTATION
+#endif
+
+#if 1
+//
+// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+//
+.global bn_mul_words#
+.proc bn_mul_words#
+.align 64
+.skip 32 // makes the loop body aligned at 64-byte boundary
+bn_mul_words:
+ .prologue
+ .save ar.pfs,r2
+#ifdef XMA_TEMPTATION
+{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
+#else
+{ .mfi; alloc r2=ar.pfs,4,12,0,16 };;
+#endif
+{ .mib; mov r8=r0 // return value
+ cmp4.le p6,p0=r34,r0
+(p6) br.ret.spnt.many b0 };;
+
+{ .mii; sub r10=r34,r0,1
+ .save ar.lc,r3
+ mov r3=ar.lc
+ .save pr,r9
+ mov r9=pr };;
+
+ .body
+{ .mib; setf.sig f8=r35 // w
+ mov pr.rot=0x800001<<16
+ // ------^----- serves as (p50) at first (p27)
+ brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
+ }
+
+#ifndef XMA_TEMPTATION
+
+{ .mmi; ADDP r14=0,r32 // rp
+ ADDP r15=0,r33 // ap
+ mov ar.lc=r10 }
+{ .mmi; mov r40=0 // serves as r35 at first (p27)
+ mov ar.ec=13 };;
+
+// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
+// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
+// bypass L1 cache and L2 latency is actually best-case scenario for
+// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
+// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
+// would give us ~5% in *overall* performance improvement on "wider"
+// IA-64, but would hurt Itanium for about same because of longer
+// epilogue. As it's a matter of few percents in either case I've
+// chosen to trade the scalability for development time (you can see
+// this very instruction sequence in bn_mul_add_words loop which in
+// turn is scalable).
+.L_bn_mul_words_ctop:
+{ .mfi; (p25) getf.sig r36=f52 // low
+ (p21) xmpy.lu f48=f37,f8
+ (p28) cmp.ltu p54,p50=r41,r39 }
+{ .mfi; (p16) ldf8 f32=[r15],8
+ (p21) xmpy.hu f40=f37,f8
+ (p0) nop.i 0x0 };;
+{ .mii; (p25) getf.sig r32=f44 // high
+ .pred.rel "mutex",p50,p54
+ (p50) add r40=r38,r35 // (p27)
+ (p54) add r40=r38,r35,1 } // (p27)
+{ .mfb; (p28) st8 [r14]=r41,8
+ (p0) nop.f 0x0
+ br.ctop.sptk .L_bn_mul_words_ctop };;
+.L_bn_mul_words_cend:
+
+{ .mii; nop.m 0x0
+.pred.rel "mutex",p51,p55
+(p51) add r8=r36,r0
+(p55) add r8=r36,r0,1 }
+{ .mfb; nop.m 0x0
+ nop.f 0x0
+ nop.b 0x0 }
+
+#else // XMA_TEMPTATION
+
+ setf.sig f37=r0 // serves as carry at (p18) tick
+ mov ar.lc=r10
+ mov ar.ec=5;;
+
+// Most of you examining this code very likely wonder why in the name
+// of Intel the following loop is commented out? Indeed, it looks so
+// neat that you find it hard to believe that it's something wrong
+// with it, right? The catch is that every iteration depends on the
+// result from previous one and the latter isn't available instantly.
+// The loop therefore spins at the latency of xma minus 1, or in other
+// words at 6*(n+4) ticks:-( Compare to the "production" loop above
+// that runs in 2*(n+11) where the low latency problem is worked around
+// by moving the dependency to one-tick latent integer ALU. Note that
+// "distance" between ldf8 and xma is not latency of ldf8, but the
+// *difference* between xma and ldf8 latencies.
+.L_bn_mul_words_ctop:
+{ .mfi; (p16) ldf8 f32=[r33],8
+ (p18) xma.hu f38=f34,f8,f39 }
+{ .mfb; (p20) stf8 [r32]=f37,8
+ (p18) xma.lu f35=f34,f8,f39
+ br.ctop.sptk .L_bn_mul_words_ctop };;
+.L_bn_mul_words_cend:
+
+ getf.sig r8=f41 // the return value
+
+#endif // XMA_TEMPTATION
+
+{ .mii; nop.m 0x0
+ mov pr=r9,0x1ffff
+ mov ar.lc=r3 }
+{ .mfb; rum 1<<5 // clear um.mfh
+ nop.f 0x0
+ br.ret.sptk.many b0 };;
+.endp bn_mul_words#
+#endif
+
+#if 1
+//
+// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+//
+.global bn_mul_add_words#
+.proc bn_mul_add_words#
+.align 64
+.skip 48 // makes the loop body aligned at 64-byte boundary
+bn_mul_add_words:
+ .prologue
+ .save ar.pfs,r2
+{ .mmi; alloc r2=ar.pfs,4,4,0,8
+ cmp4.le p6,p0=r34,r0
+ .save ar.lc,r3
+ mov r3=ar.lc };;
+{ .mib; mov r8=r0 // return value
+ sub r10=r34,r0,1
+(p6) br.ret.spnt.many b0 };;
+
+{ .mib; setf.sig f8=r35 // w
+ .save pr,r9
+ mov r9=pr
+ brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
+ }
+ .body
+{ .mmi; ADDP r14=0,r32 // rp
+ ADDP r15=0,r33 // ap
+ mov ar.lc=r10 }
+{ .mii; ADDP r16=0,r32 // rp copy
+ mov pr.rot=0x2001<<16
+ // ------^----- serves as (p40) at first (p27)
+ mov ar.ec=11 };;
+
+// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
+// Itanium 2. Yes, unlike previous versions it scales:-) Previous
+// version was performing *all* additions in IALU and was starving
+// for those even on Itanium 2. In this version one addition is
+// moved to FPU and is folded with multiplication. This is at cost
+// of propagating the result from previous call to this subroutine
+// to L2 cache... In other words negligible even for shorter keys.
+// *Overall* performance improvement [over previous version] varies
+// from 11 to 22 percent depending on key length.
+.L_bn_mul_add_words_ctop:
+.pred.rel "mutex",p40,p42
+{ .mfi; (p23) getf.sig r36=f45 // low
+ (p20) xma.lu f42=f36,f8,f50 // low
+ (p40) add r39=r39,r35 } // (p27)
+{ .mfi; (p16) ldf8 f32=[r15],8 // *(ap++)
+ (p20) xma.hu f36=f36,f8,f50 // high
+ (p42) add r39=r39,r35,1 };; // (p27)
+{ .mmi; (p24) getf.sig r32=f40 // high
+ (p16) ldf8 f46=[r16],8 // *(rp1++)
+ (p40) cmp.ltu p41,p39=r39,r35 } // (p27)
+{ .mib; (p26) st8 [r14]=r39,8 // *(rp2++)
+ (p42) cmp.leu p41,p39=r39,r35 // (p27)
+ br.ctop.sptk .L_bn_mul_add_words_ctop};;
+.L_bn_mul_add_words_cend:
+
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) add r8=r35,r0
+(p42) add r8=r35,r0,1
+ mov pr=r9,0x1ffff }
+{ .mib; rum 1<<5 // clear um.mfh
+ mov ar.lc=r3
+ br.ret.sptk.many b0 };;
+.endp bn_mul_add_words#
+#endif
+
+#if 1
+//
+// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
+//
+.global bn_sqr_words#
+.proc bn_sqr_words#
+.align 64
+.skip 32 // makes the loop body aligned at 64-byte boundary
+bn_sqr_words:
+ .prologue
+ .save ar.pfs,r2
+{ .mii; alloc r2=ar.pfs,3,0,0,0
+ sxt4 r34=r34 };;
+{ .mii; cmp.le p6,p0=r34,r0
+ mov r8=r0 } // return value
+{ .mfb; ADDP r32=0,r32
+ nop.f 0x0
+(p6) br.ret.spnt.many b0 };;
+
+{ .mii; sub r10=r34,r0,1
+ .save ar.lc,r3
+ mov r3=ar.lc
+ .save pr,r9
+ mov r9=pr };;
+
+ .body
+{ .mib; ADDP r33=0,r33
+ mov pr.rot=1<<16
+ brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
+ }
+{ .mii; add r34=8,r32
+ mov ar.lc=r10
+ mov ar.ec=18 };;
+
+// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
+// possible to compress the epilogue (I'm getting tired to write this
+// comment over and over) and get down to 2*n+16 at the cost of
+// scalability. The decision will very likely be reconsidered after the
+// benchmark program is profiled. I.e. if perfomance gain on Itanium
+// will appear larger than loss on "wider" IA-64, then the loop should
+// be explicitly split and the epilogue compressed.
+.L_bn_sqr_words_ctop:
+{ .mfi; (p16) ldf8 f32=[r33],8
+ (p25) xmpy.lu f42=f41,f41
+ (p0) nop.i 0x0 }
+{ .mib; (p33) stf8 [r32]=f50,16
+ (p0) nop.i 0x0
+ (p0) nop.b 0x0 }
+{ .mfi; (p0) nop.m 0x0
+ (p25) xmpy.hu f52=f41,f41
+ (p0) nop.i 0x0 }
+{ .mib; (p33) stf8 [r34]=f60,16
+ (p0) nop.i 0x0
+ br.ctop.sptk .L_bn_sqr_words_ctop };;
+.L_bn_sqr_words_cend:
+
+{ .mii; nop.m 0x0
+ mov pr=r9,0x1ffff
+ mov ar.lc=r3 }
+{ .mfb; rum 1<<5 // clear um.mfh
+ nop.f 0x0
+ br.ret.sptk.many b0 };;
+.endp bn_sqr_words#
+#endif
+
+#if 1
+// Apparently we win nothing by implementing special bn_sqr_comba8.
+// Yes, it is possible to reduce the number of multiplications by
+// almost factor of two, but then the amount of additions would
+// increase by factor of two (as we would have to perform those
+// otherwise performed by xma ourselves). Normally we would trade
+// anyway as multiplications are way more expensive, but not this
+// time... Multiplication kernel is fully pipelined and as we drain
+// one 128-bit multiplication result per clock cycle multiplications
+// are effectively as inexpensive as additions. Special implementation
+// might become of interest for "wider" IA-64 implementation as you'll
+// be able to get through the multiplication phase faster (there won't
+// be any stall issues as discussed in the commentary section below and
+// you therefore will be able to employ all 4 FP units)... But these
+// Itanium days it's simply too hard to justify the effort so I just
+// drop down to bn_mul_comba8 code:-)
+//
+// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+//
+.global bn_sqr_comba8#
+.proc bn_sqr_comba8#
+.align 64
+bn_sqr_comba8:
+ .prologue
+ .save ar.pfs,r2
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+{ .mii; alloc r2=ar.pfs,2,1,0,0
+ addp4 r33=0,r33
+ addp4 r32=0,r32 };;
+{ .mii;
+#else
+{ .mii; alloc r2=ar.pfs,2,1,0,0
+#endif
+ mov r34=r33
+ add r14=8,r33 };;
+ .body
+{ .mii; add r17=8,r34
+ add r15=16,r33
+ add r18=16,r34 }
+{ .mfb; add r16=24,r33
+ br .L_cheat_entry_point8 };;
+.endp bn_sqr_comba8#
+#endif
+
+#if 1
+// I've estimated this routine to run in ~120 ticks, but in reality
+// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
+// cycles consumed for instructions fetch? Or did I misinterpret some
+// clause in Itanium µ-architecture manual? Comments are welcomed and
+// highly appreciated.
+//
+// On Itanium 2 it takes ~190 ticks. This is because of stalls on
+// result from getf.sig. I do nothing about it at this point for
+// reasons depicted below.
+//
+// However! It should be noted that even 160 ticks is darn good result
+// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
+// C version (compiled with gcc with inline assembler). I really
+// kicked compiler's butt here, didn't I? Yeah! This brings us to the
+// following statement. It's damn shame that this routine isn't called
+// very often nowadays! According to the profiler most CPU time is
+// consumed by bn_mul_add_words called from BN_from_montgomery. In
+// order to estimate what we're missing, I've compared the performance
+// of this routine against "traditional" implementation, i.e. against
+// following routine:
+//
+// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+// { r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]);
+// r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
+// r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
+// r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
+// r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
+// r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
+// r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
+// r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
+// }
+//
+// The one below is over 8 times faster than the one above:-( Even
+// more reasons to "combafy" bn_mul_add_mont...
+//
+// And yes, this routine really made me wish there were an optimizing
+// assembler! It also feels like it deserves a dedication.
+//
+// To my wife for being there and to my kids...
+//
+// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+//
+#define carry1 r14
+#define carry2 r15
+#define carry3 r34
+.global bn_mul_comba8#
+.proc bn_mul_comba8#
+.align 64
+bn_mul_comba8:
+ .prologue
+ .save ar.pfs,r2
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+{ .mii; alloc r2=ar.pfs,3,0,0,0
+ addp4 r33=0,r33
+ addp4 r34=0,r34 };;
+{ .mii; addp4 r32=0,r32
+#else
+{ .mii; alloc r2=ar.pfs,3,0,0,0
+#endif
+ add r14=8,r33
+ add r17=8,r34 }
+ .body
+{ .mii; add r15=16,r33
+ add r18=16,r34
+ add r16=24,r33 }
+.L_cheat_entry_point8:
+{ .mmi; add r19=24,r34
+
+ ldf8 f32=[r33],32 };;
+
+{ .mmi; ldf8 f120=[r34],32
+ ldf8 f121=[r17],32 }
+{ .mmi; ldf8 f122=[r18],32
+ ldf8 f123=[r19],32 };;
+{ .mmi; ldf8 f124=[r34]
+ ldf8 f125=[r17] }
+{ .mmi; ldf8 f126=[r18]
+ ldf8 f127=[r19] }
+
+{ .mmi; ldf8 f33=[r14],32
+ ldf8 f34=[r15],32 }
+{ .mmi; ldf8 f35=[r16],32;;
+ ldf8 f36=[r33] }
+{ .mmi; ldf8 f37=[r14]
+ ldf8 f38=[r15] }
+{ .mfi; ldf8 f39=[r16]
+// -------\ Entering multiplier's heaven /-------
+// ------------\ /------------
+// -----------------\ /-----------------
+// ----------------------\/----------------------
+ xma.hu f41=f32,f120,f0 }
+{ .mfi; xma.lu f40=f32,f120,f0 };; // (*)
+{ .mfi; xma.hu f51=f32,f121,f0 }
+{ .mfi; xma.lu f50=f32,f121,f0 };;
+{ .mfi; xma.hu f61=f32,f122,f0 }
+{ .mfi; xma.lu f60=f32,f122,f0 };;
+{ .mfi; xma.hu f71=f32,f123,f0 }
+{ .mfi; xma.lu f70=f32,f123,f0 };;
+{ .mfi; xma.hu f81=f32,f124,f0 }
+{ .mfi; xma.lu f80=f32,f124,f0 };;
+{ .mfi; xma.hu f91=f32,f125,f0 }
+{ .mfi; xma.lu f90=f32,f125,f0 };;
+{ .mfi; xma.hu f101=f32,f126,f0 }
+{ .mfi; xma.lu f100=f32,f126,f0 };;
+{ .mfi; xma.hu f111=f32,f127,f0 }
+{ .mfi; xma.lu f110=f32,f127,f0 };;//
+// (*) You can argue that splitting at every second bundle would
+// prevent "wider" IA-64 implementations from achieving the peak
+// performance. Well, not really... The catch is that if you
+// intend to keep 4 FP units busy by splitting at every fourth
+// bundle and thus perform these 16 multiplications in 4 ticks,
+// the first bundle *below* would stall because the result from
+// the first xma bundle *above* won't be available for another 3
+// ticks (if not more, being an optimist, I assume that "wider"
+// implementation will have same latency:-). This stall will hold
+// you back and the performance would be as if every second bundle
+// were split *anyway*...
+{ .mfi; getf.sig r16=f40
+ xma.hu f42=f33,f120,f41
+ add r33=8,r32 }
+{ .mfi; xma.lu f41=f33,f120,f41 };;
+{ .mfi; getf.sig r24=f50
+ xma.hu f52=f33,f121,f51 }
+{ .mfi; xma.lu f51=f33,f121,f51 };;
+{ .mfi; st8 [r32]=r16,16
+ xma.hu f62=f33,f122,f61 }
+{ .mfi; xma.lu f61=f33,f122,f61 };;
+{ .mfi; xma.hu f72=f33,f123,f71 }
+{ .mfi; xma.lu f71=f33,f123,f71 };;
+{ .mfi; xma.hu f82=f33,f124,f81 }
+{ .mfi; xma.lu f81=f33,f124,f81 };;
+{ .mfi; xma.hu f92=f33,f125,f91 }
+{ .mfi; xma.lu f91=f33,f125,f91 };;
+{ .mfi; xma.hu f102=f33,f126,f101 }
+{ .mfi; xma.lu f101=f33,f126,f101 };;
+{ .mfi; xma.hu f112=f33,f127,f111 }
+{ .mfi; xma.lu f111=f33,f127,f111 };;//
+//-------------------------------------------------//
+{ .mfi; getf.sig r25=f41
+ xma.hu f43=f34,f120,f42 }
+{ .mfi; xma.lu f42=f34,f120,f42 };;
+{ .mfi; getf.sig r16=f60
+ xma.hu f53=f34,f121,f52 }
+{ .mfi; xma.lu f52=f34,f121,f52 };;
+{ .mfi; getf.sig r17=f51
+ xma.hu f63=f34,f122,f62
+ add r25=r25,r24 }
+{ .mfi; xma.lu f62=f34,f122,f62
+ mov carry1=0 };;
+{ .mfi; cmp.ltu p6,p0=r25,r24
+ xma.hu f73=f34,f123,f72 }
+{ .mfi; xma.lu f72=f34,f123,f72 };;
+{ .mfi; st8 [r33]=r25,16
+ xma.hu f83=f34,f124,f82
+(p6) add carry1=1,carry1 }
+{ .mfi; xma.lu f82=f34,f124,f82 };;
+{ .mfi; xma.hu f93=f34,f125,f92 }
+{ .mfi; xma.lu f92=f34,f125,f92 };;
+{ .mfi; xma.hu f103=f34,f126,f102 }
+{ .mfi; xma.lu f102=f34,f126,f102 };;
+{ .mfi; xma.hu f113=f34,f127,f112 }
+{ .mfi; xma.lu f112=f34,f127,f112 };;//
+//-------------------------------------------------//
+{ .mfi; getf.sig r18=f42
+ xma.hu f44=f35,f120,f43
+ add r17=r17,r16 }
+{ .mfi; xma.lu f43=f35,f120,f43 };;
+{ .mfi; getf.sig r24=f70
+ xma.hu f54=f35,f121,f53 }
+{ .mfi; mov carry2=0
+ xma.lu f53=f35,f121,f53 };;
+{ .mfi; getf.sig r25=f61
+ xma.hu f64=f35,f122,f63
+ cmp.ltu p7,p0=r17,r16 }
+{ .mfi; add r18=r18,r17
+ xma.lu f63=f35,f122,f63 };;
+{ .mfi; getf.sig r26=f52
+ xma.hu f74=f35,f123,f73
+(p7) add carry2=1,carry2 }
+{ .mfi; cmp.ltu p7,p0=r18,r17
+ xma.lu f73=f35,f123,f73
+ add r18=r18,carry1 };;
+{ .mfi;
+ xma.hu f84=f35,f124,f83
+(p7) add carry2=1,carry2 }
+{ .mfi; cmp.ltu p7,p0=r18,carry1
+ xma.lu f83=f35,f124,f83 };;
+{ .mfi; st8 [r32]=r18,16
+ xma.hu f94=f35,f125,f93
+(p7) add carry2=1,carry2 }
+{ .mfi; xma.lu f93=f35,f125,f93 };;
+{ .mfi; xma.hu f104=f35,f126,f103 }
+{ .mfi; xma.lu f103=f35,f126,f103 };;
+{ .mfi; xma.hu f114=f35,f127,f113 }
+{ .mfi; mov carry1=0
+ xma.lu f113=f35,f127,f113
+ add r25=r25,r24 };;//
+//-------------------------------------------------//
+{ .mfi; getf.sig r27=f43
+ xma.hu f45=f36,f120,f44
+ cmp.ltu p6,p0=r25,r24 }
+{ .mfi; xma.lu f44=f36,f120,f44
+ add r26=r26,r25 };;
+{ .mfi; getf.sig r16=f80
+ xma.hu f55=f36,f121,f54
+(p6) add carry1=1,carry1 }
+{ .mfi; xma.lu f54=f36,f121,f54 };;
+{ .mfi; getf.sig r17=f71
+ xma.hu f65=f36,f122,f64
+ cmp.ltu p6,p0=r26,r25 }
+{ .mfi; xma.lu f64=f36,f122,f64
+ add r27=r27,r26 };;
+{ .mfi; getf.sig r18=f62
+ xma.hu f75=f36,f123,f74
+(p6) add carry1=1,carry1 }
+{ .mfi; cmp.ltu p6,p0=r27,r26
+ xma.lu f74=f36,f123,f74
+ add r27=r27,carry2 };;
+{ .mfi; getf.sig r19=f53
+ xma.hu f85=f36,f124,f84
+(p6) add carry1=1,carry1 }
+{ .mfi; xma.lu f84=f36,f124,f84
+ cmp.ltu p6,p0=r27,carry2 };;
+{ .mfi; st8 [r33]=r27,16
+ xma.hu f95=f36,f125,f94
+(p6) add carry1=1,carry1 }
+{ .mfi; xma.lu f94=f36,f125,f94 };;
+{ .mfi; xma.hu f105=f36,f126,f104 }
+{ .mfi; mov carry2=0
+ xma.lu f104=f36,f126,f104
+ add r17=r17,r16 };;
+{ .mfi; xma.hu f115=f36,f127,f114
+ cmp.ltu p7,p0=r17,r16 }
+{ .mfi; xma.lu f114=f36,f127,f114
+ add r18=r18,r17 };;//
+//-------------------------------------------------//
+{ .mfi; getf.sig r20=f44
+ xma.hu f46=f37,f120,f45
+(p7) add carry2=1,carry2 }
+{ .mfi; cmp.ltu p7,p0=r18,r17
+ xma.lu f45=f37,f120,f45
+ add r19=r19,r18 };;
+{ .mfi; getf.sig r24=f90
+ xma.hu f56=f37,f121,f55 }
+{ .mfi; xma.lu f55=f37,f121,f55 };;
+{ .mfi; getf.sig r25=f81
+ xma.hu f66=f37,f122,f65
+(p7) add carry2=1,carry2 }
+{ .mfi; cmp.ltu p7,p0=r19,r18
+ xma.lu f65=f37,f122,f65
+ add r20=r20,r19 };;
+{ .mfi; getf.sig r26=f72
+ xma.hu f76=f37,f123,f75
+(p7) add carry2=1,carry2 }
+{ .mfi; cmp.ltu p7,p0=r20,r19
+ xma.lu f75=f37,f123,f75
+ add r20=r20,carry1 };;
+{ .mfi; getf.sig r27=f63
+ xma.hu f86=f37,f124,f85
+(p7) add carry2=1,carry2 }
+{ .mfi; xma.lu f85=f37,f124,f85
+ cmp.ltu p7,p0=r20,carry1 };;
+{ .mfi; getf.sig r28=f54
+ xma.hu f96=f37,f125,f95
+(p7) add carry2=1,carry2 }
+{ .mfi; st8 [r32]=r20,16
+ xma.lu f95=f37,f125,f95 };;
+{ .mfi; xma.hu f106=f37,f126,f105 }
+{ .mfi; mov carry1=0
+ xma.lu f105=f37,f126,f105
+ add r25=r25,r24 };;
+{ .mfi; xma.hu f116=f37,f127,f115
+ cmp.ltu p6,p0=r25,r24 }
+{ .mfi; xma.lu f115=f37,f127,f115
+ add r26=r26,r25 };;//
+//-------------------------------------------------//
+{ .mfi; getf.sig r29=f45
+ xma.hu f47=f38,f120,f46
+(p6) add carry1=1,carry1 }
+{ .mfi; cmp.ltu p6,p0=r26,r25
+ xma.lu f46=f38,f120,f46
+ add r27=r27,r26 };;
+{ .mfi; getf.sig r16=f100
+ xma.hu f57=f38,f121,f56
+(p6) add carry1=1,carry1 }
+{ .mfi; cmp.ltu p6,p0=r27,r26
+ xma.lu f56=f38,f121,f56
+ add r28=r28,r27 };;
+{ .mfi; getf.sig r17=f91
+ xma.hu f67=f38,f122,f66
+(p6) add carry1=1,carry1 }
+{ .mfi; cmp.ltu p6,p0=r28,r27
+ xma.lu f66=f38,f122,f66
+ add r29=r29,r28 };;
+{ .mfi; getf.sig r18=f82
+ xma.hu f77=f38,f123,f76
+(p6) add carry1=1,carry1 }
+{ .mfi; cmp.ltu p6,p0=r29,r28
+ xma.lu f76=f38,f123,f76
+ add r29=r29,carry2 };;
+{ .mfi; getf.sig r19=f73
+ xma.hu f87=f38,f124,f86
+(p6) add carry1=1,carry1 }
+{ .mfi; xma.lu f86=f38,f124,f86
+ cmp.ltu p6,p0=r29,carry2 };;
+{ .mfi; getf.sig r20=f64
+ xma.hu f97=f38,f125,f96
+(p6) add carry1=1,carry1 }
+{ .mfi; st8 [r33]=r29,16
+ xma.lu f96=f38,f125,f96 };;
+{ .mfi; getf.sig r21=f55
+ xma.hu f107=f38,f126,f106 }
+{ .mfi; mov carry2=0
+ xma.lu f106=f38,f126,f106
+ add r17=r17,r16 };;
+{ .mfi; xma.hu f117=f38,f127,f116
+ cmp.ltu p7,p0=r17,r16 }
+{ .mfi; xma.lu f116=f38,f127,f116
+ add r18=r18,r17 };;//
+//-------------------------------------------------//
+{ .mfi; getf.sig r22=f46
+ xma.hu f48=f39,f120,f47
+(p7) add carry2=1,carry2 }
+{ .mfi; cmp.ltu p7,p0=r18,r17
+ xma.lu f47=f39,f120,f47
+ add r19=r19,r18 };;
+{ .mfi; getf.sig r24=f110
+ xma.hu f58=f39,f121,f57
+(p7) add carry2=1,carry2 }
+{ .mfi; cmp.ltu p7,p0=r19,r18
+ xma.lu f57=f39,f121,f57
+ add r20=r20,r19 };;
+{ .mfi; getf.sig r25=f101
+ xma.hu f68=f39,f122,f67
+(p7) add carry2=1,carry2 }
+{ .mfi; cmp.ltu p7,p0=r20,r19
+ xma.lu f67=f39,f122,f67
+ add r21=r21,r20 };;
+{ .mfi; getf.sig r26=f92
+ xma.hu f78=f39,f123,f77
+(p7) add carry2=1,carry2 }
+{ .mfi; cmp.ltu p7,p0=r21,r20
+ xma.lu f77=f39,f123,f77
+ add r22=r22,r21 };;
+{ .mfi; getf.sig r27=f83
+ xma.hu f88=f39,f124,f87
+(p7) add carry2=1,carry2 }
+{ .mfi; cmp.ltu p7,p0=r22,r21
+ xma.lu f87=f39,f124,f87
+ add r22=r22,carry1 };;
+{ .mfi; getf.sig r28=f74
+ xma.hu f98=f39,f125,f97
+(p7) add carry2=1,carry2 }
+{ .mfi; xma.lu f97=f39,f125,f97
+ cmp.ltu p7,p0=r22,carry1 };;
+{ .mfi; getf.sig r29=f65
+ xma.hu f108=f39,f126,f107
+(p7) add carry2=1,carry2 }
+{ .mfi; st8 [r32]=r22,16
+ xma.lu f107=f39,f126,f107 };;
+{ .mfi; getf.sig r30=f56
+ xma.hu f118=f39,f127,f117 }
+{ .mfi; xma.lu f117=f39,f127,f117 };;//
+//-------------------------------------------------//
+// Leaving muliplier's heaven... Quite a ride, huh?
+
+{ .mii; getf.sig r31=f47
+ add r25=r25,r24
+ mov carry1=0 };;
+{ .mii; getf.sig r16=f111
+ cmp.ltu p6,p0=r25,r24
+ add r26=r26,r25 };;
+{ .mfb; getf.sig r17=f102 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r26,r25
+ add r27=r27,r26 };;
+{ .mfb; nop.m 0x0 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r27,r26
+ add r28=r28,r27 };;
+{ .mii; getf.sig r18=f93
+ add r17=r17,r16
+ mov carry3=0 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r28,r27
+ add r29=r29,r28 };;
+{ .mii; getf.sig r19=f84
+ cmp.ltu p7,p0=r17,r16 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r29,r28
+ add r30=r30,r29 };;
+{ .mii; getf.sig r20=f75
+ add r18=r18,r17 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r30,r29
+ add r31=r31,r30 };;
+{ .mfb; getf.sig r21=f66 }
+{ .mii; (p7) add carry3=1,carry3
+ cmp.ltu p7,p0=r18,r17
+ add r19=r19,r18 }
+{ .mfb; nop.m 0x0 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r31,r30
+ add r31=r31,carry2 };;
+{ .mfb; getf.sig r22=f57 }
+{ .mii; (p7) add carry3=1,carry3
+ cmp.ltu p7,p0=r19,r18
+ add r20=r20,r19 }
+{ .mfb; nop.m 0x0 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r31,carry2 };;
+{ .mfb; getf.sig r23=f48 }
+{ .mii; (p7) add carry3=1,carry3
+ cmp.ltu p7,p0=r20,r19
+ add r21=r21,r20 }
+{ .mii;
+(p6) add carry1=1,carry1 }
+{ .mfb; st8 [r33]=r31,16 };;
+
+{ .mfb; getf.sig r24=f112 }
+{ .mii; (p7) add carry3=1,carry3
+ cmp.ltu p7,p0=r21,r20
+ add r22=r22,r21 };;
+{ .mfb; getf.sig r25=f103 }
+{ .mii; (p7) add carry3=1,carry3
+ cmp.ltu p7,p0=r22,r21
+ add r23=r23,r22 };;
+{ .mfb; getf.sig r26=f94 }
+{ .mii; (p7) add carry3=1,carry3
+ cmp.ltu p7,p0=r23,r22
+ add r23=r23,carry1 };;
+{ .mfb; getf.sig r27=f85 }
+{ .mii; (p7) add carry3=1,carry3
+ cmp.ltu p7,p8=r23,carry1};;
+{ .mii; getf.sig r28=f76
+ add r25=r25,r24
+ mov carry1=0 }
+{ .mii; st8 [r32]=r23,16
+ (p7) add carry2=1,carry3
+ (p8) add carry2=0,carry3 };;
+
+{ .mfb; nop.m 0x0 }
+{ .mii; getf.sig r29=f67
+ cmp.ltu p6,p0=r25,r24
+ add r26=r26,r25 };;
+{ .mfb; getf.sig r30=f58 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r26,r25
+ add r27=r27,r26 };;
+{ .mfb; getf.sig r16=f113 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r27,r26
+ add r28=r28,r27 };;
+{ .mfb; getf.sig r17=f104 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r28,r27
+ add r29=r29,r28 };;
+{ .mfb; getf.sig r18=f95 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r29,r28
+ add r30=r30,r29 };;
+{ .mii; getf.sig r19=f86
+ add r17=r17,r16
+ mov carry3=0 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r30,r29
+ add r30=r30,carry2 };;
+{ .mii; getf.sig r20=f77
+ cmp.ltu p7,p0=r17,r16
+ add r18=r18,r17 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r30,carry2 };;
+{ .mfb; getf.sig r21=f68 }
+{ .mii; st8 [r33]=r30,16
+(p6) add carry1=1,carry1 };;
+
+{ .mfb; getf.sig r24=f114 }
+{ .mii; (p7) add carry3=1,carry3
+ cmp.ltu p7,p0=r18,r17
+ add r19=r19,r18 };;
+{ .mfb; getf.sig r25=f105 }
+{ .mii; (p7) add carry3=1,carry3
+ cmp.ltu p7,p0=r19,r18
+ add r20=r20,r19 };;
+{ .mfb; getf.sig r26=f96 }
+{ .mii; (p7) add carry3=1,carry3
+ cmp.ltu p7,p0=r20,r19
+ add r21=r21,r20 };;
+{ .mfb; getf.sig r27=f87 }
+{ .mii; (p7) add carry3=1,carry3
+ cmp.ltu p7,p0=r21,r20
+ add r21=r21,carry1 };;
+{ .mib; getf.sig r28=f78
+ add r25=r25,r24 }
+{ .mib; (p7) add carry3=1,carry3
+ cmp.ltu p7,p8=r21,carry1};;
+{ .mii; st8 [r32]=r21,16
+ (p7) add carry2=1,carry3
+ (p8) add carry2=0,carry3 }
+
+{ .mii; mov carry1=0
+ cmp.ltu p6,p0=r25,r24
+ add r26=r26,r25 };;
+{ .mfb; getf.sig r16=f115 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r26,r25
+ add r27=r27,r26 };;
+{ .mfb; getf.sig r17=f106 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r27,r26
+ add r28=r28,r27 };;
+{ .mfb; getf.sig r18=f97 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r28,r27
+ add r28=r28,carry2 };;
+{ .mib; getf.sig r19=f88
+ add r17=r17,r16 }
+{ .mib;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r28,carry2 };;
+{ .mii; st8 [r33]=r28,16
+(p6) add carry1=1,carry1 }
+
+{ .mii; mov carry2=0
+ cmp.ltu p7,p0=r17,r16
+ add r18=r18,r17 };;
+{ .mfb; getf.sig r24=f116 }
+{ .mii; (p7) add carry2=1,carry2
+ cmp.ltu p7,p0=r18,r17
+ add r19=r19,r18 };;
+{ .mfb; getf.sig r25=f107 }
+{ .mii; (p7) add carry2=1,carry2
+ cmp.ltu p7,p0=r19,r18
+ add r19=r19,carry1 };;
+{ .mfb; getf.sig r26=f98 }
+{ .mii; (p7) add carry2=1,carry2
+ cmp.ltu p7,p0=r19,carry1};;
+{ .mii; st8 [r32]=r19,16
+ (p7) add carry2=1,carry2 }
+
+{ .mfb; add r25=r25,r24 };;
+
+{ .mfb; getf.sig r16=f117 }
+{ .mii; mov carry1=0
+ cmp.ltu p6,p0=r25,r24
+ add r26=r26,r25 };;
+{ .mfb; getf.sig r17=f108 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r26,r25
+ add r26=r26,carry2 };;
+{ .mfb; nop.m 0x0 }
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r26,carry2 };;
+{ .mii; st8 [r33]=r26,16
+(p6) add carry1=1,carry1 }
+
+{ .mfb; add r17=r17,r16 };;
+{ .mfb; getf.sig r24=f118 }
+{ .mii; mov carry2=0
+ cmp.ltu p7,p0=r17,r16
+ add r17=r17,carry1 };;
+{ .mii; (p7) add carry2=1,carry2
+ cmp.ltu p7,p0=r17,carry1};;
+{ .mii; st8 [r32]=r17
+ (p7) add carry2=1,carry2 };;
+{ .mfb; add r24=r24,carry2 };;
+{ .mib; st8 [r33]=r24 }
+
+{ .mib; rum 1<<5 // clear um.mfh
+ br.ret.sptk.many b0 };;
+.endp bn_mul_comba8#
+#undef carry3
+#undef carry2
+#undef carry1
+#endif
+
+#if 1
+// It's possible to make it faster (see comment to bn_sqr_comba8), but
+// I reckon it doesn't worth the effort. Basically because the routine
+// (actually both of them) practically never called... So I just play
+// same trick as with bn_sqr_comba8.
+//
+// void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+//
+.global bn_sqr_comba4#
+.proc bn_sqr_comba4#
+.align 64
+bn_sqr_comba4:
+ .prologue
+ .save ar.pfs,r2
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+{ .mii; alloc r2=ar.pfs,2,1,0,0
+ addp4 r32=0,r32
+ addp4 r33=0,r33 };;
+{ .mii;
+#else
+{ .mii; alloc r2=ar.pfs,2,1,0,0
+#endif
+ mov r34=r33
+ add r14=8,r33 };;
+ .body
+{ .mii; add r17=8,r34
+ add r15=16,r33
+ add r18=16,r34 }
+{ .mfb; add r16=24,r33
+ br .L_cheat_entry_point4 };;
+.endp bn_sqr_comba4#
+#endif
+
+#if 1
+// Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever...
+//
+// void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+//
+#define carry1 r14
+#define carry2 r15
+.global bn_mul_comba4#
+.proc bn_mul_comba4#
+.align 64
+bn_mul_comba4:
+ .prologue
+ .save ar.pfs,r2
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+{ .mii; alloc r2=ar.pfs,3,0,0,0
+ addp4 r33=0,r33
+ addp4 r34=0,r34 };;
+{ .mii; addp4 r32=0,r32
+#else
+{ .mii; alloc r2=ar.pfs,3,0,0,0
+#endif
+ add r14=8,r33
+ add r17=8,r34 }
+ .body
+{ .mii; add r15=16,r33
+ add r18=16,r34
+ add r16=24,r33 };;
+.L_cheat_entry_point4:
+{ .mmi; add r19=24,r34
+
+ ldf8 f32=[r33] }
+
+{ .mmi; ldf8 f120=[r34]
+ ldf8 f121=[r17] };;
+{ .mmi; ldf8 f122=[r18]
+ ldf8 f123=[r19] }
+
+{ .mmi; ldf8 f33=[r14]
+ ldf8 f34=[r15] }
+{ .mfi; ldf8 f35=[r16]
+
+ xma.hu f41=f32,f120,f0 }
+{ .mfi; xma.lu f40=f32,f120,f0 };;
+{ .mfi; xma.hu f51=f32,f121,f0 }
+{ .mfi; xma.lu f50=f32,f121,f0 };;
+{ .mfi; xma.hu f61=f32,f122,f0 }
+{ .mfi; xma.lu f60=f32,f122,f0 };;
+{ .mfi; xma.hu f71=f32,f123,f0 }
+{ .mfi; xma.lu f70=f32,f123,f0 };;//
+// Major stall takes place here, and 3 more places below. Result from
+// first xma is not available for another 3 ticks.
+{ .mfi; getf.sig r16=f40
+ xma.hu f42=f33,f120,f41
+ add r33=8,r32 }
+{ .mfi; xma.lu f41=f33,f120,f41 };;
+{ .mfi; getf.sig r24=f50
+ xma.hu f52=f33,f121,f51 }
+{ .mfi; xma.lu f51=f33,f121,f51 };;
+{ .mfi; st8 [r32]=r16,16
+ xma.hu f62=f33,f122,f61 }
+{ .mfi; xma.lu f61=f33,f122,f61 };;
+{ .mfi; xma.hu f72=f33,f123,f71 }
+{ .mfi; xma.lu f71=f33,f123,f71 };;//
+//-------------------------------------------------//
+{ .mfi; getf.sig r25=f41
+ xma.hu f43=f34,f120,f42 }
+{ .mfi; xma.lu f42=f34,f120,f42 };;
+{ .mfi; getf.sig r16=f60
+ xma.hu f53=f34,f121,f52 }
+{ .mfi; xma.lu f52=f34,f121,f52 };;
+{ .mfi; getf.sig r17=f51
+ xma.hu f63=f34,f122,f62
+ add r25=r25,r24 }
+{ .mfi; mov carry1=0
+ xma.lu f62=f34,f122,f62 };;
+{ .mfi; st8 [r33]=r25,16
+ xma.hu f73=f34,f123,f72
+ cmp.ltu p6,p0=r25,r24 }
+{ .mfi; xma.lu f72=f34,f123,f72 };;//
+//-------------------------------------------------//
+{ .mfi; getf.sig r18=f42
+ xma.hu f44=f35,f120,f43
+(p6) add carry1=1,carry1 }
+{ .mfi; add r17=r17,r16
+ xma.lu f43=f35,f120,f43
+ mov carry2=0 };;
+{ .mfi; getf.sig r24=f70
+ xma.hu f54=f35,f121,f53
+ cmp.ltu p7,p0=r17,r16 }
+{ .mfi; xma.lu f53=f35,f121,f53 };;
+{ .mfi; getf.sig r25=f61
+ xma.hu f64=f35,f122,f63
+ add r18=r18,r17 }
+{ .mfi; xma.lu f63=f35,f122,f63
+(p7) add carry2=1,carry2 };;
+{ .mfi; getf.sig r26=f52
+ xma.hu f74=f35,f123,f73
+ cmp.ltu p7,p0=r18,r17 }
+{ .mfi; xma.lu f73=f35,f123,f73
+ add r18=r18,carry1 };;
+//-------------------------------------------------//
+{ .mii; st8 [r32]=r18,16
+(p7) add carry2=1,carry2
+ cmp.ltu p7,p0=r18,carry1 };;
+
+{ .mfi; getf.sig r27=f43 // last major stall
+(p7) add carry2=1,carry2 };;
+{ .mii; getf.sig r16=f71
+ add r25=r25,r24
+ mov carry1=0 };;
+{ .mii; getf.sig r17=f62
+ cmp.ltu p6,p0=r25,r24
+ add r26=r26,r25 };;
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r26,r25
+ add r27=r27,r26 };;
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r27,r26
+ add r27=r27,carry2 };;
+{ .mii; getf.sig r18=f53
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r27,carry2 };;
+{ .mfi; st8 [r33]=r27,16
+(p6) add carry1=1,carry1 }
+
+{ .mii; getf.sig r19=f44
+ add r17=r17,r16
+ mov carry2=0 };;
+{ .mii; getf.sig r24=f72
+ cmp.ltu p7,p0=r17,r16
+ add r18=r18,r17 };;
+{ .mii; (p7) add carry2=1,carry2
+ cmp.ltu p7,p0=r18,r17
+ add r19=r19,r18 };;
+{ .mii; (p7) add carry2=1,carry2
+ cmp.ltu p7,p0=r19,r18
+ add r19=r19,carry1 };;
+{ .mii; getf.sig r25=f63
+ (p7) add carry2=1,carry2
+ cmp.ltu p7,p0=r19,carry1};;
+{ .mii; st8 [r32]=r19,16
+ (p7) add carry2=1,carry2 }
+
+{ .mii; getf.sig r26=f54
+ add r25=r25,r24
+ mov carry1=0 };;
+{ .mii; getf.sig r16=f73
+ cmp.ltu p6,p0=r25,r24
+ add r26=r26,r25 };;
+{ .mii;
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r26,r25
+ add r26=r26,carry2 };;
+{ .mii; getf.sig r17=f64
+(p6) add carry1=1,carry1
+ cmp.ltu p6,p0=r26,carry2 };;
+{ .mii; st8 [r33]=r26,16
+(p6) add carry1=1,carry1 }
+
+{ .mii; getf.sig r24=f74
+ add r17=r17,r16
+ mov carry2=0 };;
+{ .mii; cmp.ltu p7,p0=r17,r16
+ add r17=r17,carry1 };;
+
+{ .mii; (p7) add carry2=1,carry2
+ cmp.ltu p7,p0=r17,carry1};;
+{ .mii; st8 [r32]=r17,16
+ (p7) add carry2=1,carry2 };;
+
+{ .mii; add r24=r24,carry2 };;
+{ .mii; st8 [r33]=r24 }
+
+{ .mib; rum 1<<5 // clear um.mfh
+ br.ret.sptk.many b0 };;
+.endp bn_mul_comba4#
+#undef carry2
+#undef carry1
+#endif
+
+#if 1
+//
+// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
+//
+// In the nutshell it's a port of my MIPS III/IV implementation.
+//
+#define AT r14
+#define H r16
+#define HH r20
+#define L r17
+#define D r18
+#define DH r22
+#define I r21
+
+#if 0
+// Some preprocessors (most notably HP-UX) appear to be allergic to
+// macros enclosed to parenthesis [as these three were].
+#define cont p16
+#define break p0 // p20
+#define equ p24
+#else
+cont=p16
+break=p0
+equ=p24
+#endif
+
+.global abort#
+.global bn_div_words#
+.proc bn_div_words#
+.align 64
+bn_div_words:
+ .prologue
+ .save ar.pfs,r2
+{ .mii; alloc r2=ar.pfs,3,5,0,8
+ .save b0,r3
+ mov r3=b0
+ .save pr,r10
+ mov r10=pr };;
+{ .mmb; cmp.eq p6,p0=r34,r0
+ mov r8=-1
+(p6) br.ret.spnt.many b0 };;
+
+ .body
+{ .mii; mov H=r32 // save h
+ mov ar.ec=0 // don't rotate at exit
+ mov pr.rot=0 }
+{ .mii; mov L=r33 // save l
+ mov r36=r0 };;
+
+.L_divw_shift: // -vv- note signed comparison
+{ .mfi; (p0) cmp.lt p16,p0=r0,r34 // d
+ (p0) shladd r33=r34,1,r0 }
+{ .mfb; (p0) add r35=1,r36
+ (p0) nop.f 0x0
+(p16) br.wtop.dpnt .L_divw_shift };;
+
+{ .mii; mov D=r34
+ shr.u DH=r34,32
+ sub r35=64,r36 };;
+{ .mii; setf.sig f7=DH
+ shr.u AT=H,r35
+ mov I=r36 };;
+{ .mib; cmp.ne p6,p0=r0,AT
+ shl H=H,r36
+(p6) br.call.spnt.clr b0=abort };; // overflow, die...
+
+{ .mfi; fcvt.xuf.s1 f7=f7
+ shr.u AT=L,r35 };;
+{ .mii; shl L=L,r36
+ or H=H,AT };;
+
+{ .mii; nop.m 0x0
+ cmp.leu p6,p0=D,H;;
+(p6) sub H=H,D }
+
+{ .mlx; setf.sig f14=D
+ movl AT=0xffffffff };;
+///////////////////////////////////////////////////////////
+{ .mii; setf.sig f6=H
+ shr.u HH=H,32;;
+ cmp.eq p6,p7=HH,DH };;
+{ .mfb;
+(p6) setf.sig f8=AT
+(p7) fcvt.xuf.s1 f6=f6
+(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
+
+{ .mfi; getf.sig r33=f8 // q
+ xmpy.lu f9=f8,f14 }
+{ .mfi; xmpy.hu f10=f8,f14
+ shrp H=H,L,32 };;
+
+{ .mmi; getf.sig r35=f9 // tl
+ getf.sig r31=f10 };; // th
+
+.L_divw_1st_iter:
+{ .mii; (p0) add r32=-1,r33
+ (p0) cmp.eq equ,cont=HH,r31 };;
+{ .mii; (p0) cmp.ltu p8,p0=r35,D
+ (p0) sub r34=r35,D
+ (equ) cmp.leu break,cont=r35,H };;
+{ .mib; (cont) cmp.leu cont,break=HH,r31
+ (p8) add r31=-1,r31
+(cont) br.wtop.spnt .L_divw_1st_iter };;
+///////////////////////////////////////////////////////////
+{ .mii; sub H=H,r35
+ shl r8=r33,32
+ shl L=L,32 };;
+///////////////////////////////////////////////////////////
+{ .mii; setf.sig f6=H
+ shr.u HH=H,32;;
+ cmp.eq p6,p7=HH,DH };;
+{ .mfb;
+(p6) setf.sig f8=AT
+(p7) fcvt.xuf.s1 f6=f6
+(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
+
+{ .mfi; getf.sig r33=f8 // q
+ xmpy.lu f9=f8,f14 }
+{ .mfi; xmpy.hu f10=f8,f14
+ shrp H=H,L,32 };;
+
+{ .mmi; getf.sig r35=f9 // tl
+ getf.sig r31=f10 };; // th
+
+.L_divw_2nd_iter:
+{ .mii; (p0) add r32=-1,r33
+ (p0) cmp.eq equ,cont=HH,r31 };;
+{ .mii; (p0) cmp.ltu p8,p0=r35,D
+ (p0) sub r34=r35,D
+ (equ) cmp.leu break,cont=r35,H };;
+{ .mib; (cont) cmp.leu cont,break=HH,r31
+ (p8) add r31=-1,r31
+(cont) br.wtop.spnt .L_divw_2nd_iter };;
+///////////////////////////////////////////////////////////
+{ .mii; sub H=H,r35
+ or r8=r8,r33
+ mov ar.pfs=r2 };;
+{ .mii; shr.u r9=H,I // remainder if anybody wants it
+ mov pr=r10,0x1ffff }
+{ .mfb; br.ret.sptk.many b0 };;
+
+// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
+// procedure.
+//
+// inputs: f6 = (double)a, f7 = (double)b
+// output: f8 = (int)(a/b)
+// clobbered: f8,f9,f10,f11,pred
+pred=p15
+// One can argue that this snippet is copyrighted to Intel
+// Corporation, as it's essentially identical to one of those
+// found in "Divide, Square Root and Remainder" section at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+// Yes, I admit that the referred code was used as template,
+// but after I realized that there hardly is any other instruction
+// sequence which would perform this operation. I mean I figure that
+// any independent attempt to implement high-performance division
+// will result in code virtually identical to the Intel code. It
+// should be noted though that below division kernel is 1 cycle
+// faster than Intel one (note commented splits:-), not to mention
+// original prologue (rather lack of one) and epilogue.
+.align 32
+.skip 16
+.L_udiv64_32_b6:
+ frcpa.s1 f8,pred=f6,f7;; // [0] y0 = 1 / b
+
+(pred) fnma.s1 f9=f7,f8,f1 // [5] e0 = 1 - b * y0
+(pred) fmpy.s1 f10=f6,f8;; // [5] q0 = a * y0
+(pred) fmpy.s1 f11=f9,f9 // [10] e1 = e0 * e0
+(pred) fma.s1 f10=f9,f10,f10;; // [10] q1 = q0 + e0 * q0
+(pred) fma.s1 f8=f9,f8,f8 //;; // [15] y1 = y0 + e0 * y0
+(pred) fma.s1 f9=f11,f10,f10;; // [15] q2 = q1 + e1 * q1
+(pred) fma.s1 f8=f11,f8,f8 //;; // [20] y2 = y1 + e1 * y1
+(pred) fnma.s1 f10=f7,f9,f6;; // [20] r2 = a - b * q2
+(pred) fma.s1 f8=f10,f8,f9;; // [25] q3 = q2 + r2 * y2
+
+ fcvt.fxu.trunc.s1 f8=f8 // [30] q = trunc(q3)
+ br.ret.sptk.many b6;;
+.endp bn_div_words#
+#endif
diff --git a/openssl-1.1.0h/crypto/bn/asm/mips-mont.pl b/openssl-1.1.0h/crypto/bn/asm/mips-mont.pl
new file mode 100644
index 0000000..a907571
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/mips-mont.pl
@@ -0,0 +1,433 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This module doesn't present direct interest for OpenSSL, because it
+# doesn't provide better performance for longer keys, at least not on
+# in-order-execution cores. While 512-bit RSA sign operations can be
+# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
+# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
+# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
+# verify:-( All comparisons are against bn_mul_mont-free assembler.
+# The module might be of interest to embedded system developers, as
+# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
+# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
+# code.
+
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+ $PTR_ADD="dadd"; # incidentally works even on n32
+ $PTR_SUB="dsub"; # incidentally works even on n32
+ $REG_S="sd";
+ $REG_L="ld";
+ $SZREG=8;
+} else {
+ $PTR_ADD="add";
+ $PTR_SUB="sub";
+ $REG_S="sw";
+ $REG_L="lw";
+ $SZREG=4;
+}
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+if ($flavour =~ /64|n32/i) {
+ $LD="ld";
+ $ST="sd";
+ $MULTU="dmultu";
+ $ADDU="daddu";
+ $SUBU="dsubu";
+ $BNSZ=8;
+} else {
+ $LD="lw";
+ $ST="sw";
+ $MULTU="multu";
+ $ADDU="addu";
+ $SUBU="subu";
+ $BNSZ=4;
+}
+
+# int bn_mul_mont(
+$rp=$a0; # BN_ULONG *rp,
+$ap=$a1; # const BN_ULONG *ap,
+$bp=$a2; # const BN_ULONG *bp,
+$np=$a3; # const BN_ULONG *np,
+$n0=$a4; # const BN_ULONG *n0,
+$num=$a5; # int num);
+
+$lo0=$a6;
+$hi0=$a7;
+$lo1=$t1;
+$hi1=$t2;
+$aj=$s0;
+$bi=$s1;
+$nj=$s2;
+$tp=$s3;
+$alo=$s4;
+$ahi=$s5;
+$nlo=$s6;
+$nhi=$s7;
+$tj=$s8;
+$i=$s9;
+$j=$s10;
+$m1=$s11;
+
+$FRAMESIZE=14;
+
+$code=<<___;
+.text
+
+.set noat
+.set noreorder
+
+.align 5
+.globl bn_mul_mont
+.ent bn_mul_mont
+bn_mul_mont:
+___
+$code.=<<___ if ($flavour =~ /o32/i);
+ lw $n0,16($sp)
+ lw $num,20($sp)
+___
+$code.=<<___;
+ slt $at,$num,4
+ bnez $at,1f
+ li $t0,0
+ slt $at,$num,17 # on in-order CPU
+ bnez $at,bn_mul_mont_internal
+ nop
+1: jr $ra
+ li $a0,0
+.end bn_mul_mont
+
+.align 5
+.ent bn_mul_mont_internal
+bn_mul_mont_internal:
+ .frame $fp,$FRAMESIZE*$SZREG,$ra
+ .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
+ $PTR_SUB $sp,$FRAMESIZE*$SZREG
+ $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
+ $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+ move $fp,$sp
+
+ .set reorder
+ $LD $n0,0($n0)
+ $LD $bi,0($bp) # bp[0]
+ $LD $aj,0($ap) # ap[0]
+ $LD $nj,0($np) # np[0]
+
+ $PTR_SUB $sp,2*$BNSZ # place for two extra words
+ sll $num,`log($BNSZ)/log(2)`
+ li $at,-4096
+ $PTR_SUB $sp,$num
+ and $sp,$at
+
+ $MULTU $aj,$bi
+ $LD $alo,$BNSZ($ap)
+ $LD $nlo,$BNSZ($np)
+ mflo $lo0
+ mfhi $hi0
+ $MULTU $lo0,$n0
+ mflo $m1
+
+ $MULTU $alo,$bi
+ mflo $alo
+ mfhi $ahi
+
+ $MULTU $nj,$m1
+ mflo $lo1
+ mfhi $hi1
+ $MULTU $nlo,$m1
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+ mflo $nlo
+ mfhi $nhi
+
+ move $tp,$sp
+ li $j,2*$BNSZ
+.align 4
+.L1st:
+ .set noreorder
+ $PTR_ADD $aj,$ap,$j
+ $PTR_ADD $nj,$np,$j
+ $LD $aj,($aj)
+ $LD $nj,($nj)
+
+ $MULTU $aj,$bi
+ $ADDU $lo0,$alo,$hi0
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo0,$hi0
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi0,$ahi,$at
+ $ADDU $hi1,$nhi,$t0
+ mflo $alo
+ mfhi $ahi
+
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $MULTU $nj,$m1
+ $ADDU $hi1,$at
+ addu $j,$BNSZ
+ $ST $lo1,($tp)
+ sltu $t0,$j,$num
+ mflo $nlo
+ mfhi $nhi
+
+ bnez $t0,.L1st
+ $PTR_ADD $tp,$BNSZ
+ .set reorder
+
+ $ADDU $lo0,$alo,$hi0
+ sltu $at,$lo0,$hi0
+ $ADDU $hi0,$ahi,$at
+
+ $ADDU $lo1,$nlo,$hi1
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi1,$nhi,$t0
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+
+ $ST $lo1,($tp)
+
+ $ADDU $hi1,$hi0
+ sltu $at,$hi1,$hi0
+ $ST $hi1,$BNSZ($tp)
+ $ST $at,2*$BNSZ($tp)
+
+ li $i,$BNSZ
+.align 4
+.Louter:
+ $PTR_ADD $bi,$bp,$i
+ $LD $bi,($bi)
+ $LD $aj,($ap)
+ $LD $alo,$BNSZ($ap)
+ $LD $tj,($sp)
+
+ $MULTU $aj,$bi
+ $LD $nj,($np)
+ $LD $nlo,$BNSZ($np)
+ mflo $lo0
+ mfhi $hi0
+ $ADDU $lo0,$tj
+ $MULTU $lo0,$n0
+ sltu $at,$lo0,$tj
+ $ADDU $hi0,$at
+ mflo $m1
+
+ $MULTU $alo,$bi
+ mflo $alo
+ mfhi $ahi
+
+ $MULTU $nj,$m1
+ mflo $lo1
+ mfhi $hi1
+
+ $MULTU $nlo,$m1
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+ mflo $nlo
+ mfhi $nhi
+
+ move $tp,$sp
+ li $j,2*$BNSZ
+ $LD $tj,$BNSZ($tp)
+.align 4
+.Linner:
+ .set noreorder
+ $PTR_ADD $aj,$ap,$j
+ $PTR_ADD $nj,$np,$j
+ $LD $aj,($aj)
+ $LD $nj,($nj)
+
+ $MULTU $aj,$bi
+ $ADDU $lo0,$alo,$hi0
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo0,$hi0
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi0,$ahi,$at
+ $ADDU $hi1,$nhi,$t0
+ mflo $alo
+ mfhi $ahi
+
+ $ADDU $lo0,$tj
+ addu $j,$BNSZ
+ $MULTU $nj,$m1
+ sltu $at,$lo0,$tj
+ $ADDU $lo1,$lo0
+ $ADDU $hi0,$at
+ sltu $t0,$lo1,$lo0
+ $LD $tj,2*$BNSZ($tp)
+ $ADDU $hi1,$t0
+ sltu $at,$j,$num
+ mflo $nlo
+ mfhi $nhi
+ $ST $lo1,($tp)
+ bnez $at,.Linner
+ $PTR_ADD $tp,$BNSZ
+ .set reorder
+
+ $ADDU $lo0,$alo,$hi0
+ sltu $at,$lo0,$hi0
+ $ADDU $hi0,$ahi,$at
+ $ADDU $lo0,$tj
+ sltu $t0,$lo0,$tj
+ $ADDU $hi0,$t0
+
+ $LD $tj,2*$BNSZ($tp)
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo1,$hi1
+ $ADDU $hi1,$nhi,$at
+ $ADDU $lo1,$lo0
+ sltu $t0,$lo1,$lo0
+ $ADDU $hi1,$t0
+ $ST $lo1,($tp)
+
+ $ADDU $lo1,$hi1,$hi0
+ sltu $hi1,$lo1,$hi0
+ $ADDU $lo1,$tj
+ sltu $at,$lo1,$tj
+ $ADDU $hi1,$at
+ $ST $lo1,$BNSZ($tp)
+ $ST $hi1,2*$BNSZ($tp)
+
+ addu $i,$BNSZ
+ sltu $t0,$i,$num
+ bnez $t0,.Louter
+
+ .set noreorder
+ $PTR_ADD $tj,$sp,$num # &tp[num]
+ move $tp,$sp
+ move $ap,$sp
+ li $hi0,0 # clear borrow bit
+
+.align 4
+.Lsub: $LD $lo0,($tp)
+ $LD $lo1,($np)
+ $PTR_ADD $tp,$BNSZ
+ $PTR_ADD $np,$BNSZ
+ $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
+ sgtu $at,$lo1,$lo0
+ $SUBU $lo0,$lo1,$hi0
+ sgtu $hi0,$lo0,$lo1
+ $ST $lo0,($rp)
+ or $hi0,$at
+ sltu $at,$tp,$tj
+ bnez $at,.Lsub
+ $PTR_ADD $rp,$BNSZ
+
+ $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
+ move $tp,$sp
+ $PTR_SUB $rp,$num # restore rp
+ not $hi1,$hi0
+
+ and $ap,$hi0,$sp
+ and $bp,$hi1,$rp
+ or $ap,$ap,$bp # ap=borrow?tp:rp
+
+.align 4
+.Lcopy: $LD $aj,($ap)
+ $PTR_ADD $ap,$BNSZ
+ $ST $zero,($tp)
+ $PTR_ADD $tp,$BNSZ
+ sltu $at,$tp,$tj
+ $ST $aj,($rp)
+ bnez $at,.Lcopy
+ $PTR_ADD $rp,$BNSZ
+
+ li $a0,1
+ li $t0,1
+
+ .set noreorder
+ move $sp,$fp
+ $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
+ $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE*$SZREG
+.end bn_mul_mont_internal
+.rdata
+.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/mips.pl b/openssl-1.1.0h/crypto/bn/asm/mips.pl
new file mode 100644
index 0000000..420f01f
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/mips.pl
@@ -0,0 +1,2241 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project.
+#
+# Rights for redistribution and usage in source and binary forms are
+# granted according to the OpenSSL license. Warranty of any kind is
+# disclaimed.
+# ====================================================================
+
+
+# July 1999
+#
+# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
+#
+# The module is designed to work with either of the "new" MIPS ABI(5),
+# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
+# IRIX 5.x not only because it doesn't support new ABIs but also
+# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
+# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
+# cause illegal instruction exception:-(
+#
+# In addition the code depends on preprocessor flags set up by MIPSpro
+# compiler driver (either as or cc) and therefore (probably?) can't be
+# compiled by the GNU assembler. GNU C driver manages fine though...
+# I mean as long as -mmips-as is specified or is the default option,
+# because then it simply invokes /usr/bin/as which in turn takes
+# perfect care of the preprocessor definitions. Another neat feature
+# offered by the MIPSpro assembler is an optimization pass. This gave
+# me the opportunity to have the code looking more regular as all those
+# architecture dependent instruction rescheduling details were left to
+# the assembler. Cool, huh?
+#
+# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
+# goes way over 3 times faster!
+#
+# <appro@fy.chalmers.se>
+
+# October 2010
+#
+# Adapt the module even for 32-bit ABIs and other OSes. The former was
+# achieved by mechanical replacement of 64-bit arithmetic instructions
+# such as dmultu, daddu, etc. with their 32-bit counterparts and
+# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
+# >3x performance improvement naturally does not apply to 32-bit code
+# [because there is no instruction 32-bit compiler can't use], one
+# has to content with 40-85% improvement depending on benchmark and
+# key length, more for longer keys.
+
+$flavour = shift || "o32";
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+if ($flavour =~ /64|n32/i) {
+ $LD="ld";
+ $ST="sd";
+ $MULTU="dmultu";
+ $DIVU="ddivu";
+ $ADDU="daddu";
+ $SUBU="dsubu";
+ $SRL="dsrl";
+ $SLL="dsll";
+ $BNSZ=8;
+ $PTR_ADD="daddu";
+ $PTR_SUB="dsubu";
+ $SZREG=8;
+ $REG_S="sd";
+ $REG_L="ld";
+} else {
+ $LD="lw";
+ $ST="sw";
+ $MULTU="multu";
+ $DIVU="divu";
+ $ADDU="addu";
+ $SUBU="subu";
+ $SRL="srl";
+ $SLL="sll";
+ $BNSZ=4;
+ $PTR_ADD="addu";
+ $PTR_SUB="subu";
+ $SZREG=4;
+ $REG_S="sw";
+ $REG_L="lw";
+ $code=".set mips2\n";
+}
+
+# Below is N32/64 register layout used in the original module.
+#
+($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
+#
+# No special adaptation is required for O32. NUBI on the other hand
+# is treated by saving/restoring ($v1,$t0..$t3).
+
+$gp=$v1 if ($flavour =~ /nubi/i);
+
+$minus4=$v1;
+
+$code.=<<___;
+.rdata
+.asciiz "mips3.s, Version 1.2"
+.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
+
+.text
+.set noat
+
+.align 5
+.globl bn_mul_add_words
+.ent bn_mul_add_words
+bn_mul_add_words:
+ .set noreorder
+ bgtz $a2,bn_mul_add_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_mul_add_words
+
+.align 5
+.ent bn_mul_add_words_internal
+bn_mul_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ beqz $ta0,.L_bn_mul_add_words_tail
+
+.L_bn_mul_add_words_loop:
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ $LD $t1,0($a0)
+ $LD $t2,$BNSZ($a1)
+ $LD $t3,$BNSZ($a0)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta1,2*$BNSZ($a0)
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
+ # values", but it seems to work fine
+ # even on 64-bit registers.
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ $MULTU $t2,$a3
+ sltu $at,$t1,$at
+ $ST $t1,0($a0)
+ $ADDU $v0,$at
+
+ $LD $ta2,3*$BNSZ($a1)
+ $LD $ta3,3*$BNSZ($a0)
+ $ADDU $t3,$v0
+ sltu $v0,$t3,$v0
+ mflo $at
+ mfhi $t2
+ $ADDU $t3,$at
+ $ADDU $v0,$t2
+ $MULTU $ta0,$a3
+ sltu $at,$t3,$at
+ $ST $t3,$BNSZ($a0)
+ $ADDU $v0,$at
+
+ subu $a2,4
+ $PTR_ADD $a0,4*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ $ADDU $ta1,$v0
+ sltu $v0,$ta1,$v0
+ mflo $at
+ mfhi $ta0
+ $ADDU $ta1,$at
+ $ADDU $v0,$ta0
+ $MULTU $ta2,$a3
+ sltu $at,$ta1,$at
+ $ST $ta1,-2*$BNSZ($a0)
+ $ADDU $v0,$at
+
+
+ and $ta0,$a2,$minus4
+ $ADDU $ta3,$v0
+ sltu $v0,$ta3,$v0
+ mflo $at
+ mfhi $ta2
+ $ADDU $ta3,$at
+ $ADDU $v0,$ta2
+ sltu $at,$ta3,$at
+ $ST $ta3,-$BNSZ($a0)
+ .set noreorder
+ bgtz $ta0,.L_bn_mul_add_words_loop
+ $ADDU $v0,$at
+
+ beqz $a2,.L_bn_mul_add_words_return
+ nop
+
+.L_bn_mul_add_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ $LD $t1,0($a0)
+ subu $a2,1
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,0($a0)
+ $ADDU $v0,$at
+ beqz $a2,.L_bn_mul_add_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$a3
+ $LD $t1,$BNSZ($a0)
+ subu $a2,1
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$at
+ beqz $a2,.L_bn_mul_add_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$a3
+ $LD $t1,2*$BNSZ($a0)
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,2*$BNSZ($a0)
+ $ADDU $v0,$at
+
+.L_bn_mul_add_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_mul_add_words_internal
+
+.align 5
+.globl bn_mul_words
+.ent bn_mul_words
+bn_mul_words:
+ .set noreorder
+ bgtz $a2,bn_mul_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_mul_words
+
+.align 5
+.ent bn_mul_words_internal
+bn_mul_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ beqz $ta0,.L_bn_mul_words_tail
+
+.L_bn_mul_words_loop:
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ $LD $t2,$BNSZ($a1)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta2,3*$BNSZ($a1)
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $MULTU $t2,$a3
+ $ST $v0,0($a0)
+ $ADDU $v0,$t1,$t0
+
+ subu $a2,4
+ $PTR_ADD $a0,4*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ mflo $at
+ mfhi $t2
+ $ADDU $v0,$at
+ sltu $t3,$v0,$at
+ $MULTU $ta0,$a3
+ $ST $v0,-3*$BNSZ($a0)
+ $ADDU $v0,$t3,$t2
+
+ mflo $at
+ mfhi $ta0
+ $ADDU $v0,$at
+ sltu $ta1,$v0,$at
+ $MULTU $ta2,$a3
+ $ST $v0,-2*$BNSZ($a0)
+ $ADDU $v0,$ta1,$ta0
+
+ and $ta0,$a2,$minus4
+ mflo $at
+ mfhi $ta2
+ $ADDU $v0,$at
+ sltu $ta3,$v0,$at
+ $ST $v0,-$BNSZ($a0)
+ .set noreorder
+ bgtz $ta0,.L_bn_mul_words_loop
+ $ADDU $v0,$ta3,$ta2
+
+ beqz $a2,.L_bn_mul_words_return
+ nop
+
+.L_bn_mul_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ subu $a2,1
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,0($a0)
+ $ADDU $v0,$t1,$t0
+ beqz $a2,.L_bn_mul_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$a3
+ subu $a2,1
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,$BNSZ($a0)
+ $ADDU $v0,$t1,$t0
+ beqz $a2,.L_bn_mul_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$a3
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,2*$BNSZ($a0)
+ $ADDU $v0,$t1,$t0
+
+.L_bn_mul_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_mul_words_internal
+
+.align 5
+.globl bn_sqr_words
+.ent bn_sqr_words
+bn_sqr_words:
+ .set noreorder
+ bgtz $a2,bn_sqr_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_sqr_words
+
+.align 5
+.ent bn_sqr_words_internal
+bn_sqr_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ beqz $ta0,.L_bn_sqr_words_tail
+
+.L_bn_sqr_words_loop:
+ $LD $t0,0($a1)
+ $MULTU $t0,$t0
+ $LD $t2,$BNSZ($a1)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta2,3*$BNSZ($a1)
+ mflo $t1
+ mfhi $t0
+ $ST $t1,0($a0)
+ $ST $t0,$BNSZ($a0)
+
+ $MULTU $t2,$t2
+ subu $a2,4
+ $PTR_ADD $a0,8*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ mflo $t3
+ mfhi $t2
+ $ST $t3,-6*$BNSZ($a0)
+ $ST $t2,-5*$BNSZ($a0)
+
+ $MULTU $ta0,$ta0
+ mflo $ta1
+ mfhi $ta0
+ $ST $ta1,-4*$BNSZ($a0)
+ $ST $ta0,-3*$BNSZ($a0)
+
+
+ $MULTU $ta2,$ta2
+ and $ta0,$a2,$minus4
+ mflo $ta3
+ mfhi $ta2
+ $ST $ta3,-2*$BNSZ($a0)
+
+ .set noreorder
+ bgtz $ta0,.L_bn_sqr_words_loop
+ $ST $ta2,-$BNSZ($a0)
+
+ beqz $a2,.L_bn_sqr_words_return
+ nop
+
+.L_bn_sqr_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$t0
+ subu $a2,1
+ mflo $t1
+ mfhi $t0
+ $ST $t1,0($a0)
+ $ST $t0,$BNSZ($a0)
+ beqz $a2,.L_bn_sqr_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$t0
+ subu $a2,1
+ mflo $t1
+ mfhi $t0
+ $ST $t1,2*$BNSZ($a0)
+ $ST $t0,3*$BNSZ($a0)
+ beqz $a2,.L_bn_sqr_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$t0
+ mflo $t1
+ mfhi $t0
+ $ST $t1,4*$BNSZ($a0)
+ $ST $t0,5*$BNSZ($a0)
+
+.L_bn_sqr_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+
+.end bn_sqr_words_internal
+
+.align 5
+.globl bn_add_words
+.ent bn_add_words
+bn_add_words:
+ .set noreorder
+ bgtz $a3,bn_add_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_add_words
+
+.align 5
+.ent bn_add_words_internal
+bn_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $at,$a3,$minus4
+ beqz $at,.L_bn_add_words_tail
+
+.L_bn_add_words_loop:
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ subu $a3,4
+ $LD $t1,$BNSZ($a1)
+ and $at,$a3,$minus4
+ $LD $t2,2*$BNSZ($a1)
+ $PTR_ADD $a2,4*$BNSZ
+ $LD $t3,3*$BNSZ($a1)
+ $PTR_ADD $a0,4*$BNSZ
+ $LD $ta1,-3*$BNSZ($a2)
+ $PTR_ADD $a1,4*$BNSZ
+ $LD $ta2,-2*$BNSZ($a2)
+ $LD $ta3,-$BNSZ($a2)
+ $ADDU $ta0,$t0
+ sltu $t8,$ta0,$t0
+ $ADDU $t0,$ta0,$v0
+ sltu $v0,$t0,$ta0
+ $ST $t0,-4*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ $ADDU $ta1,$t1
+ sltu $t9,$ta1,$t1
+ $ADDU $t1,$ta1,$v0
+ sltu $v0,$t1,$ta1
+ $ST $t1,-3*$BNSZ($a0)
+ $ADDU $v0,$t9
+
+ $ADDU $ta2,$t2
+ sltu $t8,$ta2,$t2
+ $ADDU $t2,$ta2,$v0
+ sltu $v0,$t2,$ta2
+ $ST $t2,-2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ $ADDU $ta3,$t3
+ sltu $t9,$ta3,$t3
+ $ADDU $t3,$ta3,$v0
+ sltu $v0,$t3,$ta3
+ $ST $t3,-$BNSZ($a0)
+
+ .set noreorder
+ bgtz $at,.L_bn_add_words_loop
+ $ADDU $v0,$t9
+
+ beqz $a3,.L_bn_add_words_return
+ nop
+
+.L_bn_add_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ $ADDU $ta0,$t0
+ subu $a3,1
+ sltu $t8,$ta0,$t0
+ $ADDU $t0,$ta0,$v0
+ sltu $v0,$t0,$ta0
+ $ST $t0,0($a0)
+ $ADDU $v0,$t8
+ beqz $a3,.L_bn_add_words_return
+
+ $LD $t1,$BNSZ($a1)
+ $LD $ta1,$BNSZ($a2)
+ $ADDU $ta1,$t1
+ subu $a3,1
+ sltu $t9,$ta1,$t1
+ $ADDU $t1,$ta1,$v0
+ sltu $v0,$t1,$ta1
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$t9
+ beqz $a3,.L_bn_add_words_return
+
+ $LD $t2,2*$BNSZ($a1)
+ $LD $ta2,2*$BNSZ($a2)
+ $ADDU $ta2,$t2
+ sltu $t8,$ta2,$t2
+ $ADDU $t2,$ta2,$v0
+ sltu $v0,$t2,$ta2
+ $ST $t2,2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+.L_bn_add_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+
+.end bn_add_words_internal
+
+.align 5
+.globl bn_sub_words
+.ent bn_sub_words
+bn_sub_words:
+ .set noreorder
+ bgtz $a3,bn_sub_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$zero
+.end bn_sub_words
+
+.align 5
+.ent bn_sub_words_internal
+bn_sub_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $at,$a3,$minus4
+ beqz $at,.L_bn_sub_words_tail
+
+.L_bn_sub_words_loop:
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ subu $a3,4
+ $LD $t1,$BNSZ($a1)
+ and $at,$a3,$minus4
+ $LD $t2,2*$BNSZ($a1)
+ $PTR_ADD $a2,4*$BNSZ
+ $LD $t3,3*$BNSZ($a1)
+ $PTR_ADD $a0,4*$BNSZ
+ $LD $ta1,-3*$BNSZ($a2)
+ $PTR_ADD $a1,4*$BNSZ
+ $LD $ta2,-2*$BNSZ($a2)
+ $LD $ta3,-$BNSZ($a2)
+ sltu $t8,$t0,$ta0
+ $SUBU $ta0,$t0,$ta0
+ $SUBU $t0,$ta0,$v0
+ sgtu $v0,$t0,$ta0
+ $ST $t0,-4*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ sltu $t9,$t1,$ta1
+ $SUBU $ta1,$t1,$ta1
+ $SUBU $t1,$ta1,$v0
+ sgtu $v0,$t1,$ta1
+ $ST $t1,-3*$BNSZ($a0)
+ $ADDU $v0,$t9
+
+
+ sltu $t8,$t2,$ta2
+ $SUBU $ta2,$t2,$ta2
+ $SUBU $t2,$ta2,$v0
+ sgtu $v0,$t2,$ta2
+ $ST $t2,-2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ sltu $t9,$t3,$ta3
+ $SUBU $ta3,$t3,$ta3
+ $SUBU $t3,$ta3,$v0
+ sgtu $v0,$t3,$ta3
+ $ST $t3,-$BNSZ($a0)
+
+ .set noreorder
+ bgtz $at,.L_bn_sub_words_loop
+ $ADDU $v0,$t9
+
+ beqz $a3,.L_bn_sub_words_return
+ nop
+
+.L_bn_sub_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ subu $a3,1
+ sltu $t8,$t0,$ta0
+ $SUBU $ta0,$t0,$ta0
+ $SUBU $t0,$ta0,$v0
+ sgtu $v0,$t0,$ta0
+ $ST $t0,0($a0)
+ $ADDU $v0,$t8
+ beqz $a3,.L_bn_sub_words_return
+
+ $LD $t1,$BNSZ($a1)
+ subu $a3,1
+ $LD $ta1,$BNSZ($a2)
+ sltu $t9,$t1,$ta1
+ $SUBU $ta1,$t1,$ta1
+ $SUBU $t1,$ta1,$v0
+ sgtu $v0,$t1,$ta1
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$t9
+ beqz $a3,.L_bn_sub_words_return
+
+ $LD $t2,2*$BNSZ($a1)
+ $LD $ta2,2*$BNSZ($a2)
+ sltu $t8,$t2,$ta2
+ $SUBU $ta2,$t2,$ta2
+ $SUBU $t2,$ta2,$v0
+ sgtu $v0,$t2,$ta2
+ $ST $t2,2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+.L_bn_sub_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_sub_words_internal
+
+.align 5
+.globl bn_div_3_words
+.ent bn_div_3_words
+bn_div_3_words:
+ .set noreorder
+ move $a3,$a0 # we know that bn_div_words does not
+ # touch $a3, $ta2, $ta3 and preserves $a2
+ # so that we can save two arguments
+ # and return address in registers
+ # instead of stack:-)
+
+ $LD $a0,($a3)
+ move $ta2,$a1
+ bne $a0,$a2,bn_div_3_words_internal
+ $LD $a1,-$BNSZ($a3)
+ li $v0,-1
+ jr $ra
+ move $a0,$v0
+.end bn_div_3_words
+
+.align 5
+.ent bn_div_3_words_internal
+bn_div_3_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ move $ta3,$ra
+ bal bn_div_words_internal
+ move $ra,$ta3
+ $MULTU $ta2,$v0
+ $LD $t2,-2*$BNSZ($a3)
+ move $ta0,$zero
+ mfhi $t1
+ mflo $t0
+ sltu $t8,$t1,$a1
+.L_bn_div_3_words_inner_loop:
+ bnez $t8,.L_bn_div_3_words_inner_loop_done
+ sgeu $at,$t2,$t0
+ seq $t9,$t1,$a1
+ and $at,$t9
+ sltu $t3,$t0,$ta2
+ $ADDU $a1,$a2
+ $SUBU $t1,$t3
+ $SUBU $t0,$ta2
+ sltu $t8,$t1,$a1
+ sltu $ta0,$a1,$a2
+ or $t8,$ta0
+ .set noreorder
+ beqz $at,.L_bn_div_3_words_inner_loop
+ $SUBU $v0,1
+ $ADDU $v0,1
+ .set reorder
+.L_bn_div_3_words_inner_loop_done:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_div_3_words_internal
+
+.align 5
+.globl bn_div_words
+.ent bn_div_words
+bn_div_words:
+ .set noreorder
+ bnez $a2,bn_div_words_internal
+ li $v0,-1 # I would rather signal div-by-zero
+ # which can be done with 'break 7'
+ jr $ra
+ move $a0,$v0
+.end bn_div_words
+
+.align 5
+.ent bn_div_words_internal
+bn_div_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ move $v1,$zero
+ bltz $a2,.L_bn_div_words_body
+ move $t9,$v1
+ $SLL $a2,1
+ bgtz $a2,.-4
+ addu $t9,1
+
+ .set reorder
+ negu $t1,$t9
+ li $t2,-1
+ $SLL $t2,$t1
+ and $t2,$a0
+ $SRL $at,$a1,$t1
+ .set noreorder
+ beqz $t2,.+12
+ nop
+ break 6 # signal overflow
+ .set reorder
+ $SLL $a0,$t9
+ $SLL $a1,$t9
+ or $a0,$at
+___
+$QT=$ta0;
+$HH=$ta1;
+$DH=$v1;
+$code.=<<___;
+.L_bn_div_words_body:
+ $SRL $DH,$a2,4*$BNSZ # bits
+ sgeu $at,$a0,$a2
+ .set noreorder
+ beqz $at,.+12
+ nop
+ $SUBU $a0,$a2
+ .set reorder
+
+ li $QT,-1
+ $SRL $HH,$a0,4*$BNSZ # bits
+ $SRL $QT,4*$BNSZ # q=0xffffffff
+ beq $DH,$HH,.L_bn_div_words_skip_div1
+ $DIVU $zero,$a0,$DH
+ mflo $QT
+.L_bn_div_words_skip_div1:
+ $MULTU $a2,$QT
+ $SLL $t3,$a0,4*$BNSZ # bits
+ $SRL $at,$a1,4*$BNSZ # bits
+ or $t3,$at
+ mflo $t0
+ mfhi $t1
+.L_bn_div_words_inner_loop1:
+ sltu $t2,$t3,$t0
+ seq $t8,$HH,$t1
+ sltu $at,$HH,$t1
+ and $t2,$t8
+ sltu $v0,$t0,$a2
+ or $at,$t2
+ .set noreorder
+ beqz $at,.L_bn_div_words_inner_loop1_done
+ $SUBU $t1,$v0
+ $SUBU $t0,$a2
+ b .L_bn_div_words_inner_loop1
+ $SUBU $QT,1
+ .set reorder
+.L_bn_div_words_inner_loop1_done:
+
+ $SLL $a1,4*$BNSZ # bits
+ $SUBU $a0,$t3,$t0
+ $SLL $v0,$QT,4*$BNSZ # bits
+
+ li $QT,-1
+ $SRL $HH,$a0,4*$BNSZ # bits
+ $SRL $QT,4*$BNSZ # q=0xffffffff
+ beq $DH,$HH,.L_bn_div_words_skip_div2
+ $DIVU $zero,$a0,$DH
+ mflo $QT
+.L_bn_div_words_skip_div2:
+ $MULTU $a2,$QT
+ $SLL $t3,$a0,4*$BNSZ # bits
+ $SRL $at,$a1,4*$BNSZ # bits
+ or $t3,$at
+ mflo $t0
+ mfhi $t1
+.L_bn_div_words_inner_loop2:
+ sltu $t2,$t3,$t0
+ seq $t8,$HH,$t1
+ sltu $at,$HH,$t1
+ and $t2,$t8
+ sltu $v1,$t0,$a2
+ or $at,$t2
+ .set noreorder
+ beqz $at,.L_bn_div_words_inner_loop2_done
+ $SUBU $t1,$v1
+ $SUBU $t0,$a2
+ b .L_bn_div_words_inner_loop2
+ $SUBU $QT,1
+ .set reorder
+.L_bn_div_words_inner_loop2_done:
+
+ $SUBU $a0,$t3,$t0
+ or $v0,$QT
+ $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
+ $SRL $a2,$t9 # restore $a2
+
+ .set noreorder
+ move $a1,$v1
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_div_words_internal
+___
+undef $HH; undef $QT; undef $DH;
+
+($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
+($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
+
+($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
+($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
+
+($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
+
+$code.=<<___;
+
+.align 5
+.globl bn_mul_comba8
+.ent bn_mul_comba8
+bn_mul_comba8:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,12*$SZREG,$ra
+ .mask 0x803ff008,-$SZREG
+ $PTR_SUB $sp,12*$SZREG
+ $REG_S $ra,11*$SZREG($sp)
+ $REG_S $s5,10*$SZREG($sp)
+ $REG_S $s4,9*$SZREG($sp)
+ $REG_S $s3,8*$SZREG($sp)
+ $REG_S $s2,7*$SZREG($sp)
+ $REG_S $s1,6*$SZREG($sp)
+ $REG_S $s0,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x003f0000,-$SZREG
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $s5,5*$SZREG($sp)
+ $REG_S $s4,4*$SZREG($sp)
+ $REG_S $s3,3*$SZREG($sp)
+ $REG_S $s2,2*$SZREG($sp)
+ $REG_S $s1,1*$SZREG($sp)
+ $REG_S $s0,0*$SZREG($sp)
+___
+$code.=<<___;
+
+ .set reorder
+ $LD $a_0,0($a1) # If compiled with -mips3 option on
+ # R5000 box assembler barks on this
+ # 1ine with "should not have mult/div
+ # as last instruction in bb (R10K
+ # bug)" warning. If anybody out there
+ # has a clue about how to circumvent
+ # this do send me a note.
+ # <appro\@fy.chalmers.se>
+
+ $LD $b_0,0($a2)
+ $LD $a_1,$BNSZ($a1)
+ $LD $a_2,2*$BNSZ($a1)
+ $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_3,3*$BNSZ($a1)
+ $LD $b_1,$BNSZ($a2)
+ $LD $b_2,2*$BNSZ($a2)
+ $LD $b_3,3*$BNSZ($a2)
+ mflo $c_1
+ mfhi $c_2
+
+ $LD $a_4,4*$BNSZ($a1)
+ $LD $a_5,5*$BNSZ($a1)
+ $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
+ $LD $a_6,6*$BNSZ($a1)
+ $LD $a_7,7*$BNSZ($a1)
+ $LD $b_4,4*$BNSZ($a2)
+ $LD $b_5,5*$BNSZ($a2)
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
+ $ADDU $c_3,$t_2,$at
+ $LD $b_6,6*$BNSZ($a2)
+ $LD $b_7,7*$BNSZ($a2)
+ $ST $c_1,0($a0) # r[0]=c1;
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ $ST $c_2,$BNSZ($a0) # r[1]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
+ $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s5,10*$SZREG($sp)
+ $REG_L $s4,9*$SZREG($sp)
+ $REG_L $s3,8*$SZREG($sp)
+ $REG_L $s2,7*$SZREG($sp)
+ $REG_L $s1,6*$SZREG($sp)
+ $REG_L $s0,5*$SZREG($sp)
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ jr $ra
+ $PTR_ADD $sp,12*$SZREG
+___
+$code.=<<___ if ($flavour !~ /nubi/i);
+ $REG_L $s5,5*$SZREG($sp)
+ $REG_L $s4,4*$SZREG($sp)
+ $REG_L $s3,3*$SZREG($sp)
+ $REG_L $s2,2*$SZREG($sp)
+ $REG_L $s1,1*$SZREG($sp)
+ $REG_L $s0,0*$SZREG($sp)
+ jr $ra
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+.end bn_mul_comba8
+
+.align 5
+.globl bn_mul_comba4
+.ent bn_mul_comba4
+bn_mul_comba4:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ $LD $a_0,0($a1)
+ $LD $b_0,0($a2)
+ $LD $a_1,$BNSZ($a1)
+ $LD $a_2,2*$BNSZ($a1)
+ $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_3,3*$BNSZ($a1)
+ $LD $b_1,$BNSZ($a2)
+ $LD $b_2,2*$BNSZ($a2)
+ $LD $b_3,3*$BNSZ($a2)
+ mflo $c_1
+ mfhi $c_2
+ $ST $c_1,0($a0)
+
+ $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
+ $ADDU $c_3,$t_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ $ST $c_2,$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,3*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,5*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ $ST $c_1,6*$BNSZ($a0)
+ $ST $c_2,7*$BNSZ($a0)
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ nop
+.end bn_mul_comba4
+___
+
+($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
+
+sub add_c2 () {
+my ($hi,$lo,$c0,$c1,$c2,
+ $warm, # !$warm denotes first call with specific sequence of
+ # $c_[XYZ] when there is no Z-carry to accumulate yet;
+ $an,$bn # these two are arguments for multiplication which
+ # result is used in *next* step [which is why it's
+ # commented as "forward multiplication" below];
+ )=@_;
+$code.=<<___;
+ mflo $lo
+ mfhi $hi
+ $ADDU $c0,$lo
+ sltu $at,$c0,$lo
+ $MULTU $an,$bn # forward multiplication
+ $ADDU $c0,$lo
+ $ADDU $at,$hi
+ sltu $lo,$c0,$lo
+ $ADDU $c1,$at
+ $ADDU $hi,$lo
+___
+$code.=<<___ if (!$warm);
+ sltu $c2,$c1,$at
+ $ADDU $c1,$hi
+ sltu $hi,$c1,$hi
+ $ADDU $c2,$hi
+___
+$code.=<<___ if ($warm);
+ sltu $at,$c1,$at
+ $ADDU $c1,$hi
+ $ADDU $c2,$at
+ sltu $hi,$c1,$hi
+ $ADDU $c2,$hi
+___
+}
+
+$code.=<<___;
+
+.align 5
+.globl bn_sqr_comba8
+.ent bn_sqr_comba8
+bn_sqr_comba8:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ $LD $a_0,0($a1)
+ $LD $a_1,$BNSZ($a1)
+ $LD $a_2,2*$BNSZ($a1)
+ $LD $a_3,3*$BNSZ($a1)
+
+ $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_4,4*$BNSZ($a1)
+ $LD $a_5,5*$BNSZ($a1)
+ $LD $a_6,6*$BNSZ($a1)
+ $LD $a_7,7*$BNSZ($a1)
+ mflo $c_1
+ mfhi $c_2
+ $ST $c_1,0($a0)
+
+ $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $c_3,$t_2,$at
+ $ST $c_2,$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
+$code.=<<___;
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
+$code.=<<___;
+ $ST $c_1,3*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
+$code.=<<___;
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
+$code.=<<___;
+ $ST $c_3,5*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
+$code.=<<___;
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,6*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
+$code.=<<___;
+ $ST $c_2,7*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
+$code.=<<___;
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,8*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
+$code.=<<___;
+ $ST $c_1,9*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
+$code.=<<___;
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,10*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
+$code.=<<___;
+ $ST $c_3,11*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
+$code.=<<___;
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,12*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
+$code.=<<___;
+ $ST $c_2,13*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ $ST $c_3,14*$BNSZ($a0)
+ $ST $c_1,15*$BNSZ($a0)
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ nop
+.end bn_sqr_comba8
+
+.align 5
+.globl bn_sqr_comba4
+.ent bn_sqr_comba4
+bn_sqr_comba4:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ $LD $a_0,0($a1)
+ $LD $a_1,$BNSZ($a1)
+ $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_2,2*$BNSZ($a1)
+ $LD $a_3,3*$BNSZ($a1)
+ mflo $c_1
+ mfhi $c_2
+ $ST $c_1,0($a0)
+
+ $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $c_3,$t_2,$at
+ $ST $c_2,$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
+$code.=<<___;
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
+$code.=<<___;
+ $ST $c_1,3*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
+$code.=<<___;
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0)
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
+$code.=<<___;
+ $ST $c_3,5*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ $ST $c_1,6*$BNSZ($a0)
+ $ST $c_2,7*$BNSZ($a0)
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ nop
+.end bn_sqr_comba4
+___
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/pa-risc2.s b/openssl-1.1.0h/crypto/bn/asm/pa-risc2.s
new file mode 100644
index 0000000..413eac7
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/pa-risc2.s
@@ -0,0 +1,1624 @@
+; Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved.
+;
+; Licensed under the OpenSSL license (the "License"). You may not use
+; this file except in compliance with the License. You can obtain a copy
+; in the file LICENSE in the source distribution or at
+; https://www.openssl.org/source/license.html
+;
+; PA-RISC 2.0 implementation of bn_asm code, based on the
+; 64-bit version of the code. This code is effectively the
+; same as the 64-bit version except the register model is
+; slightly different given all values must be 32-bit between
+; function calls. Thus the 64-bit return values are returned
+; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit
+;
+;
+; This code is approximately 2x faster than the C version
+; for RSA/DSA.
+;
+; See http://devresource.hp.com/ for more details on the PA-RISC
+; architecture. Also see the book "PA-RISC 2.0 Architecture"
+; by Gerry Kane for information on the instruction set architecture.
+;
+; Code written by Chris Ruemmler (with some help from the HP C
+; compiler).
+;
+; The code compiles with HP's assembler
+;
+
+ .level 2.0N
+ .space $TEXT$
+ .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
+
+;
+; Global Register definitions used for the routines.
+;
+; Some information about HP's runtime architecture for 32-bits.
+;
+; "Caller save" means the calling function must save the register
+; if it wants the register to be preserved.
+; "Callee save" means if a function uses the register, it must save
+; the value before using it.
+;
+; For the floating point registers
+;
+; "caller save" registers: fr4-fr11, fr22-fr31
+; "callee save" registers: fr12-fr21
+; "special" registers: fr0-fr3 (status and exception registers)
+;
+; For the integer registers
+; value zero : r0
+; "caller save" registers: r1,r19-r26
+; "callee save" registers: r3-r18
+; return register : r2 (rp)
+; return values ; r28,r29 (ret0,ret1)
+; Stack pointer ; r30 (sp)
+; millicode return ptr ; r31 (also a caller save register)
+
+
+;
+; Arguments to the routines
+;
+r_ptr .reg %r26
+a_ptr .reg %r25
+b_ptr .reg %r24
+num .reg %r24
+n .reg %r23
+
+;
+; Note that the "w" argument for bn_mul_add_words and bn_mul_words
+; is passed on the stack at a delta of -56 from the top of stack
+; as the routine is entered.
+;
+
+;
+; Globals used in some routines
+;
+
+top_overflow .reg %r23
+high_mask .reg %r22 ; value 0xffffffff80000000L
+
+
+;------------------------------------------------------------------------------
+;
+; bn_mul_add_words
+;
+;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
+; int num, BN_ULONG w)
+;
+; arg0 = r_ptr
+; arg1 = a_ptr
+; arg3 = num
+; -56(sp) = w
+;
+; Local register definitions
+;
+
+fm1 .reg %fr22
+fm .reg %fr23
+ht_temp .reg %fr24
+ht_temp_1 .reg %fr25
+lt_temp .reg %fr26
+lt_temp_1 .reg %fr27
+fm1_1 .reg %fr28
+fm_1 .reg %fr29
+
+fw_h .reg %fr7L
+fw_l .reg %fr7R
+fw .reg %fr7
+
+fht_0 .reg %fr8L
+flt_0 .reg %fr8R
+t_float_0 .reg %fr8
+
+fht_1 .reg %fr9L
+flt_1 .reg %fr9R
+t_float_1 .reg %fr9
+
+tmp_0 .reg %r31
+tmp_1 .reg %r21
+m_0 .reg %r20
+m_1 .reg %r19
+ht_0 .reg %r1
+ht_1 .reg %r3
+lt_0 .reg %r4
+lt_1 .reg %r5
+m1_0 .reg %r6
+m1_1 .reg %r7
+rp_val .reg %r8
+rp_val_1 .reg %r9
+
+bn_mul_add_words
+ .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
+ .proc
+ .callinfo frame=128
+ .entry
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ NOP ; Needed to make the loop 16-byte aligned
+ NOP ; needed to make the loop 16-byte aligned
+
+ STD %r5,16(%sp) ; save r5
+ NOP
+ STD %r6,24(%sp) ; save r6
+ STD %r7,32(%sp) ; save r7
+
+ STD %r8,40(%sp) ; save r8
+ STD %r9,48(%sp) ; save r9
+ COPY %r0,%ret1 ; return 0 by default
+ DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
+
+ CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
+ LDO 128(%sp),%sp ; bump stack
+
+ ;
+ ; The loop is unrolled twice, so if there is only 1 number
+ ; then go straight to the cleanup code.
+ ;
+ CMPIB,= 1,num,bn_mul_add_words_single_top
+ FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
+
+ ;
+ ; This loop is unrolled 2 times (64-byte aligned as well)
+ ;
+ ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
+ ; two 32-bit mutiplies can be issued per cycle.
+ ;
+bn_mul_add_words_unroll2
+
+ FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+ FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+ LDD 0(r_ptr),rp_val ; rp[0]
+ LDD 8(r_ptr),rp_val_1 ; rp[1]
+
+ XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
+ XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
+ FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
+ FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
+
+ XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
+ XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
+ FSTD fm,-8(%sp) ; -8(sp) = m[0]
+ FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
+
+ XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
+ XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
+ FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
+ FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
+
+ XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
+ XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
+ FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
+ FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
+
+ LDD -8(%sp),m_0 ; m[0]
+ LDD -40(%sp),m_1 ; m[1]
+ LDD -16(%sp),m1_0 ; m1[0]
+ LDD -48(%sp),m1_1 ; m1[1]
+
+ LDD -24(%sp),ht_0 ; ht[0]
+ LDD -56(%sp),ht_1 ; ht[1]
+ ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
+ ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
+
+ LDD -32(%sp),lt_0
+ LDD -64(%sp),lt_1
+ CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
+ ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
+
+ CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
+ ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
+ EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
+ DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
+
+ EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
+ DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
+ ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
+ ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
+
+ ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
+ ADD,DC ht_0,%r0,ht_0 ; ht[0]++
+ ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
+ ADD,DC ht_1,%r0,ht_1 ; ht[1]++
+
+ ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c;
+ ADD,DC ht_0,%r0,ht_0 ; ht[0]++
+ ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
+ ADD,DC ht_0,%r0,ht_0 ; ht[0]++
+
+ LDO -2(num),num ; num = num - 2;
+ ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
+ ADD,DC ht_1,%r0,ht_1 ; ht[1]++
+ STD lt_0,0(r_ptr) ; rp[0] = lt[0]
+
+ ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
+ ADD,DC ht_1,%r0,%ret1 ; ht[1]++
+ LDO 16(a_ptr),a_ptr ; a_ptr += 2
+
+ STD lt_1,8(r_ptr) ; rp[1] = lt[1]
+ CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
+ LDO 16(r_ptr),r_ptr ; r_ptr += 2
+
+ CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
+
+ ;
+ ; Top of loop aligned on 64-byte boundary
+ ;
+bn_mul_add_words_single_top
+ FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+ LDD 0(r_ptr),rp_val ; rp[0]
+ LDO 8(a_ptr),a_ptr ; a_ptr++
+ XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
+ FSTD fm1,-16(%sp) ; -16(sp) = m1
+ XMPYU flt_0,fw_h,fm ; m = lt*fw_h
+ FSTD fm,-8(%sp) ; -8(sp) = m
+ XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
+ FSTD ht_temp,-24(%sp) ; -24(sp) = ht
+ XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
+ FSTD lt_temp,-32(%sp) ; -32(sp) = lt
+
+ LDD -8(%sp),m_0
+ LDD -16(%sp),m1_0 ; m1 = temp1
+ ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
+ LDD -24(%sp),ht_0
+ LDD -32(%sp),lt_0
+
+ CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
+ ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
+
+ EXTRD,U tmp_0,31,32,m_0 ; m>>32
+ DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
+
+ ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
+ ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+ ADD %ret1,tmp_0,lt_0 ; lt = lt + c;
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+ ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
+ ADD,DC ht_0,%r0,%ret1 ; ht++
+ STD lt_0,0(r_ptr) ; rp[0] = lt
+
+bn_mul_add_words_exit
+ .EXIT
+
+ EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
+ LDD -80(%sp),%r9 ; restore r9
+ LDD -88(%sp),%r8 ; restore r8
+ LDD -96(%sp),%r7 ; restore r7
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3 ; restore r3
+ .PROCEND ;in=23,24,25,26,29;out=28;
+
+;----------------------------------------------------------------------------
+;
+;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+;
+; arg0 = rp
+; arg1 = ap
+; arg3 = num
+; w on stack at -56(sp)
+
+bn_mul_words
+ .proc
+ .callinfo frame=128
+ .entry
+ .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ NOP
+ STD %r5,16(%sp) ; save r5
+
+ STD %r6,24(%sp) ; save r6
+ STD %r7,32(%sp) ; save r7
+ COPY %r0,%ret1 ; return 0 by default
+ DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
+
+ CMPIB,>= 0,num,bn_mul_words_exit
+ LDO 128(%sp),%sp ; bump stack
+
+ ;
+ ; See if only 1 word to do, thus just do cleanup
+ ;
+ CMPIB,= 1,num,bn_mul_words_single_top
+ FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
+
+ ;
+ ; This loop is unrolled 2 times (64-byte aligned as well)
+ ;
+ ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
+ ; two 32-bit mutiplies can be issued per cycle.
+ ;
+bn_mul_words_unroll2
+
+ FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+ FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+ XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
+ XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
+
+ FSTD fm1,-16(%sp) ; -16(sp) = m1
+ FSTD fm1_1,-48(%sp) ; -48(sp) = m1
+ XMPYU flt_0,fw_h,fm ; m = lt*fw_h
+ XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
+
+ FSTD fm,-8(%sp) ; -8(sp) = m
+ FSTD fm_1,-40(%sp) ; -40(sp) = m
+ XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
+ XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
+
+ FSTD ht_temp,-24(%sp) ; -24(sp) = ht
+ FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
+ XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
+ XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
+
+ FSTD lt_temp,-32(%sp) ; -32(sp) = lt
+ FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
+ LDD -8(%sp),m_0
+ LDD -40(%sp),m_1
+
+ LDD -16(%sp),m1_0
+ LDD -48(%sp),m1_1
+ LDD -24(%sp),ht_0
+ LDD -56(%sp),ht_1
+
+ ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
+ ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
+ LDD -32(%sp),lt_0
+ LDD -64(%sp),lt_1
+
+ CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
+ ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
+ CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
+ ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
+
+ EXTRD,U tmp_0,31,32,m_0 ; m>>32
+ DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
+ EXTRD,U tmp_1,31,32,m_1 ; m>>32
+ DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
+
+ ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
+ ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
+ ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+
+ ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
+ ADD,DC ht_1,%r0,ht_1 ; ht++
+ ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1);
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+
+ ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
+ ADD,DC ht_1,%r0,ht_1 ; ht++
+ STD lt_0,0(r_ptr) ; rp[0] = lt
+ STD lt_1,8(r_ptr) ; rp[1] = lt
+
+ COPY ht_1,%ret1 ; carry = ht
+ LDO -2(num),num ; num = num - 2;
+ LDO 16(a_ptr),a_ptr ; ap += 2
+ CMPIB,<= 2,num,bn_mul_words_unroll2
+ LDO 16(r_ptr),r_ptr ; rp++
+
+ CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
+
+ ;
+ ; Top of loop aligned on 64-byte boundary
+ ;
+bn_mul_words_single_top
+ FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+
+ XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
+ FSTD fm1,-16(%sp) ; -16(sp) = m1
+ XMPYU flt_0,fw_h,fm ; m = lt*fw_h
+ FSTD fm,-8(%sp) ; -8(sp) = m
+ XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
+ FSTD ht_temp,-24(%sp) ; -24(sp) = ht
+ XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
+ FSTD lt_temp,-32(%sp) ; -32(sp) = lt
+
+ LDD -8(%sp),m_0
+ LDD -16(%sp),m1_0
+ ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
+ LDD -24(%sp),ht_0
+ LDD -32(%sp),lt_0
+
+ CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
+ ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
+
+ EXTRD,U tmp_0,31,32,m_0 ; m>>32
+ DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
+
+ ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
+ ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+
+ ADD %ret1,lt_0,lt_0 ; lt = lt + c;
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+
+ COPY ht_0,%ret1 ; copy carry
+ STD lt_0,0(r_ptr) ; rp[0] = lt
+
+bn_mul_words_exit
+ .EXIT
+ EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
+ LDD -96(%sp),%r7 ; restore r7
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3 ; restore r3
+ .PROCEND
+
+;----------------------------------------------------------------------------
+;
+;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
+;
+; arg0 = rp
+; arg1 = ap
+; arg2 = num
+;
+
+bn_sqr_words
+ .proc
+ .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+ .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .entry
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ NOP
+ STD %r5,16(%sp) ; save r5
+
+ CMPIB,>= 0,num,bn_sqr_words_exit
+ LDO 128(%sp),%sp ; bump stack
+
+ ;
+ ; If only 1, the goto straight to cleanup
+ ;
+ CMPIB,= 1,num,bn_sqr_words_single_top
+ DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
+
+ ;
+ ; This loop is unrolled 2 times (64-byte aligned as well)
+ ;
+
+bn_sqr_words_unroll2
+ FLDD 0(a_ptr),t_float_0 ; a[0]
+ FLDD 8(a_ptr),t_float_1 ; a[1]
+ XMPYU fht_0,flt_0,fm ; m[0]
+ XMPYU fht_1,flt_1,fm_1 ; m[1]
+
+ FSTD fm,-24(%sp) ; store m[0]
+ FSTD fm_1,-56(%sp) ; store m[1]
+ XMPYU flt_0,flt_0,lt_temp ; lt[0]
+ XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
+
+ FSTD lt_temp,-16(%sp) ; store lt[0]
+ FSTD lt_temp_1,-48(%sp) ; store lt[1]
+ XMPYU fht_0,fht_0,ht_temp ; ht[0]
+ XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
+
+ FSTD ht_temp,-8(%sp) ; store ht[0]
+ FSTD ht_temp_1,-40(%sp) ; store ht[1]
+ LDD -24(%sp),m_0
+ LDD -56(%sp),m_1
+
+ AND m_0,high_mask,tmp_0 ; m[0] & Mask
+ AND m_1,high_mask,tmp_1 ; m[1] & Mask
+ DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
+ DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
+
+ LDD -16(%sp),lt_0
+ LDD -48(%sp),lt_1
+ EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
+ EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
+
+ LDD -8(%sp),ht_0
+ LDD -40(%sp),ht_1
+ ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
+ ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
+
+ ADD lt_0,m_0,lt_0 ; lt = lt+m
+ ADD,DC ht_0,%r0,ht_0 ; ht[0]++
+ STD lt_0,0(r_ptr) ; rp[0] = lt[0]
+ STD ht_0,8(r_ptr) ; rp[1] = ht[1]
+
+ ADD lt_1,m_1,lt_1 ; lt = lt+m
+ ADD,DC ht_1,%r0,ht_1 ; ht[1]++
+ STD lt_1,16(r_ptr) ; rp[2] = lt[1]
+ STD ht_1,24(r_ptr) ; rp[3] = ht[1]
+
+ LDO -2(num),num ; num = num - 2;
+ LDO 16(a_ptr),a_ptr ; ap += 2
+ CMPIB,<= 2,num,bn_sqr_words_unroll2
+ LDO 32(r_ptr),r_ptr ; rp += 4
+
+ CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
+
+ ;
+ ; Top of loop aligned on 64-byte boundary
+ ;
+bn_sqr_words_single_top
+ FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+
+ XMPYU fht_0,flt_0,fm ; m
+ FSTD fm,-24(%sp) ; store m
+
+ XMPYU flt_0,flt_0,lt_temp ; lt
+ FSTD lt_temp,-16(%sp) ; store lt
+
+ XMPYU fht_0,fht_0,ht_temp ; ht
+ FSTD ht_temp,-8(%sp) ; store ht
+
+ LDD -24(%sp),m_0 ; load m
+ AND m_0,high_mask,tmp_0 ; m & Mask
+ DEPD,Z m_0,30,31,m_0 ; m << 32+1
+ LDD -16(%sp),lt_0 ; lt
+
+ LDD -8(%sp),ht_0 ; ht
+ EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
+ ADD m_0,lt_0,lt_0 ; lt = lt+m
+ ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+
+ STD lt_0,0(r_ptr) ; rp[0] = lt
+ STD ht_0,8(r_ptr) ; rp[1] = ht
+
+bn_sqr_words_exit
+ .EXIT
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3
+ .PROCEND ;in=23,24,25,26,29;out=28;
+
+
+;----------------------------------------------------------------------------
+;
+;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+;
+; arg0 = rp
+; arg1 = ap
+; arg2 = bp
+; arg3 = n
+
+t .reg %r22
+b .reg %r21
+l .reg %r20
+
+bn_add_words
+ .proc
+ .entry
+ .callinfo
+ .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .align 64
+
+ CMPIB,>= 0,n,bn_add_words_exit
+ COPY %r0,%ret1 ; return 0 by default
+
+ ;
+ ; If 2 or more numbers do the loop
+ ;
+ CMPIB,= 1,n,bn_add_words_single_top
+ NOP
+
+ ;
+ ; This loop is unrolled 2 times (64-byte aligned as well)
+ ;
+bn_add_words_unroll2
+ LDD 0(a_ptr),t
+ LDD 0(b_ptr),b
+ ADD t,%ret1,t ; t = t+c;
+ ADD,DC %r0,%r0,%ret1 ; set c to carry
+ ADD t,b,l ; l = t + b[0]
+ ADD,DC %ret1,%r0,%ret1 ; c+= carry
+ STD l,0(r_ptr)
+
+ LDD 8(a_ptr),t
+ LDD 8(b_ptr),b
+ ADD t,%ret1,t ; t = t+c;
+ ADD,DC %r0,%r0,%ret1 ; set c to carry
+ ADD t,b,l ; l = t + b[0]
+ ADD,DC %ret1,%r0,%ret1 ; c+= carry
+ STD l,8(r_ptr)
+
+ LDO -2(n),n
+ LDO 16(a_ptr),a_ptr
+ LDO 16(b_ptr),b_ptr
+
+ CMPIB,<= 2,n,bn_add_words_unroll2
+ LDO 16(r_ptr),r_ptr
+
+ CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
+
+bn_add_words_single_top
+ LDD 0(a_ptr),t
+ LDD 0(b_ptr),b
+
+ ADD t,%ret1,t ; t = t+c;
+ ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??)
+ ADD t,b,l ; l = t + b[0]
+ ADD,DC %ret1,%r0,%ret1 ; c+= carry
+ STD l,0(r_ptr)
+
+bn_add_words_exit
+ .EXIT
+ BVE (%rp)
+ EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
+ .PROCEND ;in=23,24,25,26,29;out=28;
+
+;----------------------------------------------------------------------------
+;
+;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+;
+; arg0 = rp
+; arg1 = ap
+; arg2 = bp
+; arg3 = n
+
+t1 .reg %r22
+t2 .reg %r21
+sub_tmp1 .reg %r20
+sub_tmp2 .reg %r19
+
+
+bn_sub_words
+ .proc
+ .callinfo
+ .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .entry
+ .align 64
+
+ CMPIB,>= 0,n,bn_sub_words_exit
+ COPY %r0,%ret1 ; return 0 by default
+
+ ;
+ ; If 2 or more numbers do the loop
+ ;
+ CMPIB,= 1,n,bn_sub_words_single_top
+ NOP
+
+ ;
+ ; This loop is unrolled 2 times (64-byte aligned as well)
+ ;
+bn_sub_words_unroll2
+ LDD 0(a_ptr),t1
+ LDD 0(b_ptr),t2
+ SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
+ SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
+
+ CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
+ LDO 1(%r0),sub_tmp2
+
+ CMPCLR,*= t1,t2,%r0
+ COPY sub_tmp2,%ret1
+ STD sub_tmp1,0(r_ptr)
+
+ LDD 8(a_ptr),t1
+ LDD 8(b_ptr),t2
+ SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
+ SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
+ CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
+ LDO 1(%r0),sub_tmp2
+
+ CMPCLR,*= t1,t2,%r0
+ COPY sub_tmp2,%ret1
+ STD sub_tmp1,8(r_ptr)
+
+ LDO -2(n),n
+ LDO 16(a_ptr),a_ptr
+ LDO 16(b_ptr),b_ptr
+
+ CMPIB,<= 2,n,bn_sub_words_unroll2
+ LDO 16(r_ptr),r_ptr
+
+ CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
+
+bn_sub_words_single_top
+ LDD 0(a_ptr),t1
+ LDD 0(b_ptr),t2
+ SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
+ SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
+ CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
+ LDO 1(%r0),sub_tmp2
+
+ CMPCLR,*= t1,t2,%r0
+ COPY sub_tmp2,%ret1
+
+ STD sub_tmp1,0(r_ptr)
+
+bn_sub_words_exit
+ .EXIT
+ BVE (%rp)
+ EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
+ .PROCEND ;in=23,24,25,26,29;out=28;
+
+;------------------------------------------------------------------------------
+;
+; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
+;
+; arg0 = h
+; arg1 = l
+; arg2 = d
+;
+; This is mainly just output from the HP C compiler.
+;
+;------------------------------------------------------------------------------
+bn_div_words
+ .PROC
+ .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
+ .IMPORT BN_num_bits_word,CODE
+ ;--- not PIC .IMPORT __iob,DATA
+ ;--- not PIC .IMPORT fprintf,CODE
+ .IMPORT abort,CODE
+ .IMPORT $$div2U,MILLICODE
+ .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
+ .ENTRY
+ STW %r2,-20(%r30) ;offset 0x8ec
+ STW,MA %r3,192(%r30) ;offset 0x8f0
+ STW %r4,-188(%r30) ;offset 0x8f4
+ DEPD %r5,31,32,%r6 ;offset 0x8f8
+ STD %r6,-184(%r30) ;offset 0x8fc
+ DEPD %r7,31,32,%r8 ;offset 0x900
+ STD %r8,-176(%r30) ;offset 0x904
+ STW %r9,-168(%r30) ;offset 0x908
+ LDD -248(%r30),%r3 ;offset 0x90c
+ COPY %r26,%r4 ;offset 0x910
+ COPY %r24,%r5 ;offset 0x914
+ DEPD %r25,31,32,%r4 ;offset 0x918
+ CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c
+ DEPD %r23,31,32,%r5 ;offset 0x920
+ MOVIB,TR -1,%r29,$00060002 ;offset 0x924
+ EXTRD,U %r29,31,32,%r28 ;offset 0x928
+$0006002A
+ LDO -1(%r29),%r29 ;offset 0x92c
+ SUB %r23,%r7,%r23 ;offset 0x930
+$00060024
+ SUB %r4,%r31,%r25 ;offset 0x934
+ AND %r25,%r19,%r26 ;offset 0x938
+ CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c
+ DEPD,Z %r25,31,32,%r20 ;offset 0x940
+ OR %r20,%r24,%r21 ;offset 0x944
+ CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948
+ SUB %r31,%r2,%r31 ;offset 0x94c
+$00060046
+$0006002E
+ DEPD,Z %r23,31,32,%r25 ;offset 0x950
+ EXTRD,U %r23,31,32,%r26 ;offset 0x954
+ AND %r25,%r19,%r24 ;offset 0x958
+ ADD,L %r31,%r26,%r31 ;offset 0x95c
+ CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960
+ LDO 1(%r31),%r31 ;offset 0x964
+$00060032
+ CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968
+ LDO -1(%r29),%r29 ;offset 0x96c
+ ADD,L %r4,%r3,%r4 ;offset 0x970
+$00060036
+ ADDIB,=,N -1,%r8,$D0 ;offset 0x974
+ SUB %r5,%r24,%r28 ;offset 0x978
+$0006003A
+ SUB %r4,%r31,%r24 ;offset 0x97c
+ SHRPD %r24,%r28,32,%r4 ;offset 0x980
+ DEPD,Z %r29,31,32,%r9 ;offset 0x984
+ DEPD,Z %r28,31,32,%r5 ;offset 0x988
+$0006001C
+ EXTRD,U %r4,31,32,%r31 ;offset 0x98c
+ CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990
+ MOVB,TR %r6,%r29,$D1 ;offset 0x994
+ STD %r29,-152(%r30) ;offset 0x998
+$0006000C
+ EXTRD,U %r3,31,32,%r25 ;offset 0x99c
+ COPY %r3,%r26 ;offset 0x9a0
+ EXTRD,U %r3,31,32,%r9 ;offset 0x9a4
+ EXTRD,U %r4,31,32,%r8 ;offset 0x9a8
+ .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28;
+ B,L BN_num_bits_word,%r2 ;offset 0x9ac
+ EXTRD,U %r5,31,32,%r7 ;offset 0x9b0
+ LDI 64,%r20 ;offset 0x9b4
+ DEPD %r7,31,32,%r5 ;offset 0x9b8
+ DEPD %r8,31,32,%r4 ;offset 0x9bc
+ DEPD %r9,31,32,%r3 ;offset 0x9c0
+ CMPB,= %r28,%r20,$00060012 ;offset 0x9c4
+ COPY %r28,%r24 ;offset 0x9c8
+ MTSARCM %r24 ;offset 0x9cc
+ DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0
+ CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4
+$00060012
+ SUBI 64,%r24,%r31 ;offset 0x9d8
+ CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc
+ SUB %r4,%r3,%r4 ;offset 0x9e0
+$00060016
+ CMPB,= %r31,%r0,$0006001A ;offset 0x9e4
+ COPY %r0,%r9 ;offset 0x9e8
+ MTSARCM %r31 ;offset 0x9ec
+ DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0
+ SUBI 64,%r31,%r26 ;offset 0x9f4
+ MTSAR %r26 ;offset 0x9f8
+ SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc
+ MTSARCM %r31 ;offset 0xa00
+ DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04
+$0006001A
+ DEPDI,Z -1,31,32,%r19 ;offset 0xa08
+ AND %r3,%r19,%r29 ;offset 0xa0c
+ EXTRD,U %r29,31,32,%r2 ;offset 0xa10
+ DEPDI,Z -1,63,32,%r6 ;offset 0xa14
+ MOVIB,TR 2,%r8,$0006001C ;offset 0xa18
+ EXTRD,U %r3,63,32,%r7 ;offset 0xa1c
+$D2
+ ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20
+ ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24
+ ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28
+ ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28;
+ ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c
+ ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30
+ .CALL ;
+ B,L abort,%r2 ;offset 0xa34
+ NOP ;offset 0xa38
+ B $D3 ;offset 0xa3c
+ LDW -212(%r30),%r2 ;offset 0xa40
+$00060020
+ COPY %r4,%r26 ;offset 0xa44
+ EXTRD,U %r4,31,32,%r25 ;offset 0xa48
+ COPY %r2,%r24 ;offset 0xa4c
+ .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
+ B,L $$div2U,%r31 ;offset 0xa50
+ EXTRD,U %r2,31,32,%r23 ;offset 0xa54
+ DEPD %r28,31,32,%r29 ;offset 0xa58
+$00060022
+ STD %r29,-152(%r30) ;offset 0xa5c
+$D1
+ AND %r5,%r19,%r24 ;offset 0xa60
+ EXTRD,U %r24,31,32,%r24 ;offset 0xa64
+ STW %r2,-160(%r30) ;offset 0xa68
+ STW %r7,-128(%r30) ;offset 0xa6c
+ FLDD -152(%r30),%fr4 ;offset 0xa70
+ FLDD -152(%r30),%fr7 ;offset 0xa74
+ FLDW -160(%r30),%fr8L ;offset 0xa78
+ FLDW -128(%r30),%fr5L ;offset 0xa7c
+ XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80
+ FSTD %fr10,-136(%r30) ;offset 0xa84
+ XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88
+ FSTD %fr22,-144(%r30) ;offset 0xa8c
+ XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90
+ XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94
+ FSTD %fr11,-112(%r30) ;offset 0xa98
+ FSTD %fr23,-120(%r30) ;offset 0xa9c
+ LDD -136(%r30),%r28 ;offset 0xaa0
+ DEPD,Z %r28,31,32,%r31 ;offset 0xaa4
+ LDD -144(%r30),%r20 ;offset 0xaa8
+ ADD,L %r20,%r31,%r31 ;offset 0xaac
+ LDD -112(%r30),%r22 ;offset 0xab0
+ DEPD,Z %r22,31,32,%r22 ;offset 0xab4
+ LDD -120(%r30),%r21 ;offset 0xab8
+ B $00060024 ;offset 0xabc
+ ADD,L %r21,%r22,%r23 ;offset 0xac0
+$D0
+ OR %r9,%r29,%r29 ;offset 0xac4
+$00060040
+ EXTRD,U %r29,31,32,%r28 ;offset 0xac8
+$00060002
+$L2
+ LDW -212(%r30),%r2 ;offset 0xacc
+$D3
+ LDW -168(%r30),%r9 ;offset 0xad0
+ LDD -176(%r30),%r8 ;offset 0xad4
+ EXTRD,U %r8,31,32,%r7 ;offset 0xad8
+ LDD -184(%r30),%r6 ;offset 0xadc
+ EXTRD,U %r6,31,32,%r5 ;offset 0xae0
+ LDW -188(%r30),%r4 ;offset 0xae4
+ BVE (%r2) ;offset 0xae8
+ .EXIT
+ LDW,MB -192(%r30),%r3 ;offset 0xaec
+ .PROCEND ;in=23,25;out=28,29;fpin=105,107;
+
+
+
+
+;----------------------------------------------------------------------------
+;
+; Registers to hold 64-bit values to manipulate. The "L" part
+; of the register corresponds to the upper 32-bits, while the "R"
+; part corresponds to the lower 32-bits
+;
+; Note, that when using b6 and b7, the code must save these before
+; using them because they are callee save registers
+;
+;
+; Floating point registers to use to save values that
+; are manipulated. These don't collide with ftemp1-6 and
+; are all caller save registers
+;
+a0 .reg %fr22
+a0L .reg %fr22L
+a0R .reg %fr22R
+
+a1 .reg %fr23
+a1L .reg %fr23L
+a1R .reg %fr23R
+
+a2 .reg %fr24
+a2L .reg %fr24L
+a2R .reg %fr24R
+
+a3 .reg %fr25
+a3L .reg %fr25L
+a3R .reg %fr25R
+
+a4 .reg %fr26
+a4L .reg %fr26L
+a4R .reg %fr26R
+
+a5 .reg %fr27
+a5L .reg %fr27L
+a5R .reg %fr27R
+
+a6 .reg %fr28
+a6L .reg %fr28L
+a6R .reg %fr28R
+
+a7 .reg %fr29
+a7L .reg %fr29L
+a7R .reg %fr29R
+
+b0 .reg %fr30
+b0L .reg %fr30L
+b0R .reg %fr30R
+
+b1 .reg %fr31
+b1L .reg %fr31L
+b1R .reg %fr31R
+
+;
+; Temporary floating point variables, these are all caller save
+; registers
+;
+ftemp1 .reg %fr4
+ftemp2 .reg %fr5
+ftemp3 .reg %fr6
+ftemp4 .reg %fr7
+
+;
+; The B set of registers when used.
+;
+
+b2 .reg %fr8
+b2L .reg %fr8L
+b2R .reg %fr8R
+
+b3 .reg %fr9
+b3L .reg %fr9L
+b3R .reg %fr9R
+
+b4 .reg %fr10
+b4L .reg %fr10L
+b4R .reg %fr10R
+
+b5 .reg %fr11
+b5L .reg %fr11L
+b5R .reg %fr11R
+
+b6 .reg %fr12
+b6L .reg %fr12L
+b6R .reg %fr12R
+
+b7 .reg %fr13
+b7L .reg %fr13L
+b7R .reg %fr13R
+
+c1 .reg %r21 ; only reg
+temp1 .reg %r20 ; only reg
+temp2 .reg %r19 ; only reg
+temp3 .reg %r31 ; only reg
+
+m1 .reg %r28
+c2 .reg %r23
+high_one .reg %r1
+ht .reg %r6
+lt .reg %r5
+m .reg %r4
+c3 .reg %r3
+
+SQR_ADD_C .macro A0L,A0R,C1,C2,C3
+ XMPYU A0L,A0R,ftemp1 ; m
+ FSTD ftemp1,-24(%sp) ; store m
+
+ XMPYU A0R,A0R,ftemp2 ; lt
+ FSTD ftemp2,-16(%sp) ; store lt
+
+ XMPYU A0L,A0L,ftemp3 ; ht
+ FSTD ftemp3,-8(%sp) ; store ht
+
+ LDD -24(%sp),m ; load m
+ AND m,high_mask,temp2 ; m & Mask
+ DEPD,Z m,30,31,temp3 ; m << 32+1
+ LDD -16(%sp),lt ; lt
+
+ LDD -8(%sp),ht ; ht
+ EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
+ ADD temp3,lt,lt ; lt = lt+m
+ ADD,L ht,temp1,ht ; ht += temp1
+ ADD,DC ht,%r0,ht ; ht++
+
+ ADD C1,lt,C1 ; c1=c1+lt
+ ADD,DC ht,%r0,ht ; ht++
+
+ ADD C2,ht,C2 ; c2=c2+ht
+ ADD,DC C3,%r0,C3 ; c3++
+.endm
+
+SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
+ XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
+ FSTD ftemp1,-16(%sp) ;
+ XMPYU A0R,A1L,ftemp2 ; m = bh*lt
+ FSTD ftemp2,-8(%sp) ;
+ XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
+ FSTD ftemp3,-32(%sp)
+ XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
+ FSTD ftemp4,-24(%sp) ;
+
+ LDD -8(%sp),m ; r21 = m
+ LDD -16(%sp),m1 ; r19 = m1
+ ADD,L m,m1,m ; m+m1
+
+ DEPD,Z m,31,32,temp3 ; (m+m1<<32)
+ LDD -24(%sp),ht ; r24 = ht
+
+ CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
+ ADD,L ht,high_one,ht ; ht+=high_one
+
+ EXTRD,U m,31,32,temp1 ; m >> 32
+ LDD -32(%sp),lt ; lt
+ ADD,L ht,temp1,ht ; ht+= m>>32
+ ADD lt,temp3,lt ; lt = lt+m1
+ ADD,DC ht,%r0,ht ; ht++
+
+ ADD ht,ht,ht ; ht=ht+ht;
+ ADD,DC C3,%r0,C3 ; add in carry (c3++)
+
+ ADD lt,lt,lt ; lt=lt+lt;
+ ADD,DC ht,%r0,ht ; add in carry (ht++)
+
+ ADD C1,lt,C1 ; c1=c1+lt
+ ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
+ LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
+
+ ADD C2,ht,C2 ; c2 = c2 + ht
+ ADD,DC C3,%r0,C3 ; add in carry (c3++)
+.endm
+
+;
+;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+; arg0 = r_ptr
+; arg1 = a_ptr
+;
+
+bn_sqr_comba8
+ .PROC
+ .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+ .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .ENTRY
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ STD %r5,16(%sp) ; save r5
+ STD %r6,24(%sp) ; save r6
+
+ ;
+ ; Zero out carries
+ ;
+ COPY %r0,c1
+ COPY %r0,c2
+ COPY %r0,c3
+
+ LDO 128(%sp),%sp ; bump stack
+ DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
+ DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
+
+ ;
+ ; Load up all of the values we are going to use
+ ;
+ FLDD 0(a_ptr),a0
+ FLDD 8(a_ptr),a1
+ FLDD 16(a_ptr),a2
+ FLDD 24(a_ptr),a3
+ FLDD 32(a_ptr),a4
+ FLDD 40(a_ptr),a5
+ FLDD 48(a_ptr),a6
+ FLDD 56(a_ptr),a7
+
+ SQR_ADD_C a0L,a0R,c1,c2,c3
+ STD c1,0(r_ptr) ; r[0] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
+ STD c2,8(r_ptr) ; r[1] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C a1L,a1R,c3,c1,c2
+ SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
+ STD c3,16(r_ptr) ; r[2] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
+ SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
+ STD c1,24(r_ptr) ; r[3] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C a2L,a2R,c2,c3,c1
+ SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
+ SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
+ STD c2,32(r_ptr) ; r[4] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
+ SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
+ SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
+ STD c3,40(r_ptr) ; r[5] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C a3L,a3R,c1,c2,c3
+ SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
+ SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
+ SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
+ STD c1,48(r_ptr) ; r[6] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
+ SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
+ SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
+ SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
+ STD c2,56(r_ptr) ; r[7] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C a4L,a4R,c3,c1,c2
+ SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
+ SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
+ SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
+ STD c3,64(r_ptr) ; r[8] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
+ SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
+ SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
+ STD c1,72(r_ptr) ; r[9] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C a5L,a5R,c2,c3,c1
+ SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
+ SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
+ STD c2,80(r_ptr) ; r[10] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
+ SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
+ STD c3,88(r_ptr) ; r[11] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C a6L,a6R,c1,c2,c3
+ SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
+ STD c1,96(r_ptr) ; r[12] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
+ STD c2,104(r_ptr) ; r[13] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C a7L,a7R,c3,c1,c2
+ STD c3, 112(r_ptr) ; r[14] = c3
+ STD c1, 120(r_ptr) ; r[15] = c1
+
+ .EXIT
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3
+
+ .PROCEND
+
+;-----------------------------------------------------------------------------
+;
+;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+; arg0 = r_ptr
+; arg1 = a_ptr
+;
+
+bn_sqr_comba4
+ .proc
+ .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+ .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .entry
+ .align 64
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ STD %r5,16(%sp) ; save r5
+ STD %r6,24(%sp) ; save r6
+
+ ;
+ ; Zero out carries
+ ;
+ COPY %r0,c1
+ COPY %r0,c2
+ COPY %r0,c3
+
+ LDO 128(%sp),%sp ; bump stack
+ DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
+ DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
+
+ ;
+ ; Load up all of the values we are going to use
+ ;
+ FLDD 0(a_ptr),a0
+ FLDD 8(a_ptr),a1
+ FLDD 16(a_ptr),a2
+ FLDD 24(a_ptr),a3
+ FLDD 32(a_ptr),a4
+ FLDD 40(a_ptr),a5
+ FLDD 48(a_ptr),a6
+ FLDD 56(a_ptr),a7
+
+ SQR_ADD_C a0L,a0R,c1,c2,c3
+
+ STD c1,0(r_ptr) ; r[0] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
+
+ STD c2,8(r_ptr) ; r[1] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C a1L,a1R,c3,c1,c2
+ SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
+
+ STD c3,16(r_ptr) ; r[2] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
+ SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
+
+ STD c1,24(r_ptr) ; r[3] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C a2L,a2R,c2,c3,c1
+ SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
+
+ STD c2,32(r_ptr) ; r[4] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
+ STD c3,40(r_ptr) ; r[5] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C a3L,a3R,c1,c2,c3
+ STD c1,48(r_ptr) ; r[6] = c1;
+ STD c2,56(r_ptr) ; r[7] = c2;
+
+ .EXIT
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3
+
+ .PROCEND
+
+
+;---------------------------------------------------------------------------
+
+MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
+ XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
+ FSTD ftemp1,-16(%sp) ;
+ XMPYU A0R,B0L,ftemp2 ; m = bh*lt
+ FSTD ftemp2,-8(%sp) ;
+ XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
+ FSTD ftemp3,-32(%sp)
+ XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
+ FSTD ftemp4,-24(%sp) ;
+
+ LDD -8(%sp),m ; r21 = m
+ LDD -16(%sp),m1 ; r19 = m1
+ ADD,L m,m1,m ; m+m1
+
+ DEPD,Z m,31,32,temp3 ; (m+m1<<32)
+ LDD -24(%sp),ht ; r24 = ht
+
+ CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
+ ADD,L ht,high_one,ht ; ht+=high_one
+
+ EXTRD,U m,31,32,temp1 ; m >> 32
+ LDD -32(%sp),lt ; lt
+ ADD,L ht,temp1,ht ; ht+= m>>32
+ ADD lt,temp3,lt ; lt = lt+m1
+ ADD,DC ht,%r0,ht ; ht++
+
+ ADD C1,lt,C1 ; c1=c1+lt
+ ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise
+
+ ADD C2,ht,C2 ; c2 = c2 + ht
+ ADD,DC C3,%r0,C3 ; add in carry (c3++)
+.endm
+
+
+;
+;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+; arg0 = r_ptr
+; arg1 = a_ptr
+; arg2 = b_ptr
+;
+
+bn_mul_comba8
+ .proc
+ .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+ .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .entry
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ STD %r5,16(%sp) ; save r5
+ STD %r6,24(%sp) ; save r6
+ FSTD %fr12,32(%sp) ; save r6
+ FSTD %fr13,40(%sp) ; save r7
+
+ ;
+ ; Zero out carries
+ ;
+ COPY %r0,c1
+ COPY %r0,c2
+ COPY %r0,c3
+
+ LDO 128(%sp),%sp ; bump stack
+ DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
+
+ ;
+ ; Load up all of the values we are going to use
+ ;
+ FLDD 0(a_ptr),a0
+ FLDD 8(a_ptr),a1
+ FLDD 16(a_ptr),a2
+ FLDD 24(a_ptr),a3
+ FLDD 32(a_ptr),a4
+ FLDD 40(a_ptr),a5
+ FLDD 48(a_ptr),a6
+ FLDD 56(a_ptr),a7
+
+ FLDD 0(b_ptr),b0
+ FLDD 8(b_ptr),b1
+ FLDD 16(b_ptr),b2
+ FLDD 24(b_ptr),b3
+ FLDD 32(b_ptr),b4
+ FLDD 40(b_ptr),b5
+ FLDD 48(b_ptr),b6
+ FLDD 56(b_ptr),b7
+
+ MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
+ STD c1,0(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
+ MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
+ STD c2,8(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
+ MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
+ MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
+ STD c3,16(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
+ MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
+ MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
+ MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
+ STD c1,24(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
+ MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
+ MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
+ MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
+ MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
+ STD c2,32(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
+ MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
+ MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
+ MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
+ MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
+ MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
+ STD c3,40(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
+ MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
+ MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
+ MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
+ MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
+ MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
+ MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
+ STD c1,48(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
+ MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
+ MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
+ MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
+ MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
+ MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
+ MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
+ MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
+ STD c2,56(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
+ MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
+ MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
+ MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
+ MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
+ MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
+ MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
+ STD c3,64(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
+ MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
+ MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
+ MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
+ MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
+ MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
+ STD c1,72(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
+ MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
+ MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
+ MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
+ MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
+ STD c2,80(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
+ MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
+ MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
+ MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
+ STD c3,88(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
+ MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
+ MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
+ STD c1,96(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
+ MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
+ STD c2,104(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
+ STD c3,112(r_ptr)
+ STD c1,120(r_ptr)
+
+ .EXIT
+ FLDD -88(%sp),%fr13
+ FLDD -96(%sp),%fr12
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3
+
+ .PROCEND
+
+;-----------------------------------------------------------------------------
+;
+;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+; arg0 = r_ptr
+; arg1 = a_ptr
+; arg2 = b_ptr
+;
+
+bn_mul_comba4
+ .proc
+ .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+ .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .entry
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ STD %r5,16(%sp) ; save r5
+ STD %r6,24(%sp) ; save r6
+ FSTD %fr12,32(%sp) ; save r6
+ FSTD %fr13,40(%sp) ; save r7
+
+ ;
+ ; Zero out carries
+ ;
+ COPY %r0,c1
+ COPY %r0,c2
+ COPY %r0,c3
+
+ LDO 128(%sp),%sp ; bump stack
+ DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
+
+ ;
+ ; Load up all of the values we are going to use
+ ;
+ FLDD 0(a_ptr),a0
+ FLDD 8(a_ptr),a1
+ FLDD 16(a_ptr),a2
+ FLDD 24(a_ptr),a3
+
+ FLDD 0(b_ptr),b0
+ FLDD 8(b_ptr),b1
+ FLDD 16(b_ptr),b2
+ FLDD 24(b_ptr),b3
+
+ MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
+ STD c1,0(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
+ MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
+ STD c2,8(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
+ MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
+ MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
+ STD c3,16(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
+ MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
+ MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
+ MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
+ STD c1,24(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
+ MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
+ MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
+ STD c2,32(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
+ MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
+ STD c3,40(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
+ STD c1,48(r_ptr)
+ STD c2,56(r_ptr)
+
+ .EXIT
+ FLDD -88(%sp),%fr13
+ FLDD -96(%sp),%fr12
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3
+
+ .PROCEND
+
+
+;--- not PIC .SPACE $TEXT$
+;--- not PIC .SUBSPA $CODE$
+;--- not PIC .SPACE $PRIVATE$,SORT=16
+;--- not PIC .IMPORT $global$,DATA
+;--- not PIC .SPACE $TEXT$
+;--- not PIC .SUBSPA $CODE$
+;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c
+;--- not PIC C$7
+;--- not PIC .ALIGN 8
+;--- not PIC .STRINGZ "Division would overflow (%d)\n"
+ .END
diff --git a/openssl-1.1.0h/crypto/bn/asm/pa-risc2W.s b/openssl-1.1.0h/crypto/bn/asm/pa-risc2W.s
new file mode 100644
index 0000000..9738117
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/pa-risc2W.s
@@ -0,0 +1,1612 @@
+; Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+;
+; Licensed under the OpenSSL license (the "License"). You may not use
+; this file except in compliance with the License. You can obtain a copy
+; in the file LICENSE in the source distribution or at
+; https://www.openssl.org/source/license.html
+
+;
+; PA-RISC 64-bit implementation of bn_asm code
+;
+; This code is approximately 2x faster than the C version
+; for RSA/DSA.
+;
+; See http://devresource.hp.com/ for more details on the PA-RISC
+; architecture. Also see the book "PA-RISC 2.0 Architecture"
+; by Gerry Kane for information on the instruction set architecture.
+;
+; Code written by Chris Ruemmler (with some help from the HP C
+; compiler).
+;
+; The code compiles with HP's assembler
+;
+
+ .level 2.0W
+ .space $TEXT$
+ .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
+
+;
+; Global Register definitions used for the routines.
+;
+; Some information about HP's runtime architecture for 64-bits.
+;
+; "Caller save" means the calling function must save the register
+; if it wants the register to be preserved.
+; "Callee save" means if a function uses the register, it must save
+; the value before using it.
+;
+; For the floating point registers
+;
+; "caller save" registers: fr4-fr11, fr22-fr31
+; "callee save" registers: fr12-fr21
+; "special" registers: fr0-fr3 (status and exception registers)
+;
+; For the integer registers
+; value zero : r0
+; "caller save" registers: r1,r19-r26
+; "callee save" registers: r3-r18
+; return register : r2 (rp)
+; return values ; r28 (ret0,ret1)
+; Stack pointer ; r30 (sp)
+; global data pointer ; r27 (dp)
+; argument pointer ; r29 (ap)
+; millicode return ptr ; r31 (also a caller save register)
+
+
+;
+; Arguments to the routines
+;
+r_ptr .reg %r26
+a_ptr .reg %r25
+b_ptr .reg %r24
+num .reg %r24
+w .reg %r23
+n .reg %r23
+
+
+;
+; Globals used in some routines
+;
+
+top_overflow .reg %r29
+high_mask .reg %r22 ; value 0xffffffff80000000L
+
+
+;------------------------------------------------------------------------------
+;
+; bn_mul_add_words
+;
+;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
+; int num, BN_ULONG w)
+;
+; arg0 = r_ptr
+; arg1 = a_ptr
+; arg2 = num
+; arg3 = w
+;
+; Local register definitions
+;
+
+fm1 .reg %fr22
+fm .reg %fr23
+ht_temp .reg %fr24
+ht_temp_1 .reg %fr25
+lt_temp .reg %fr26
+lt_temp_1 .reg %fr27
+fm1_1 .reg %fr28
+fm_1 .reg %fr29
+
+fw_h .reg %fr7L
+fw_l .reg %fr7R
+fw .reg %fr7
+
+fht_0 .reg %fr8L
+flt_0 .reg %fr8R
+t_float_0 .reg %fr8
+
+fht_1 .reg %fr9L
+flt_1 .reg %fr9R
+t_float_1 .reg %fr9
+
+tmp_0 .reg %r31
+tmp_1 .reg %r21
+m_0 .reg %r20
+m_1 .reg %r19
+ht_0 .reg %r1
+ht_1 .reg %r3
+lt_0 .reg %r4
+lt_1 .reg %r5
+m1_0 .reg %r6
+m1_1 .reg %r7
+rp_val .reg %r8
+rp_val_1 .reg %r9
+
+bn_mul_add_words
+ .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
+ .proc
+ .callinfo frame=128
+ .entry
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ NOP ; Needed to make the loop 16-byte aligned
+ NOP ; Needed to make the loop 16-byte aligned
+
+ STD %r5,16(%sp) ; save r5
+ STD %r6,24(%sp) ; save r6
+ STD %r7,32(%sp) ; save r7
+ STD %r8,40(%sp) ; save r8
+
+ STD %r9,48(%sp) ; save r9
+ COPY %r0,%ret0 ; return 0 by default
+ DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
+ STD w,56(%sp) ; store w on stack
+
+ CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
+ LDO 128(%sp),%sp ; bump stack
+
+ ;
+ ; The loop is unrolled twice, so if there is only 1 number
+ ; then go straight to the cleanup code.
+ ;
+ CMPIB,= 1,num,bn_mul_add_words_single_top
+ FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
+
+ ;
+ ; This loop is unrolled 2 times (64-byte aligned as well)
+ ;
+ ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
+ ; two 32-bit mutiplies can be issued per cycle.
+ ;
+bn_mul_add_words_unroll2
+
+ FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+ FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+ LDD 0(r_ptr),rp_val ; rp[0]
+ LDD 8(r_ptr),rp_val_1 ; rp[1]
+
+ XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
+ XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
+ FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
+ FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
+
+ XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
+ XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
+ FSTD fm,-8(%sp) ; -8(sp) = m[0]
+ FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
+
+ XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
+ XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
+ FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
+ FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
+
+ XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
+ XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
+ FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
+ FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
+
+ LDD -8(%sp),m_0 ; m[0]
+ LDD -40(%sp),m_1 ; m[1]
+ LDD -16(%sp),m1_0 ; m1[0]
+ LDD -48(%sp),m1_1 ; m1[1]
+
+ LDD -24(%sp),ht_0 ; ht[0]
+ LDD -56(%sp),ht_1 ; ht[1]
+ ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
+ ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
+
+ LDD -32(%sp),lt_0
+ LDD -64(%sp),lt_1
+ CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
+ ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
+
+ CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
+ ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
+ EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
+ DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
+
+ EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
+ DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
+ ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
+ ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
+
+ ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
+ ADD,DC ht_0,%r0,ht_0 ; ht[0]++
+ ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
+ ADD,DC ht_1,%r0,ht_1 ; ht[1]++
+
+ ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c;
+ ADD,DC ht_0,%r0,ht_0 ; ht[0]++
+ ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
+ ADD,DC ht_0,%r0,ht_0 ; ht[0]++
+
+ LDO -2(num),num ; num = num - 2;
+ ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
+ ADD,DC ht_1,%r0,ht_1 ; ht[1]++
+ STD lt_0,0(r_ptr) ; rp[0] = lt[0]
+
+ ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
+ ADD,DC ht_1,%r0,%ret0 ; ht[1]++
+ LDO 16(a_ptr),a_ptr ; a_ptr += 2
+
+ STD lt_1,8(r_ptr) ; rp[1] = lt[1]
+ CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
+ LDO 16(r_ptr),r_ptr ; r_ptr += 2
+
+ CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
+
+ ;
+ ; Top of loop aligned on 64-byte boundary
+ ;
+bn_mul_add_words_single_top
+ FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+ LDD 0(r_ptr),rp_val ; rp[0]
+ LDO 8(a_ptr),a_ptr ; a_ptr++
+ XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
+ FSTD fm1,-16(%sp) ; -16(sp) = m1
+ XMPYU flt_0,fw_h,fm ; m = lt*fw_h
+ FSTD fm,-8(%sp) ; -8(sp) = m
+ XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
+ FSTD ht_temp,-24(%sp) ; -24(sp) = ht
+ XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
+ FSTD lt_temp,-32(%sp) ; -32(sp) = lt
+
+ LDD -8(%sp),m_0
+ LDD -16(%sp),m1_0 ; m1 = temp1
+ ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
+ LDD -24(%sp),ht_0
+ LDD -32(%sp),lt_0
+
+ CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
+ ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
+
+ EXTRD,U tmp_0,31,32,m_0 ; m>>32
+ DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
+
+ ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
+ ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+ ADD %ret0,tmp_0,lt_0 ; lt = lt + c;
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+ ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
+ ADD,DC ht_0,%r0,%ret0 ; ht++
+ STD lt_0,0(r_ptr) ; rp[0] = lt
+
+bn_mul_add_words_exit
+ .EXIT
+ LDD -80(%sp),%r9 ; restore r9
+ LDD -88(%sp),%r8 ; restore r8
+ LDD -96(%sp),%r7 ; restore r7
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3 ; restore r3
+ .PROCEND ;in=23,24,25,26,29;out=28;
+
+;----------------------------------------------------------------------------
+;
+;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+;
+; arg0 = rp
+; arg1 = ap
+; arg2 = num
+; arg3 = w
+
+bn_mul_words
+ .proc
+ .callinfo frame=128
+ .entry
+ .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ STD %r5,16(%sp) ; save r5
+ STD %r6,24(%sp) ; save r6
+
+ STD %r7,32(%sp) ; save r7
+ COPY %r0,%ret0 ; return 0 by default
+ DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
+ STD w,56(%sp) ; w on stack
+
+ CMPIB,>= 0,num,bn_mul_words_exit
+ LDO 128(%sp),%sp ; bump stack
+
+ ;
+ ; See if only 1 word to do, thus just do cleanup
+ ;
+ CMPIB,= 1,num,bn_mul_words_single_top
+ FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
+
+ ;
+ ; This loop is unrolled 2 times (64-byte aligned as well)
+ ;
+ ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
+ ; two 32-bit mutiplies can be issued per cycle.
+ ;
+bn_mul_words_unroll2
+
+ FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+ FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+ XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
+ XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
+
+ FSTD fm1,-16(%sp) ; -16(sp) = m1
+ FSTD fm1_1,-48(%sp) ; -48(sp) = m1
+ XMPYU flt_0,fw_h,fm ; m = lt*fw_h
+ XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
+
+ FSTD fm,-8(%sp) ; -8(sp) = m
+ FSTD fm_1,-40(%sp) ; -40(sp) = m
+ XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
+ XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
+
+ FSTD ht_temp,-24(%sp) ; -24(sp) = ht
+ FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
+ XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
+ XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
+
+ FSTD lt_temp,-32(%sp) ; -32(sp) = lt
+ FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
+ LDD -8(%sp),m_0
+ LDD -40(%sp),m_1
+
+ LDD -16(%sp),m1_0
+ LDD -48(%sp),m1_1
+ LDD -24(%sp),ht_0
+ LDD -56(%sp),ht_1
+
+ ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
+ ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
+ LDD -32(%sp),lt_0
+ LDD -64(%sp),lt_1
+
+ CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
+ ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
+ CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
+ ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
+
+ EXTRD,U tmp_0,31,32,m_0 ; m>>32
+ DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
+ EXTRD,U tmp_1,31,32,m_1 ; m>>32
+ DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
+
+ ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
+ ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
+ ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+
+ ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
+ ADD,DC ht_1,%r0,ht_1 ; ht++
+ ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0);
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+
+ ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
+ ADD,DC ht_1,%r0,ht_1 ; ht++
+ STD lt_0,0(r_ptr) ; rp[0] = lt
+ STD lt_1,8(r_ptr) ; rp[1] = lt
+
+ COPY ht_1,%ret0 ; carry = ht
+ LDO -2(num),num ; num = num - 2;
+ LDO 16(a_ptr),a_ptr ; ap += 2
+ CMPIB,<= 2,num,bn_mul_words_unroll2
+ LDO 16(r_ptr),r_ptr ; rp++
+
+ CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
+
+ ;
+ ; Top of loop aligned on 64-byte boundary
+ ;
+bn_mul_words_single_top
+ FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+
+ XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
+ FSTD fm1,-16(%sp) ; -16(sp) = m1
+ XMPYU flt_0,fw_h,fm ; m = lt*fw_h
+ FSTD fm,-8(%sp) ; -8(sp) = m
+ XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
+ FSTD ht_temp,-24(%sp) ; -24(sp) = ht
+ XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
+ FSTD lt_temp,-32(%sp) ; -32(sp) = lt
+
+ LDD -8(%sp),m_0
+ LDD -16(%sp),m1_0
+ ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
+ LDD -24(%sp),ht_0
+ LDD -32(%sp),lt_0
+
+ CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
+ ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
+
+ EXTRD,U tmp_0,31,32,m_0 ; m>>32
+ DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
+
+ ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
+ ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+
+ ADD %ret0,lt_0,lt_0 ; lt = lt + c;
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+
+ COPY ht_0,%ret0 ; copy carry
+ STD lt_0,0(r_ptr) ; rp[0] = lt
+
+bn_mul_words_exit
+ .EXIT
+ LDD -96(%sp),%r7 ; restore r7
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3 ; restore r3
+ .PROCEND ;in=23,24,25,26,29;out=28;
+
+;----------------------------------------------------------------------------
+;
+;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
+;
+; arg0 = rp
+; arg1 = ap
+; arg2 = num
+;
+
+bn_sqr_words
+ .proc
+ .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+ .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .entry
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ NOP
+ STD %r5,16(%sp) ; save r5
+
+ CMPIB,>= 0,num,bn_sqr_words_exit
+ LDO 128(%sp),%sp ; bump stack
+
+ ;
+ ; If only 1, the goto straight to cleanup
+ ;
+ CMPIB,= 1,num,bn_sqr_words_single_top
+ DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
+
+ ;
+ ; This loop is unrolled 2 times (64-byte aligned as well)
+ ;
+
+bn_sqr_words_unroll2
+ FLDD 0(a_ptr),t_float_0 ; a[0]
+ FLDD 8(a_ptr),t_float_1 ; a[1]
+ XMPYU fht_0,flt_0,fm ; m[0]
+ XMPYU fht_1,flt_1,fm_1 ; m[1]
+
+ FSTD fm,-24(%sp) ; store m[0]
+ FSTD fm_1,-56(%sp) ; store m[1]
+ XMPYU flt_0,flt_0,lt_temp ; lt[0]
+ XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
+
+ FSTD lt_temp,-16(%sp) ; store lt[0]
+ FSTD lt_temp_1,-48(%sp) ; store lt[1]
+ XMPYU fht_0,fht_0,ht_temp ; ht[0]
+ XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
+
+ FSTD ht_temp,-8(%sp) ; store ht[0]
+ FSTD ht_temp_1,-40(%sp) ; store ht[1]
+ LDD -24(%sp),m_0
+ LDD -56(%sp),m_1
+
+ AND m_0,high_mask,tmp_0 ; m[0] & Mask
+ AND m_1,high_mask,tmp_1 ; m[1] & Mask
+ DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
+ DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
+
+ LDD -16(%sp),lt_0
+ LDD -48(%sp),lt_1
+ EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
+ EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
+
+ LDD -8(%sp),ht_0
+ LDD -40(%sp),ht_1
+ ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
+ ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
+
+ ADD lt_0,m_0,lt_0 ; lt = lt+m
+ ADD,DC ht_0,%r0,ht_0 ; ht[0]++
+ STD lt_0,0(r_ptr) ; rp[0] = lt[0]
+ STD ht_0,8(r_ptr) ; rp[1] = ht[1]
+
+ ADD lt_1,m_1,lt_1 ; lt = lt+m
+ ADD,DC ht_1,%r0,ht_1 ; ht[1]++
+ STD lt_1,16(r_ptr) ; rp[2] = lt[1]
+ STD ht_1,24(r_ptr) ; rp[3] = ht[1]
+
+ LDO -2(num),num ; num = num - 2;
+ LDO 16(a_ptr),a_ptr ; ap += 2
+ CMPIB,<= 2,num,bn_sqr_words_unroll2
+ LDO 32(r_ptr),r_ptr ; rp += 4
+
+ CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
+
+ ;
+ ; Top of loop aligned on 64-byte boundary
+ ;
+bn_sqr_words_single_top
+ FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
+
+ XMPYU fht_0,flt_0,fm ; m
+ FSTD fm,-24(%sp) ; store m
+
+ XMPYU flt_0,flt_0,lt_temp ; lt
+ FSTD lt_temp,-16(%sp) ; store lt
+
+ XMPYU fht_0,fht_0,ht_temp ; ht
+ FSTD ht_temp,-8(%sp) ; store ht
+
+ LDD -24(%sp),m_0 ; load m
+ AND m_0,high_mask,tmp_0 ; m & Mask
+ DEPD,Z m_0,30,31,m_0 ; m << 32+1
+ LDD -16(%sp),lt_0 ; lt
+
+ LDD -8(%sp),ht_0 ; ht
+ EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
+ ADD m_0,lt_0,lt_0 ; lt = lt+m
+ ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
+ ADD,DC ht_0,%r0,ht_0 ; ht++
+
+ STD lt_0,0(r_ptr) ; rp[0] = lt
+ STD ht_0,8(r_ptr) ; rp[1] = ht
+
+bn_sqr_words_exit
+ .EXIT
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3
+ .PROCEND ;in=23,24,25,26,29;out=28;
+
+
+;----------------------------------------------------------------------------
+;
+;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+;
+; arg0 = rp
+; arg1 = ap
+; arg2 = bp
+; arg3 = n
+
+t .reg %r22
+b .reg %r21
+l .reg %r20
+
+bn_add_words
+ .proc
+ .entry
+ .callinfo
+ .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .align 64
+
+ CMPIB,>= 0,n,bn_add_words_exit
+ COPY %r0,%ret0 ; return 0 by default
+
+ ;
+ ; If 2 or more numbers do the loop
+ ;
+ CMPIB,= 1,n,bn_add_words_single_top
+ NOP
+
+ ;
+ ; This loop is unrolled 2 times (64-byte aligned as well)
+ ;
+bn_add_words_unroll2
+ LDD 0(a_ptr),t
+ LDD 0(b_ptr),b
+ ADD t,%ret0,t ; t = t+c;
+ ADD,DC %r0,%r0,%ret0 ; set c to carry
+ ADD t,b,l ; l = t + b[0]
+ ADD,DC %ret0,%r0,%ret0 ; c+= carry
+ STD l,0(r_ptr)
+
+ LDD 8(a_ptr),t
+ LDD 8(b_ptr),b
+ ADD t,%ret0,t ; t = t+c;
+ ADD,DC %r0,%r0,%ret0 ; set c to carry
+ ADD t,b,l ; l = t + b[0]
+ ADD,DC %ret0,%r0,%ret0 ; c+= carry
+ STD l,8(r_ptr)
+
+ LDO -2(n),n
+ LDO 16(a_ptr),a_ptr
+ LDO 16(b_ptr),b_ptr
+
+ CMPIB,<= 2,n,bn_add_words_unroll2
+ LDO 16(r_ptr),r_ptr
+
+ CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
+
+bn_add_words_single_top
+ LDD 0(a_ptr),t
+ LDD 0(b_ptr),b
+
+ ADD t,%ret0,t ; t = t+c;
+ ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??)
+ ADD t,b,l ; l = t + b[0]
+ ADD,DC %ret0,%r0,%ret0 ; c+= carry
+ STD l,0(r_ptr)
+
+bn_add_words_exit
+ .EXIT
+ BVE (%rp)
+ NOP
+ .PROCEND ;in=23,24,25,26,29;out=28;
+
+;----------------------------------------------------------------------------
+;
+;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+;
+; arg0 = rp
+; arg1 = ap
+; arg2 = bp
+; arg3 = n
+
+t1 .reg %r22
+t2 .reg %r21
+sub_tmp1 .reg %r20
+sub_tmp2 .reg %r19
+
+
+bn_sub_words
+ .proc
+ .callinfo
+ .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .entry
+ .align 64
+
+ CMPIB,>= 0,n,bn_sub_words_exit
+ COPY %r0,%ret0 ; return 0 by default
+
+ ;
+ ; If 2 or more numbers do the loop
+ ;
+ CMPIB,= 1,n,bn_sub_words_single_top
+ NOP
+
+ ;
+ ; This loop is unrolled 2 times (64-byte aligned as well)
+ ;
+bn_sub_words_unroll2
+ LDD 0(a_ptr),t1
+ LDD 0(b_ptr),t2
+ SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
+ SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
+
+ CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
+ LDO 1(%r0),sub_tmp2
+
+ CMPCLR,*= t1,t2,%r0
+ COPY sub_tmp2,%ret0
+ STD sub_tmp1,0(r_ptr)
+
+ LDD 8(a_ptr),t1
+ LDD 8(b_ptr),t2
+ SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
+ SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
+ CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
+ LDO 1(%r0),sub_tmp2
+
+ CMPCLR,*= t1,t2,%r0
+ COPY sub_tmp2,%ret0
+ STD sub_tmp1,8(r_ptr)
+
+ LDO -2(n),n
+ LDO 16(a_ptr),a_ptr
+ LDO 16(b_ptr),b_ptr
+
+ CMPIB,<= 2,n,bn_sub_words_unroll2
+ LDO 16(r_ptr),r_ptr
+
+ CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
+
+bn_sub_words_single_top
+ LDD 0(a_ptr),t1
+ LDD 0(b_ptr),t2
+ SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
+ SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
+ CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
+ LDO 1(%r0),sub_tmp2
+
+ CMPCLR,*= t1,t2,%r0
+ COPY sub_tmp2,%ret0
+
+ STD sub_tmp1,0(r_ptr)
+
+bn_sub_words_exit
+ .EXIT
+ BVE (%rp)
+ NOP
+ .PROCEND ;in=23,24,25,26,29;out=28;
+
+;------------------------------------------------------------------------------
+;
+; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
+;
+; arg0 = h
+; arg1 = l
+; arg2 = d
+;
+; This is mainly just modified assembly from the compiler, thus the
+; lack of variable names.
+;
+;------------------------------------------------------------------------------
+bn_div_words
+ .proc
+ .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
+ .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .IMPORT BN_num_bits_word,CODE,NO_RELOCATION
+ .IMPORT __iob,DATA
+ .IMPORT fprintf,CODE,NO_RELOCATION
+ .IMPORT abort,CODE,NO_RELOCATION
+ .IMPORT $$div2U,MILLICODE
+ .entry
+ STD %r2,-16(%r30)
+ STD,MA %r3,352(%r30)
+ STD %r4,-344(%r30)
+ STD %r5,-336(%r30)
+ STD %r6,-328(%r30)
+ STD %r7,-320(%r30)
+ STD %r8,-312(%r30)
+ STD %r9,-304(%r30)
+ STD %r10,-296(%r30)
+
+ STD %r27,-288(%r30) ; save gp
+
+ COPY %r24,%r3 ; save d
+ COPY %r26,%r4 ; save h (high 64-bits)
+ LDO -1(%r0),%ret0 ; return -1 by default
+
+ CMPB,*= %r0,%arg2,$D3 ; if (d == 0)
+ COPY %r25,%r5 ; save l (low 64-bits)
+
+ LDO -48(%r30),%r29 ; create ap
+ .CALL ;in=26,29;out=28;
+ B,L BN_num_bits_word,%r2
+ COPY %r3,%r26
+ LDD -288(%r30),%r27 ; restore gp
+ LDI 64,%r21
+
+ CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward)
+ COPY %ret0,%r24 ; i
+ MTSARCM %r24
+ DEPDI,Z -1,%sar,1,%r29
+ CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward)
+
+$00000012
+ SUBI 64,%r24,%r31 ; i = 64 - i;
+ CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d)
+ SUB %r4,%r3,%r4 ; h -= d
+ CMPB,= %r31,%r0,$0000001A ; if (i)
+ COPY %r0,%r10 ; ret = 0
+ MTSARCM %r31 ; i to shift
+ DEPD,Z %r3,%sar,64,%r3 ; d <<= i;
+ SUBI 64,%r31,%r19 ; 64 - i; redundent
+ MTSAR %r19 ; (64 -i) to shift
+ SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i)
+ MTSARCM %r31 ; i to shift
+ DEPD,Z %r5,%sar,64,%r5 ; l <<= i;
+
+$0000001A
+ DEPDI,Z -1,31,32,%r19
+ EXTRD,U %r3,31,32,%r6 ; dh=(d&0xfff)>>32
+ EXTRD,U %r3,63,32,%r8 ; dl = d&0xffffff
+ LDO 2(%r0),%r9
+ STD %r3,-280(%r30) ; "d" to stack
+
+$0000001C
+ DEPDI,Z -1,63,32,%r29 ;
+ EXTRD,U %r4,31,32,%r31 ; h >> 32
+ CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div
+ COPY %r4,%r26
+ EXTRD,U %r4,31,32,%r25
+ COPY %r6,%r24
+ .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
+ B,L $$div2U,%r2
+ EXTRD,U %r6,31,32,%r23
+ DEPD %r28,31,32,%r29
+$D2
+ STD %r29,-272(%r30) ; q
+ AND %r5,%r19,%r24 ; t & 0xffffffff00000000;
+ EXTRD,U %r24,31,32,%r24 ; ???
+ FLDD -272(%r30),%fr7 ; q
+ FLDD -280(%r30),%fr8 ; d
+ XMPYU %fr8L,%fr7L,%fr10
+ FSTD %fr10,-256(%r30)
+ XMPYU %fr8L,%fr7R,%fr22
+ FSTD %fr22,-264(%r30)
+ XMPYU %fr8R,%fr7L,%fr11
+ XMPYU %fr8R,%fr7R,%fr23
+ FSTD %fr11,-232(%r30)
+ FSTD %fr23,-240(%r30)
+ LDD -256(%r30),%r28
+ DEPD,Z %r28,31,32,%r2
+ LDD -264(%r30),%r20
+ ADD,L %r20,%r2,%r31
+ LDD -232(%r30),%r22
+ DEPD,Z %r22,31,32,%r22
+ LDD -240(%r30),%r21
+ B $00000024 ; enter loop
+ ADD,L %r21,%r22,%r23
+
+$0000002A
+ LDO -1(%r29),%r29
+ SUB %r23,%r8,%r23
+$00000024
+ SUB %r4,%r31,%r25
+ AND %r25,%r19,%r26
+ CMPB,*<>,N %r0,%r26,$00000046 ; (forward)
+ DEPD,Z %r25,31,32,%r20
+ OR %r20,%r24,%r21
+ CMPB,*<<,N %r21,%r23,$0000002A ;(backward)
+ SUB %r31,%r6,%r31
+;-------------Break path---------------------
+
+$00000046
+ DEPD,Z %r23,31,32,%r25 ;tl
+ EXTRD,U %r23,31,32,%r26 ;t
+ AND %r25,%r19,%r24 ;tl = (tl<<32)&0xfffffff0000000L
+ ADD,L %r31,%r26,%r31 ;th += t;
+ CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl)
+ LDO 1(%r31),%r31 ; th++;
+ CMPB,*<<=,N %r31,%r4,$00000036 ;if (n < th) (forward)
+ LDO -1(%r29),%r29 ;q--;
+ ADD,L %r4,%r3,%r4 ;h += d;
+$00000036
+ ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward)
+ SUB %r5,%r24,%r28 ; l -= tl;
+ SUB %r4,%r31,%r24 ; h -= th;
+ SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32));
+ DEPD,Z %r29,31,32,%r10 ; ret = q<<32
+ b $0000001C
+ DEPD,Z %r28,31,32,%r5 ; l = l << 32
+
+$D1
+ OR %r10,%r29,%r28 ; ret |= q
+$D3
+ LDD -368(%r30),%r2
+$D0
+ LDD -296(%r30),%r10
+ LDD -304(%r30),%r9
+ LDD -312(%r30),%r8
+ LDD -320(%r30),%r7
+ LDD -328(%r30),%r6
+ LDD -336(%r30),%r5
+ LDD -344(%r30),%r4
+ BVE (%r2)
+ .EXIT
+ LDD,MB -352(%r30),%r3
+
+bn_div_err_case
+ MFIA %r6
+ ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1
+ LDO R'bn_div_words-bn_div_err_case(%r1),%r6
+ ADDIL LT'__iob,%r27,%r1
+ LDD RT'__iob(%r1),%r26
+ ADDIL L'C$4-bn_div_words,%r6,%r1
+ LDO R'C$4-bn_div_words(%r1),%r25
+ LDO 64(%r26),%r26
+ .CALL ;in=24,25,26,29;out=28;
+ B,L fprintf,%r2
+ LDO -48(%r30),%r29
+ LDD -288(%r30),%r27
+ .CALL ;in=29;
+ B,L abort,%r2
+ LDO -48(%r30),%r29
+ LDD -288(%r30),%r27
+ B $D0
+ LDD -368(%r30),%r2
+ .PROCEND ;in=24,25,26,29;out=28;
+
+;----------------------------------------------------------------------------
+;
+; Registers to hold 64-bit values to manipulate. The "L" part
+; of the register corresponds to the upper 32-bits, while the "R"
+; part corresponds to the lower 32-bits
+;
+; Note, that when using b6 and b7, the code must save these before
+; using them because they are callee save registers
+;
+;
+; Floating point registers to use to save values that
+; are manipulated. These don't collide with ftemp1-6 and
+; are all caller save registers
+;
+a0 .reg %fr22
+a0L .reg %fr22L
+a0R .reg %fr22R
+
+a1 .reg %fr23
+a1L .reg %fr23L
+a1R .reg %fr23R
+
+a2 .reg %fr24
+a2L .reg %fr24L
+a2R .reg %fr24R
+
+a3 .reg %fr25
+a3L .reg %fr25L
+a3R .reg %fr25R
+
+a4 .reg %fr26
+a4L .reg %fr26L
+a4R .reg %fr26R
+
+a5 .reg %fr27
+a5L .reg %fr27L
+a5R .reg %fr27R
+
+a6 .reg %fr28
+a6L .reg %fr28L
+a6R .reg %fr28R
+
+a7 .reg %fr29
+a7L .reg %fr29L
+a7R .reg %fr29R
+
+b0 .reg %fr30
+b0L .reg %fr30L
+b0R .reg %fr30R
+
+b1 .reg %fr31
+b1L .reg %fr31L
+b1R .reg %fr31R
+
+;
+; Temporary floating point variables, these are all caller save
+; registers
+;
+ftemp1 .reg %fr4
+ftemp2 .reg %fr5
+ftemp3 .reg %fr6
+ftemp4 .reg %fr7
+
+;
+; The B set of registers when used.
+;
+
+b2 .reg %fr8
+b2L .reg %fr8L
+b2R .reg %fr8R
+
+b3 .reg %fr9
+b3L .reg %fr9L
+b3R .reg %fr9R
+
+b4 .reg %fr10
+b4L .reg %fr10L
+b4R .reg %fr10R
+
+b5 .reg %fr11
+b5L .reg %fr11L
+b5R .reg %fr11R
+
+b6 .reg %fr12
+b6L .reg %fr12L
+b6R .reg %fr12R
+
+b7 .reg %fr13
+b7L .reg %fr13L
+b7R .reg %fr13R
+
+c1 .reg %r21 ; only reg
+temp1 .reg %r20 ; only reg
+temp2 .reg %r19 ; only reg
+temp3 .reg %r31 ; only reg
+
+m1 .reg %r28
+c2 .reg %r23
+high_one .reg %r1
+ht .reg %r6
+lt .reg %r5
+m .reg %r4
+c3 .reg %r3
+
+SQR_ADD_C .macro A0L,A0R,C1,C2,C3
+ XMPYU A0L,A0R,ftemp1 ; m
+ FSTD ftemp1,-24(%sp) ; store m
+
+ XMPYU A0R,A0R,ftemp2 ; lt
+ FSTD ftemp2,-16(%sp) ; store lt
+
+ XMPYU A0L,A0L,ftemp3 ; ht
+ FSTD ftemp3,-8(%sp) ; store ht
+
+ LDD -24(%sp),m ; load m
+ AND m,high_mask,temp2 ; m & Mask
+ DEPD,Z m,30,31,temp3 ; m << 32+1
+ LDD -16(%sp),lt ; lt
+
+ LDD -8(%sp),ht ; ht
+ EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
+ ADD temp3,lt,lt ; lt = lt+m
+ ADD,L ht,temp1,ht ; ht += temp1
+ ADD,DC ht,%r0,ht ; ht++
+
+ ADD C1,lt,C1 ; c1=c1+lt
+ ADD,DC ht,%r0,ht ; ht++
+
+ ADD C2,ht,C2 ; c2=c2+ht
+ ADD,DC C3,%r0,C3 ; c3++
+.endm
+
+SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
+ XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
+ FSTD ftemp1,-16(%sp) ;
+ XMPYU A0R,A1L,ftemp2 ; m = bh*lt
+ FSTD ftemp2,-8(%sp) ;
+ XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
+ FSTD ftemp3,-32(%sp)
+ XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
+ FSTD ftemp4,-24(%sp) ;
+
+ LDD -8(%sp),m ; r21 = m
+ LDD -16(%sp),m1 ; r19 = m1
+ ADD,L m,m1,m ; m+m1
+
+ DEPD,Z m,31,32,temp3 ; (m+m1<<32)
+ LDD -24(%sp),ht ; r24 = ht
+
+ CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
+ ADD,L ht,high_one,ht ; ht+=high_one
+
+ EXTRD,U m,31,32,temp1 ; m >> 32
+ LDD -32(%sp),lt ; lt
+ ADD,L ht,temp1,ht ; ht+= m>>32
+ ADD lt,temp3,lt ; lt = lt+m1
+ ADD,DC ht,%r0,ht ; ht++
+
+ ADD ht,ht,ht ; ht=ht+ht;
+ ADD,DC C3,%r0,C3 ; add in carry (c3++)
+
+ ADD lt,lt,lt ; lt=lt+lt;
+ ADD,DC ht,%r0,ht ; add in carry (ht++)
+
+ ADD C1,lt,C1 ; c1=c1+lt
+ ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
+ LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
+
+ ADD C2,ht,C2 ; c2 = c2 + ht
+ ADD,DC C3,%r0,C3 ; add in carry (c3++)
+.endm
+
+;
+;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+; arg0 = r_ptr
+; arg1 = a_ptr
+;
+
+bn_sqr_comba8
+ .PROC
+ .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+ .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .ENTRY
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ STD %r5,16(%sp) ; save r5
+ STD %r6,24(%sp) ; save r6
+
+ ;
+ ; Zero out carries
+ ;
+ COPY %r0,c1
+ COPY %r0,c2
+ COPY %r0,c3
+
+ LDO 128(%sp),%sp ; bump stack
+ DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
+ DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
+
+ ;
+ ; Load up all of the values we are going to use
+ ;
+ FLDD 0(a_ptr),a0
+ FLDD 8(a_ptr),a1
+ FLDD 16(a_ptr),a2
+ FLDD 24(a_ptr),a3
+ FLDD 32(a_ptr),a4
+ FLDD 40(a_ptr),a5
+ FLDD 48(a_ptr),a6
+ FLDD 56(a_ptr),a7
+
+ SQR_ADD_C a0L,a0R,c1,c2,c3
+ STD c1,0(r_ptr) ; r[0] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
+ STD c2,8(r_ptr) ; r[1] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C a1L,a1R,c3,c1,c2
+ SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
+ STD c3,16(r_ptr) ; r[2] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
+ SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
+ STD c1,24(r_ptr) ; r[3] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C a2L,a2R,c2,c3,c1
+ SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
+ SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
+ STD c2,32(r_ptr) ; r[4] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
+ SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
+ SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
+ STD c3,40(r_ptr) ; r[5] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C a3L,a3R,c1,c2,c3
+ SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
+ SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
+ SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
+ STD c1,48(r_ptr) ; r[6] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
+ SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
+ SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
+ SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
+ STD c2,56(r_ptr) ; r[7] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C a4L,a4R,c3,c1,c2
+ SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
+ SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
+ SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
+ STD c3,64(r_ptr) ; r[8] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
+ SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
+ SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
+ STD c1,72(r_ptr) ; r[9] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C a5L,a5R,c2,c3,c1
+ SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
+ SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
+ STD c2,80(r_ptr) ; r[10] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
+ SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
+ STD c3,88(r_ptr) ; r[11] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C a6L,a6R,c1,c2,c3
+ SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
+ STD c1,96(r_ptr) ; r[12] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
+ STD c2,104(r_ptr) ; r[13] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C a7L,a7R,c3,c1,c2
+ STD c3, 112(r_ptr) ; r[14] = c3
+ STD c1, 120(r_ptr) ; r[15] = c1
+
+ .EXIT
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3
+
+ .PROCEND
+
+;-----------------------------------------------------------------------------
+;
+;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+; arg0 = r_ptr
+; arg1 = a_ptr
+;
+
+bn_sqr_comba4
+ .proc
+ .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+ .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .entry
+ .align 64
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ STD %r5,16(%sp) ; save r5
+ STD %r6,24(%sp) ; save r6
+
+ ;
+ ; Zero out carries
+ ;
+ COPY %r0,c1
+ COPY %r0,c2
+ COPY %r0,c3
+
+ LDO 128(%sp),%sp ; bump stack
+ DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
+ DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
+
+ ;
+ ; Load up all of the values we are going to use
+ ;
+ FLDD 0(a_ptr),a0
+ FLDD 8(a_ptr),a1
+ FLDD 16(a_ptr),a2
+ FLDD 24(a_ptr),a3
+ FLDD 32(a_ptr),a4
+ FLDD 40(a_ptr),a5
+ FLDD 48(a_ptr),a6
+ FLDD 56(a_ptr),a7
+
+ SQR_ADD_C a0L,a0R,c1,c2,c3
+
+ STD c1,0(r_ptr) ; r[0] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
+
+ STD c2,8(r_ptr) ; r[1] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C a1L,a1R,c3,c1,c2
+ SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
+
+ STD c3,16(r_ptr) ; r[2] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
+ SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
+
+ STD c1,24(r_ptr) ; r[3] = c1;
+ COPY %r0,c1
+
+ SQR_ADD_C a2L,a2R,c2,c3,c1
+ SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
+
+ STD c2,32(r_ptr) ; r[4] = c2;
+ COPY %r0,c2
+
+ SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
+ STD c3,40(r_ptr) ; r[5] = c3;
+ COPY %r0,c3
+
+ SQR_ADD_C a3L,a3R,c1,c2,c3
+ STD c1,48(r_ptr) ; r[6] = c1;
+ STD c2,56(r_ptr) ; r[7] = c2;
+
+ .EXIT
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3
+
+ .PROCEND
+
+
+;---------------------------------------------------------------------------
+
+MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
+ XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
+ FSTD ftemp1,-16(%sp) ;
+ XMPYU A0R,B0L,ftemp2 ; m = bh*lt
+ FSTD ftemp2,-8(%sp) ;
+ XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
+ FSTD ftemp3,-32(%sp)
+ XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
+ FSTD ftemp4,-24(%sp) ;
+
+ LDD -8(%sp),m ; r21 = m
+ LDD -16(%sp),m1 ; r19 = m1
+ ADD,L m,m1,m ; m+m1
+
+ DEPD,Z m,31,32,temp3 ; (m+m1<<32)
+ LDD -24(%sp),ht ; r24 = ht
+
+ CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
+ ADD,L ht,high_one,ht ; ht+=high_one
+
+ EXTRD,U m,31,32,temp1 ; m >> 32
+ LDD -32(%sp),lt ; lt
+ ADD,L ht,temp1,ht ; ht+= m>>32
+ ADD lt,temp3,lt ; lt = lt+m1
+ ADD,DC ht,%r0,ht ; ht++
+
+ ADD C1,lt,C1 ; c1=c1+lt
+ ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise
+
+ ADD C2,ht,C2 ; c2 = c2 + ht
+ ADD,DC C3,%r0,C3 ; add in carry (c3++)
+.endm
+
+
+;
+;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+; arg0 = r_ptr
+; arg1 = a_ptr
+; arg2 = b_ptr
+;
+
+bn_mul_comba8
+ .proc
+ .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+ .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .entry
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ STD %r5,16(%sp) ; save r5
+ STD %r6,24(%sp) ; save r6
+ FSTD %fr12,32(%sp) ; save r6
+ FSTD %fr13,40(%sp) ; save r7
+
+ ;
+ ; Zero out carries
+ ;
+ COPY %r0,c1
+ COPY %r0,c2
+ COPY %r0,c3
+
+ LDO 128(%sp),%sp ; bump stack
+ DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
+
+ ;
+ ; Load up all of the values we are going to use
+ ;
+ FLDD 0(a_ptr),a0
+ FLDD 8(a_ptr),a1
+ FLDD 16(a_ptr),a2
+ FLDD 24(a_ptr),a3
+ FLDD 32(a_ptr),a4
+ FLDD 40(a_ptr),a5
+ FLDD 48(a_ptr),a6
+ FLDD 56(a_ptr),a7
+
+ FLDD 0(b_ptr),b0
+ FLDD 8(b_ptr),b1
+ FLDD 16(b_ptr),b2
+ FLDD 24(b_ptr),b3
+ FLDD 32(b_ptr),b4
+ FLDD 40(b_ptr),b5
+ FLDD 48(b_ptr),b6
+ FLDD 56(b_ptr),b7
+
+ MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
+ STD c1,0(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
+ MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
+ STD c2,8(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
+ MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
+ MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
+ STD c3,16(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
+ MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
+ MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
+ MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
+ STD c1,24(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
+ MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
+ MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
+ MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
+ MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
+ STD c2,32(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
+ MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
+ MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
+ MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
+ MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
+ MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
+ STD c3,40(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
+ MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
+ MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
+ MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
+ MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
+ MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
+ MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
+ STD c1,48(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
+ MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
+ MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
+ MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
+ MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
+ MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
+ MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
+ MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
+ STD c2,56(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
+ MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
+ MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
+ MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
+ MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
+ MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
+ MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
+ STD c3,64(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
+ MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
+ MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
+ MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
+ MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
+ MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
+ STD c1,72(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
+ MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
+ MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
+ MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
+ MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
+ STD c2,80(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
+ MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
+ MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
+ MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
+ STD c3,88(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
+ MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
+ MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
+ STD c1,96(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
+ MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
+ STD c2,104(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
+ STD c3,112(r_ptr)
+ STD c1,120(r_ptr)
+
+ .EXIT
+ FLDD -88(%sp),%fr13
+ FLDD -96(%sp),%fr12
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3
+
+ .PROCEND
+
+;-----------------------------------------------------------------------------
+;
+;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+; arg0 = r_ptr
+; arg1 = a_ptr
+; arg2 = b_ptr
+;
+
+bn_mul_comba4
+ .proc
+ .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+ .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+ .entry
+ .align 64
+
+ STD %r3,0(%sp) ; save r3
+ STD %r4,8(%sp) ; save r4
+ STD %r5,16(%sp) ; save r5
+ STD %r6,24(%sp) ; save r6
+ FSTD %fr12,32(%sp) ; save r6
+ FSTD %fr13,40(%sp) ; save r7
+
+ ;
+ ; Zero out carries
+ ;
+ COPY %r0,c1
+ COPY %r0,c2
+ COPY %r0,c3
+
+ LDO 128(%sp),%sp ; bump stack
+ DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
+
+ ;
+ ; Load up all of the values we are going to use
+ ;
+ FLDD 0(a_ptr),a0
+ FLDD 8(a_ptr),a1
+ FLDD 16(a_ptr),a2
+ FLDD 24(a_ptr),a3
+
+ FLDD 0(b_ptr),b0
+ FLDD 8(b_ptr),b1
+ FLDD 16(b_ptr),b2
+ FLDD 24(b_ptr),b3
+
+ MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
+ STD c1,0(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
+ MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
+ STD c2,8(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
+ MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
+ MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
+ STD c3,16(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
+ MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
+ MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
+ MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
+ STD c1,24(r_ptr)
+ COPY %r0,c1
+
+ MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
+ MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
+ MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
+ STD c2,32(r_ptr)
+ COPY %r0,c2
+
+ MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
+ MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
+ STD c3,40(r_ptr)
+ COPY %r0,c3
+
+ MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
+ STD c1,48(r_ptr)
+ STD c2,56(r_ptr)
+
+ .EXIT
+ FLDD -88(%sp),%fr13
+ FLDD -96(%sp),%fr12
+ LDD -104(%sp),%r6 ; restore r6
+ LDD -112(%sp),%r5 ; restore r5
+ LDD -120(%sp),%r4 ; restore r4
+ BVE (%rp)
+ LDD,MB -128(%sp),%r3
+
+ .PROCEND
+
+
+ .SPACE $TEXT$
+ .SUBSPA $CODE$
+ .SPACE $PRIVATE$,SORT=16
+ .IMPORT $global$,DATA
+ .SPACE $TEXT$
+ .SUBSPA $CODE$
+ .SUBSPA $LIT$,ACCESS=0x2c
+C$4
+ .ALIGN 8
+ .STRINGZ "Division would overflow (%d)\n"
+ .END
diff --git a/openssl-1.1.0h/crypto/bn/asm/parisc-mont.pl b/openssl-1.1.0h/crypto/bn/asm/parisc-mont.pl
new file mode 100644
index 0000000..8aa94e8
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/parisc-mont.pl
@@ -0,0 +1,1002 @@
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# On PA-7100LC this module performs ~90-50% better, less for longer
+# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
+# that compiler utilized xmpyu instruction to perform 32x32=64-bit
+# multiplication, which in turn means that "baseline" performance was
+# optimal in respect to instruction set capabilities. Fair comparison
+# with vendor compiler is problematic, because OpenSSL doesn't define
+# BN_LLONG [presumably] for historical reasons, which drives compiler
+# toward 4 times 16x16=32-bit multiplicatons [plus complementary
+# shifts and additions] instead. This means that you should observe
+# several times improvement over code generated by vendor compiler
+# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
+# improvement coefficient was never collected on PA-7100LC, or any
+# other 1.1 CPU, because I don't have access to such machine with
+# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
+# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
+# of ~5x on PA-8600.
+#
+# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
+# reportedly ~2x faster than vendor compiler generated code [according
+# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
+# this implementation is actually 32-bit one, in the sense that it
+# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
+# 64-bit BN_LONGs... How do they interoperate then? No problem. This
+# module picks halves of 64-bit values in reverse order and pretends
+# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
+# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
+# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
+# i.e. there is no "wider" multiplication like on most other 64-bit
+# platforms. This means that even being effectively 32-bit, this
+# implementation performs "64-bit" computational task in same amount
+# of arithmetic operations, most notably multiplications. It requires
+# more memory references, most notably to tp[num], but this doesn't
+# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
+# 2.0 code path provides virtually same performance as pa-risc2[W].s:
+# it's ~10% better for shortest key length and ~10% worse for longest
+# one.
+#
+# In case it wasn't clear. The module has two distinct code paths:
+# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
+# additions and 64-bit integer loads, not to mention specific
+# instruction scheduling. In 64-bit build naturally only 2.0 code path
+# is assembled. In 32-bit application context both code paths are
+# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
+# is taken automatically. Also, in 32-bit build the module imposes
+# couple of limitations: vector lengths has to be even and vector
+# addresses has to be 64-bit aligned. Normally neither is a problem:
+# most common key lengths are even and vectors are commonly malloc-ed,
+# which ensures alignment.
+#
+# Special thanks to polarhome.com for providing HP-UX account on
+# PA-RISC 1.1 machine, and to correspondent who chose to remain
+# anonymous for testing the code on PA-RISC 2.0 machine.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+ $BN_SZ =$SIZE_T;
+} else {
+ $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+ $BN_SZ =$SIZE_T;
+ if (open CONF,"<${dir}../../opensslconf.h") {
+ while(<CONF>) {
+ if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
+ $BN_SZ=8;
+ $LEVEL="2.0";
+ last;
+ }
+ }
+ close CONF;
+ }
+}
+
+$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
+ # [+ argument transfer]
+$LOCALS=$FRAME-$FRAME_MARKER;
+$FRAME+=32; # local variables
+
+$tp="%r31";
+$ti1="%r29";
+$ti0="%r28";
+
+$rp="%r26";
+$ap="%r25";
+$bp="%r24";
+$np="%r23";
+$n0="%r22"; # passed through stack in 32-bit
+$num="%r21"; # passed through stack in 32-bit
+$idx="%r20";
+$arrsz="%r19";
+
+$nm1="%r7";
+$nm0="%r6";
+$ab1="%r5";
+$ab0="%r4";
+
+$fp="%r3";
+$hi1="%r2";
+$hi0="%r1";
+
+$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s
+
+$fm0="%fr4"; $fti=$fm0;
+$fbi="%fr5L";
+$fn0="%fr5R";
+$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
+$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+ .ALIGN 64
+bn_mul_mont
+ .PROC
+ .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ ldo -$FRAME(%sp),$fp
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldw `-$FRAME_MARKER-4`($fp),$n0
+ ldw `-$FRAME_MARKER-8`($fp),$num
+ nop
+ nop ; alignment
+___
+$code.=<<___ if ($BN_SZ==4);
+ comiclr,<= 6,$num,%r0 ; are vectors long enough?
+ b L\$abort
+ ldi 0,%r28 ; signal "unhandled"
+ add,ev %r0,$num,$num ; is $num even?
+ b L\$abort
+ nop
+ or $ap,$np,$ti1
+ extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
+ b L\$abort
+ nop
+ nop ; alignment
+ nop
+
+ fldws 0($n0),${fn0}
+ fldws,ma 4($bp),${fbi} ; bp[0]
+___
+$code.=<<___ if ($BN_SZ==8);
+ comib,> 3,$num,L\$abort ; are vectors long enough?
+ ldi 0,%r28 ; signal "unhandled"
+ addl $num,$num,$num ; I operate on 32-bit values
+
+ fldws 4($n0),${fn0} ; only low part of n0
+ fldws 4($bp),${fbi} ; bp[0] in flipped word order
+___
+$code.=<<___;
+ fldds 0($ap),${fai} ; ap[0,1]
+ fldds 0($np),${fni} ; np[0,1]
+
+ sh2addl $num,%r0,$arrsz
+ ldi 31,$hi0
+ ldo 36($arrsz),$hi1 ; space for tp[num+1]
+ andcm $hi1,$hi0,$hi1 ; align
+ addl $hi1,%sp,%sp
+ $PUSH $fp,-$SIZE_T(%sp)
+
+ ldo `$LOCALS+16`($fp),$xfer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ addl $arrsz,$ap,$ap ; point at the end
+ addl $arrsz,$np,$np
+ subi 0,$arrsz,$idx ; j=0
+ ldo 8($idx),$idx ; j++++
+
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ fstds ${fab1},0($xfer)
+ fstds ${fnm1},8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+___
+$code.=<<___ if ($BN_SZ==4);
+ mtctl $hi0,%cr11 ; $hi0 still holds 31
+ extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
+ b L\$parisc11
+ nop
+___
+$code.=<<___; # PA-RISC 2.0 code-path
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $ab0,63,32,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ ldo 8($idx),$idx ; j++++
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ extrd,u $nm0,31,32,$hi1
+
+L\$1st
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ stw $nm1,-4($tp) ; tp[j-1]
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ addl $ab1,$nm1,$nm1
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[1]
+___
+$code.=<<___ if ($BN_SZ==8);
+ fldws 0($bp),${fbi} ; bp[1] in flipped word order
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ addl $hi1,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2]
+ flddx $idx($np),${fni} ; np[2]
+ ldo 8($idx),$idx ; j++++
+ ldd -16($xfer),$ab0 ; 33-bit value
+ ldd -8($xfer),$nm0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ extrd,u $ab0,31,32,$ti0 ; carry bit
+ extrd,u $ab0,63,32,$ab0
+ fstds ${fab1},0($xfer)
+ addl $ti0,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ extrd,u $nm0,31,32,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+L\$inner
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldw 4($tp),$ti0 ; tp[j]
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ti0,$ti0
+ addl $ti0,$ab0,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $nm1,31,32,$hi1
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldw 4($tp),$ti0 ; tp[j]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ addl $ti0,$ab0,$ab0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,31,32,$hi0
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[i]
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldi 12,$ti0 ; bp[i] in flipped word order
+ addl,ev %r0,$num,$num
+ ldi -4,$ti0
+ addl $ti0,$bp,$bp
+ fldws 0($bp),${fbi}
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0]
+ addl $hi0,$ab1,$ab1
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ addl $hi1,$hi0,$hi0
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone
+ addl $hi0,$ab1,$ab1
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ addl $hi1,$hi0,$hi0
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+___
+$code.=<<___ if ($BN_SZ==4);
+ ldws,ma 4($tp),$ti0
+ extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
+ b L\$sub_pa11
+ addl $tp,$arrsz,$tp
+L\$sub
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldd,ma 8($tp),$ti0
+L\$sub
+ ldd $idx($np),$hi0
+ shrpd $ti0,$ti0,32,$ti0 ; flip word order
+ std $ti0,-8($tp) ; save flipped value
+ sub,db $ti0,$hi0,$hi1
+ ldd,ma 8($tp),$ti0
+ addib,<> 8,$idx,L\$sub
+ std,ma $hi1,8($rp)
+
+ extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
+ sub,db $ti0,%r0,$hi1
+ ldo -8($tp),$tp
+___
+$code.=<<___;
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy
+ ldd $idx($np),$hi0
+ std,ma %r0,8($tp)
+ addib,<> 8,$idx,.-8 ; L\$copy
+ std,ma $hi0,8($rp)
+___
+
+if ($BN_SZ==4) { # PA-RISC 1.1 code-path
+$ablo=$ab0;
+$abhi=$ab1;
+$nmlo0=$nm0;
+$nmhi0=$nm1;
+$nmlo1="%r9";
+$nmhi1="%r8";
+
+$code.=<<___;
+ b L\$done
+ nop
+
+ .ALIGN 8
+L\$parisc11
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -12($xfer),$ablo
+ ldw -16($xfer),$hi0
+ ldw -4($xfer),$nmlo0
+ ldw -8($xfer),$nmhi0
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+ ldo 8($idx),$idx ; j++++
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ addc %r0,$nmhi0,$hi1
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+ nop
+
+L\$1st_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 12($xfer),$nmlo1
+ addc %r0,$abhi,$hi0
+ ldw 8($xfer),$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fab1},0($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ fstds ${fnm1},8($xfer)
+ add $hi1,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$hi1
+ ldw -16($xfer),$abhi
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ ldw -4($xfer),$nmlo0
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -8($xfer),$nmhi0
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ fstds ${fab0},-16($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ fstds ${fnm0},-8($xfer)
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ ldw 8($xfer),$nmhi1
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ add $hi0,$ablo,$ablo
+ fstds ${fab1},0($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -8($xfer),$nmhi0
+ addc %r0,$nmhi1,$hi1
+ ldw -4($xfer),$nmlo0
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[1]
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer_pa11
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+ ldw -16($xfer),$abhi ; carry bit actually
+ ldo 8($idx),$idx ; j++++
+ ldw -12($xfer),$ablo
+ ldw -8($xfer),$nmhi0
+ ldw -4($xfer),$nmlo0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ fstds ${fab1},0($xfer)
+ addl $abhi,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ addc %r0,$nmhi0,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+
+L\$inner_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ ldw 12($xfer),$nmlo1
+ add $ti1,$ablo,$ablo
+ ldw 8($xfer),$nmhi1
+ addc %r0,$abhi,$hi0
+ fstds ${fab1},0($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fnm1},8($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ ldw 8($tp),$ti1 ; tp[j]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -4($xfer),$nmlo0
+ add $hi0,$ablo,$ablo
+ ldw -8($xfer),$nmhi0
+ addc %r0,$abhi,$abhi
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ add $ti0,$ablo,$ablo
+ fstds ${fab0},-16($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm0},-8($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldw 8($xfer),$nmhi1
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ fstds ${fab1},0($xfer)
+ add $ti1,$ablo,$ablo
+ fstds ${fnm1},8($xfer)
+ addc %r0,$abhi,$hi0
+ ldw -16($xfer),$abhi
+ add $ablo,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$nmhi1
+ ldw -8($xfer),$nmhi0
+ add $hi1,$nmlo1,$nmlo1
+ ldw -4($xfer),$nmlo0
+ addc %r0,$nmhi1,$hi1
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$abhi
+ add $ti0,$ablo,$ablo
+ ldw 8($tp),$ti1 ; tp[j]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone_pa11; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[i]
+ flddx $idx($ap),${fai} ; ap[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer_pa11
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone_pa11
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32+4`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+ ldw -4($tp),$ti0
+ addl $tp,$arrsz,$tp
+L\$sub_pa11
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub_pa11
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy_pa11
+ ldwx $idx($np),$hi0
+ stws,ma %r0,4($tp)
+ addib,<> 4,$idx,L\$copy_pa11
+ stws,ma $hi0,4($rp)
+
+ nop ; alignment
+L\$done
+___
+}
+
+$code.=<<___;
+ ldi 1,%r28 ; signal "handled"
+ ldo $FRAME($fp),%sp ; destroy tp[num+1]
+
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+L\$abort
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+ .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+my $ldd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "ldd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
+ { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
+ { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+ $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $std = sub {
+ my ($mod,$args) = @_;
+ my $orig = "std$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
+ { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
+ $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $extrd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "extrd$mod\t$args";
+
+ # I only have ",u" completer, it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
+ { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+ my $len=32-$3;
+ $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
+ { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+ my $len=32-$2;
+ $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
+ $opcode |= (1<<13) if ($mod =~ /,\**=/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "shrpd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
+ { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+ my $cpos=63-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $sub = sub {
+ my ($mod,$args) = @_;
+ my $orig = "sub$mod\t$args";
+
+ if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
+ my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
+ $opcode|=(1<<10); # e1
+ $opcode|=(1<<8); # e2
+ $opcode|=(1<<5); # d
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
+ }
+ else { "\t".$orig; }
+};
+
+sub assemble {
+ my ($mnemonic,$mod,$args)=@_;
+ my $opcode = eval("\$$mnemonic");
+
+ ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ # flip word order in 64-bit mode...
+ s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
+ # assemble 2.0 instructions in 32-bit mode...
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
+
+ s/\bbv\b/bve/gm if ($SIZE_T==8);
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/ppc-mont.pl b/openssl-1.1.0h/crypto/bn/asm/ppc-mont.pl
new file mode 100644
index 0000000..5802260
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/ppc-mont.pl
@@ -0,0 +1,342 @@
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# April 2006
+
+# "Teaser" Montgomery multiplication module for PowerPC. It's possible
+# to gain a bit more by modulo-scheduling outer loop, then dedicated
+# squaring procedure should give further 20% and code can be adapted
+# for 32-bit application running on 64-bit CPU. As for the latter.
+# It won't be able to achieve "native" 64-bit performance, because in
+# 32-bit application context every addc instruction will have to be
+# expanded as addc, twice right shift by 32 and finally adde, etc.
+# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
+# for 64-bit application running on PPC970/G5 is:
+#
+# 512-bit +65%
+# 1024-bit +35%
+# 2048-bit +18%
+# 4096-bit +4%
+
+$flavour = shift;
+
+if ($flavour =~ /32/) {
+ $BITS= 32;
+ $BNSZ= $BITS/8;
+ $SIZE_T=4;
+ $RZONE= 224;
+
+ $LD= "lwz"; # load
+ $LDU= "lwzu"; # load and update
+ $LDX= "lwzx"; # load indexed
+ $ST= "stw"; # store
+ $STU= "stwu"; # store and update
+ $STX= "stwx"; # store indexed
+ $STUX= "stwux"; # store indexed and update
+ $UMULL= "mullw"; # unsigned multiply low
+ $UMULH= "mulhwu"; # unsigned multiply high
+ $UCMP= "cmplw"; # unsigned compare
+ $SHRI= "srwi"; # unsigned shift right by immediate
+ $PUSH= $ST;
+ $POP= $LD;
+} elsif ($flavour =~ /64/) {
+ $BITS= 64;
+ $BNSZ= $BITS/8;
+ $SIZE_T=8;
+ $RZONE= 288;
+
+ # same as above, but 64-bit mnemonics...
+ $LD= "ld"; # load
+ $LDU= "ldu"; # load and update
+ $LDX= "ldx"; # load indexed
+ $ST= "std"; # store
+ $STU= "stdu"; # store and update
+ $STX= "stdx"; # store indexed
+ $STUX= "stdux"; # store indexed and update
+ $UMULL= "mulld"; # unsigned multiply low
+ $UMULH= "mulhdu"; # unsigned multiply high
+ $UCMP= "cmpld"; # unsigned compare
+ $SHRI= "srdi"; # unsigned shift right by immediate
+ $PUSH= $ST;
+ $POP= $LD;
+} else { die "nonsense $flavour"; }
+
+$FRAME=8*$SIZE_T+$RZONE;
+$LOCALS=8*$SIZE_T;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$sp="r1";
+$toc="r2";
+$rp="r3"; $ovf="r3";
+$ap="r4";
+$bp="r5";
+$np="r6";
+$n0="r7";
+$num="r8";
+$rp="r9"; # $rp is reassigned
+$aj="r10";
+$nj="r11";
+$tj="r12";
+# non-volatile registers
+$i="r20";
+$j="r21";
+$tp="r22";
+$m0="r23";
+$m1="r24";
+$lo0="r25";
+$hi0="r26";
+$lo1="r27";
+$hi1="r28";
+$alo="r29";
+$ahi="r30";
+$nlo="r31";
+#
+$nhi="r0";
+
+$code=<<___;
+.machine "any"
+.text
+
+.globl .bn_mul_mont_int
+.align 4
+.bn_mul_mont_int:
+ cmpwi $num,4
+ mr $rp,r3 ; $rp is reassigned
+ li r3,0
+ bltlr
+___
+$code.=<<___ if ($BNSZ==4);
+ cmpwi $num,32 ; longer key performance is not better
+ bgelr
+___
+$code.=<<___;
+ slwi $num,$num,`log($BNSZ)/log(2)`
+ li $tj,-4096
+ addi $ovf,$num,$FRAME
+ subf $ovf,$ovf,$sp ; $sp-$ovf
+ and $ovf,$ovf,$tj ; minimize TLB usage
+ subf $ovf,$sp,$ovf ; $ovf-$sp
+ mr $tj,$sp
+ srwi $num,$num,`log($BNSZ)/log(2)`
+ $STUX $sp,$sp,$ovf
+
+ $PUSH r20,`-12*$SIZE_T`($tj)
+ $PUSH r21,`-11*$SIZE_T`($tj)
+ $PUSH r22,`-10*$SIZE_T`($tj)
+ $PUSH r23,`-9*$SIZE_T`($tj)
+ $PUSH r24,`-8*$SIZE_T`($tj)
+ $PUSH r25,`-7*$SIZE_T`($tj)
+ $PUSH r26,`-6*$SIZE_T`($tj)
+ $PUSH r27,`-5*$SIZE_T`($tj)
+ $PUSH r28,`-4*$SIZE_T`($tj)
+ $PUSH r29,`-3*$SIZE_T`($tj)
+ $PUSH r30,`-2*$SIZE_T`($tj)
+ $PUSH r31,`-1*$SIZE_T`($tj)
+
+ $LD $n0,0($n0) ; pull n0[0] value
+ addi $num,$num,-2 ; adjust $num for counter register
+
+ $LD $m0,0($bp) ; m0=bp[0]
+ $LD $aj,0($ap) ; ap[0]
+ addi $tp,$sp,$LOCALS
+ $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
+ $UMULH $hi0,$aj,$m0
+
+ $LD $aj,$BNSZ($ap) ; ap[1]
+ $LD $nj,0($np) ; np[0]
+
+ $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
+
+ $UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
+ $UMULH $ahi,$aj,$m0
+
+ $UMULL $lo1,$nj,$m1 ; np[0]*m1
+ $UMULH $hi1,$nj,$m1
+ $LD $nj,$BNSZ($np) ; np[1]
+ addc $lo1,$lo1,$lo0
+ addze $hi1,$hi1
+
+ $UMULL $nlo,$nj,$m1 ; np[1]*m1
+ $UMULH $nhi,$nj,$m1
+
+ mtctr $num
+ li $j,`2*$BNSZ`
+.align 4
+L1st:
+ $LDX $aj,$ap,$j ; ap[j]
+ addc $lo0,$alo,$hi0
+ $LDX $nj,$np,$j ; np[j]
+ addze $hi0,$ahi
+ $UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
+ addc $lo1,$nlo,$hi1
+ $UMULH $ahi,$aj,$m0
+ addze $hi1,$nhi
+ $UMULL $nlo,$nj,$m1 ; np[j]*m1
+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
+ $UMULH $nhi,$nj,$m1
+ addze $hi1,$hi1
+ $ST $lo1,0($tp) ; tp[j-1]
+
+ addi $j,$j,$BNSZ ; j++
+ addi $tp,$tp,$BNSZ ; tp++
+ bdnz L1st
+;L1st
+ addc $lo0,$alo,$hi0
+ addze $hi0,$ahi
+
+ addc $lo1,$nlo,$hi1
+ addze $hi1,$nhi
+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
+ addze $hi1,$hi1
+ $ST $lo1,0($tp) ; tp[j-1]
+
+ li $ovf,0
+ addc $hi1,$hi1,$hi0
+ addze $ovf,$ovf ; upmost overflow bit
+ $ST $hi1,$BNSZ($tp)
+
+ li $i,$BNSZ
+.align 4
+Louter:
+ $LDX $m0,$bp,$i ; m0=bp[i]
+ $LD $aj,0($ap) ; ap[0]
+ addi $tp,$sp,$LOCALS
+ $LD $tj,$LOCALS($sp); tp[0]
+ $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
+ $UMULH $hi0,$aj,$m0
+ $LD $aj,$BNSZ($ap) ; ap[1]
+ $LD $nj,0($np) ; np[0]
+ addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
+ $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
+ addze $hi0,$hi0
+ $UMULL $m1,$lo0,$n0 ; tp[0]*n0
+ $UMULH $ahi,$aj,$m0
+ $UMULL $lo1,$nj,$m1 ; np[0]*m1
+ $UMULH $hi1,$nj,$m1
+ $LD $nj,$BNSZ($np) ; np[1]
+ addc $lo1,$lo1,$lo0
+ $UMULL $nlo,$nj,$m1 ; np[1]*m1
+ addze $hi1,$hi1
+ $UMULH $nhi,$nj,$m1
+
+ mtctr $num
+ li $j,`2*$BNSZ`
+.align 4
+Linner:
+ $LDX $aj,$ap,$j ; ap[j]
+ addc $lo0,$alo,$hi0
+ $LD $tj,$BNSZ($tp) ; tp[j]
+ addze $hi0,$ahi
+ $LDX $nj,$np,$j ; np[j]
+ addc $lo1,$nlo,$hi1
+ $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
+ addze $hi1,$nhi
+ $UMULH $ahi,$aj,$m0
+ addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
+ $UMULL $nlo,$nj,$m1 ; np[j]*m1
+ addze $hi0,$hi0
+ $UMULH $nhi,$nj,$m1
+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
+ addi $j,$j,$BNSZ ; j++
+ addze $hi1,$hi1
+ $ST $lo1,0($tp) ; tp[j-1]
+ addi $tp,$tp,$BNSZ ; tp++
+ bdnz Linner
+;Linner
+ $LD $tj,$BNSZ($tp) ; tp[j]
+ addc $lo0,$alo,$hi0
+ addze $hi0,$ahi
+ addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
+ addze $hi0,$hi0
+
+ addc $lo1,$nlo,$hi1
+ addze $hi1,$nhi
+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
+ addze $hi1,$hi1
+ $ST $lo1,0($tp) ; tp[j-1]
+
+ addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
+ li $ovf,0
+ adde $hi1,$hi1,$hi0
+ addze $ovf,$ovf
+ $ST $hi1,$BNSZ($tp)
+;
+ slwi $tj,$num,`log($BNSZ)/log(2)`
+ $UCMP $i,$tj
+ addi $i,$i,$BNSZ
+ ble Louter
+
+ addi $num,$num,2 ; restore $num
+ subfc $j,$j,$j ; j=0 and "clear" XER[CA]
+ addi $tp,$sp,$LOCALS
+ mtctr $num
+
+.align 4
+Lsub: $LDX $tj,$tp,$j
+ $LDX $nj,$np,$j
+ subfe $aj,$nj,$tj ; tp[j]-np[j]
+ $STX $aj,$rp,$j
+ addi $j,$j,$BNSZ
+ bdnz Lsub
+
+ li $j,0
+ mtctr $num
+ subfe $ovf,$j,$ovf ; handle upmost overflow bit
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
+
+.align 4
+Lcopy: ; copy or in-place refresh
+ $LDX $tj,$ap,$j
+ $STX $tj,$rp,$j
+ $STX $j,$tp,$j ; zap at once
+ addi $j,$j,$BNSZ
+ bdnz Lcopy
+
+ $POP $tj,0($sp)
+ li r3,1
+ $POP r20,`-12*$SIZE_T`($tj)
+ $POP r21,`-11*$SIZE_T`($tj)
+ $POP r22,`-10*$SIZE_T`($tj)
+ $POP r23,`-9*$SIZE_T`($tj)
+ $POP r24,`-8*$SIZE_T`($tj)
+ $POP r25,`-7*$SIZE_T`($tj)
+ $POP r26,`-6*$SIZE_T`($tj)
+ $POP r27,`-5*$SIZE_T`($tj)
+ $POP r28,`-4*$SIZE_T`($tj)
+ $POP r29,`-3*$SIZE_T`($tj)
+ $POP r30,`-2*$SIZE_T`($tj)
+ $POP r31,`-1*$SIZE_T`($tj)
+ mr $sp,$tj
+ blr
+ .long 0
+ .byte 0,12,4,0,0x80,12,6,0
+ .long 0
+.size .bn_mul_mont_int,.-.bn_mul_mont_int
+
+.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/ppc.pl b/openssl-1.1.0h/crypto/bn/asm/ppc.pl
new file mode 100644
index 0000000..4ea534a
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/ppc.pl
@@ -0,0 +1,2014 @@
+#! /usr/bin/env perl
+# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# Implemented as a Perl wrapper as we want to support several different
+# architectures with single file. We pick up the target based on the
+# file name we are asked to generate.
+#
+# It should be noted though that this perl code is nothing like
+# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
+# as pre-processor to cover for platform differences in name decoration,
+# linker tables, 32-/64-bit instruction sets...
+#
+# As you might know there're several PowerPC ABI in use. Most notably
+# Linux and AIX use different 32-bit ABIs. Good news are that these ABIs
+# are similar enough to implement leaf(!) functions, which would be ABI
+# neutral. And that's what you find here: ABI neutral leaf functions.
+# In case you wonder what that is...
+#
+# AIX performance
+#
+# MEASUREMENTS WITH cc ON a 200 MhZ PowerPC 604e.
+#
+# The following is the performance of 32-bit compiler
+# generated code:
+#
+# OpenSSL 0.9.6c 21 dec 2001
+# built on: Tue Jun 11 11:06:51 EDT 2002
+# options:bn(64,32) ...
+#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
+# sign verify sign/s verify/s
+#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
+#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
+#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
+#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
+#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
+#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
+#
+# Same bechmark with this assembler code:
+#
+#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
+#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
+#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
+#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
+#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
+#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
+#
+# Number of operations increases by at almost 75%
+#
+# Here are performance numbers for 64-bit compiler
+# generated code:
+#
+# OpenSSL 0.9.6g [engine] 9 Aug 2002
+# built on: Fri Apr 18 16:59:20 EDT 2003
+# options:bn(64,64) ...
+# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
+# sign verify sign/s verify/s
+#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
+#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
+#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
+#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
+#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
+#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
+#
+# Same benchmark with this assembler code:
+#
+#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
+#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
+#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
+#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
+#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
+#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
+#
+# Again, performance increases by at about 75%
+#
+# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
+# OpenSSL 0.9.7c 30 Sep 2003
+#
+# Original code.
+#
+#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
+#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
+#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
+#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
+#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
+#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
+#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
+#
+# Same benchmark with this assembler code:
+#
+#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
+#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
+#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
+#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
+#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
+#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
+#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
+#
+# Performance increase of ~60%
+#
+# If you have comments or suggestions to improve code send
+# me a note at schari@us.ibm.com
+#
+
+$flavour = shift;
+
+if ($flavour =~ /32/) {
+ $BITS= 32;
+ $BNSZ= $BITS/8;
+ $ISA= "\"ppc\"";
+
+ $LD= "lwz"; # load
+ $LDU= "lwzu"; # load and update
+ $ST= "stw"; # store
+ $STU= "stwu"; # store and update
+ $UMULL= "mullw"; # unsigned multiply low
+ $UMULH= "mulhwu"; # unsigned multiply high
+ $UDIV= "divwu"; # unsigned divide
+ $UCMPI= "cmplwi"; # unsigned compare with immediate
+ $UCMP= "cmplw"; # unsigned compare
+ $CNTLZ= "cntlzw"; # count leading zeros
+ $SHL= "slw"; # shift left
+ $SHR= "srw"; # unsigned shift right
+ $SHRI= "srwi"; # unsigned shift right by immediate
+ $SHLI= "slwi"; # shift left by immediate
+ $CLRU= "clrlwi"; # clear upper bits
+ $INSR= "insrwi"; # insert right
+ $ROTL= "rotlwi"; # rotate left by immediate
+ $TR= "tw"; # conditional trap
+} elsif ($flavour =~ /64/) {
+ $BITS= 64;
+ $BNSZ= $BITS/8;
+ $ISA= "\"ppc64\"";
+
+ # same as above, but 64-bit mnemonics...
+ $LD= "ld"; # load
+ $LDU= "ldu"; # load and update
+ $ST= "std"; # store
+ $STU= "stdu"; # store and update
+ $UMULL= "mulld"; # unsigned multiply low
+ $UMULH= "mulhdu"; # unsigned multiply high
+ $UDIV= "divdu"; # unsigned divide
+ $UCMPI= "cmpldi"; # unsigned compare with immediate
+ $UCMP= "cmpld"; # unsigned compare
+ $CNTLZ= "cntlzd"; # count leading zeros
+ $SHL= "sld"; # shift left
+ $SHR= "srd"; # unsigned shift right
+ $SHRI= "srdi"; # unsigned shift right by immediate
+ $SHLI= "sldi"; # shift left by immediate
+ $CLRU= "clrldi"; # clear upper bits
+ $INSR= "insrdi"; # insert right
+ $ROTL= "rotldi"; # rotate left by immediate
+ $TR= "td"; # conditional trap
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$data=<<EOF;
+#--------------------------------------------------------------------
+#
+#
+#
+#
+# File: ppc32.s
+#
+# Created by: Suresh Chari
+# IBM Thomas J. Watson Research Library
+# Hawthorne, NY
+#
+#
+# Description: Optimized assembly routines for OpenSSL crypto
+# on the 32 bitPowerPC platform.
+#
+#
+# Version History
+#
+# 2. Fixed bn_add,bn_sub and bn_div_words, added comments,
+# cleaned up code. Also made a single version which can
+# be used for both the AIX and Linux compilers. See NOTE
+# below.
+# 12/05/03 Suresh Chari
+# (with lots of help from) Andy Polyakov
+##
+# 1. Initial version 10/20/02 Suresh Chari
+#
+#
+# The following file works for the xlc,cc
+# and gcc compilers.
+#
+# NOTE: To get the file to link correctly with the gcc compiler
+# you have to change the names of the routines and remove
+# the first .(dot) character. This should automatically
+# be done in the build process.
+#
+# Hand optimized assembly code for the following routines
+#
+# bn_sqr_comba4
+# bn_sqr_comba8
+# bn_mul_comba4
+# bn_mul_comba8
+# bn_sub_words
+# bn_add_words
+# bn_div_words
+# bn_sqr_words
+# bn_mul_words
+# bn_mul_add_words
+#
+# NOTE: It is possible to optimize this code more for
+# specific PowerPC or Power architectures. On the Northstar
+# architecture the optimizations in this file do
+# NOT provide much improvement.
+#
+# If you have comments or suggestions to improve code send
+# me a note at schari\@us.ibm.com
+#
+#--------------------------------------------------------------------------
+#
+# Defines to be used in the assembly code.
+#
+#.set r0,0 # we use it as storage for value of 0
+#.set SP,1 # preserved
+#.set RTOC,2 # preserved
+#.set r3,3 # 1st argument/return value
+#.set r4,4 # 2nd argument/volatile register
+#.set r5,5 # 3rd argument/volatile register
+#.set r6,6 # ...
+#.set r7,7
+#.set r8,8
+#.set r9,9
+#.set r10,10
+#.set r11,11
+#.set r12,12
+#.set r13,13 # not used, nor any other "below" it...
+
+# Declare function names to be global
+# NOTE: For gcc these names MUST be changed to remove
+# the first . i.e. for example change ".bn_sqr_comba4"
+# to "bn_sqr_comba4". This should be automatically done
+# in the build.
+
+ .globl .bn_sqr_comba4
+ .globl .bn_sqr_comba8
+ .globl .bn_mul_comba4
+ .globl .bn_mul_comba8
+ .globl .bn_sub_words
+ .globl .bn_add_words
+ .globl .bn_div_words
+ .globl .bn_sqr_words
+ .globl .bn_mul_words
+ .globl .bn_mul_add_words
+
+# .text section
+
+ .machine "any"
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sqr_comba4" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_sqr_comba4:
+#
+# Optimized version of bn_sqr_comba4.
+#
+# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+# r3 contains r
+# r4 contains a
+#
+# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
+#
+# r5,r6 are the two BN_ULONGs being multiplied.
+# r7,r8 are the results of the 32x32 giving 64 bit multiply.
+# r9,r10, r11 are the equivalents of c1,c2, c3.
+# Here's the assembly
+#
+#
+ xor r0,r0,r0 # set r0 = 0. Used in the addze
+ # instructions below
+
+ #sqr_add_c(a,0,c1,c2,c3)
+ $LD r5,`0*$BNSZ`(r4)
+ $UMULL r9,r5,r5
+ $UMULH r10,r5,r5 #in first iteration. No need
+ #to add since c1=c2=c3=0.
+ # Note c3(r11) is NOT set to 0
+ # but will be.
+
+ $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
+ # sqr_add_c2(a,1,0,c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
+ adde r8,r8,r8
+ addze r9,r0 # catch carry if any.
+ # r9= r0(=0) and carry
+
+ addc r10,r7,r10 # now add to temp result.
+ addze r11,r8 # r8 added to r11 which is 0
+ addze r9,r9
+
+ $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
+ #sqr_add_c(a,1,c3,c1,c2)
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ #sqr_add_c2(a,2,0,c3,c1,c2)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`2*$BNSZ`(r3) #r[2]=c3
+ #sqr_add_c2(a,3,0,c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r11,r0
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,2,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`3*$BNSZ`(r3) #r[3]=c1
+ #sqr_add_c(a,2,c2,c3,c1);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ #sqr_add_c2(a,3,1,c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`4*$BNSZ`(r3) #r[4]=c2
+ #sqr_add_c2(a,3,2,c3,c1,c2);
+ $LD r5,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r7,r7,r7
+ adde r8,r8,r8
+ addze r10,r0
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`5*$BNSZ`(r3) #r[5] = c3
+ #sqr_add_c(a,3,c1,c2,c3);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+
+ $ST r9,`6*$BNSZ`(r3) #r[6]=c1
+ $ST r10,`7*$BNSZ`(r3) #r[7]=c2
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .bn_sqr_comba4,.-.bn_sqr_comba4
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sqr_comba8" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_sqr_comba8:
+#
+# This is an optimized version of the bn_sqr_comba8 routine.
+# Tightly uses the adde instruction
+#
+#
+# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+# r3 contains r
+# r4 contains a
+#
+# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
+#
+# r5,r6 are the two BN_ULONGs being multiplied.
+# r7,r8 are the results of the 32x32 giving 64 bit multiply.
+# r9,r10, r11 are the equivalents of c1,c2, c3.
+#
+# Possible optimization of loading all 8 longs of a into registers
+# doesn't provide any speedup
+#
+
+ xor r0,r0,r0 #set r0 = 0.Used in addze
+ #instructions below.
+
+ #sqr_add_c(a,0,c1,c2,c3);
+ $LD r5,`0*$BNSZ`(r4)
+ $UMULL r9,r5,r5 #1st iteration: no carries.
+ $UMULH r10,r5,r5
+ $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
+ #sqr_add_c2(a,1,0,c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10 #add the two register number
+ adde r11,r8,r0 # (r8,r7) to the three register
+ addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
+
+ addc r10,r7,r10 #add the two register number
+ adde r11,r8,r11 # (r8,r7) to the three register
+ addze r9,r9 # number (r9,r11,r10).
+
+ $ST r10,`1*$BNSZ`(r3) # r[1]=c2
+
+ #sqr_add_c(a,1,c3,c1,c2);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ #sqr_add_c2(a,2,0,c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ $ST r11,`2*$BNSZ`(r3) #r[2]=c3
+ #sqr_add_c2(a,3,0,c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,2,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
+ #sqr_add_c(a,2,c2,c3,c1);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ #sqr_add_c2(a,3,1,c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,4,0,c2,c3,c1);
+ $LD r5,`0*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
+ #sqr_add_c2(a,5,0,c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,4,1,c3,c1,c2);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,3,2,c3,c1,c2);
+ $LD r5,`2*$BNSZ`(r4)
+ $LD r6,`3*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
+ #sqr_add_c(a,3,c1,c2,c3);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+ #sqr_add_c2(a,4,2,c1,c2,c3);
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,5,1,c1,c2,c3);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,6,0,c1,c2,c3);
+ $LD r5,`0*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
+ #sqr_add_c2(a,7,0,c2,c3,c1);
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,6,1,c2,c3,c1);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,5,2,c2,c3,c1);
+ $LD r5,`2*$BNSZ`(r4)
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,4,3,c2,c3,c1);
+ $LD r5,`3*$BNSZ`(r4)
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
+ #sqr_add_c(a,4,c3,c1,c2);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ #sqr_add_c2(a,5,3,c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,6,2,c3,c1,c2);
+ $LD r5,`2*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,7,1,c3,c1,c2);
+ $LD r5,`1*$BNSZ`(r4)
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
+ #sqr_add_c2(a,7,2,c1,c2,c3);
+ $LD r5,`2*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,6,3,c1,c2,c3);
+ $LD r5,`3*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ #sqr_add_c2(a,5,4,c1,c2,c3);
+ $LD r5,`4*$BNSZ`(r4)
+ $LD r6,`5*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
+ #sqr_add_c(a,5,c2,c3,c1);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ #sqr_add_c2(a,6,4,c2,c3,c1);
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ #sqr_add_c2(a,7,3,c2,c3,c1);
+ $LD r5,`3*$BNSZ`(r4)
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
+ #sqr_add_c2(a,7,4,c3,c1,c2);
+ $LD r5,`4*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r0
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ #sqr_add_c2(a,6,5,c3,c1,c2);
+ $LD r5,`5*$BNSZ`(r4)
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ addc r11,r7,r11
+ adde r9,r8,r9
+ addze r10,r10
+ $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
+ #sqr_add_c(a,6,c1,c2,c3);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r0
+ #sqr_add_c2(a,7,5,c1,c2,c3)
+ $LD r6,`7*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ addc r9,r7,r9
+ adde r10,r8,r10
+ addze r11,r11
+ $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
+
+ #sqr_add_c2(a,7,6,c2,c3,c1)
+ $LD r5,`6*$BNSZ`(r4)
+ $UMULL r7,r5,r6
+ $UMULH r8,r5,r6
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r0
+ addc r10,r7,r10
+ adde r11,r8,r11
+ addze r9,r9
+ $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
+ #sqr_add_c(a,7,c3,c1,c2);
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ addc r11,r7,r11
+ adde r9,r8,r9
+ $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
+ $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
+
+
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
+.size .bn_sqr_comba8,.-.bn_sqr_comba8
+
+#
+# NOTE: The following label name should be changed to
+# "bn_mul_comba4" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_mul_comba4:
+#
+# This is an optimized version of the bn_mul_comba4 routine.
+#
+# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+# r3 contains r
+# r4 contains a
+# r5 contains b
+# r6, r7 are the 2 BN_ULONGs being multiplied.
+# r8, r9 are the results of the 32x32 giving 64 multiply.
+# r10, r11, r12 are the equivalents of c1, c2, and c3.
+#
+ xor r0,r0,r0 #r0=0. Used in addze below.
+ #mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r10,r6,r7
+ $UMULH r11,r6,r7
+ $ST r10,`0*$BNSZ`(r3) #r[0]=c1
+ #mul_add_c(a[0],b[1],c2,c3,c1);
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r0
+ addze r10,r0
+ #mul_add_c(a[1],b[0],c2,c3,c1);
+ $LD r6, `1*$BNSZ`(r4)
+ $LD r7, `0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r10
+ $ST r11,`1*$BNSZ`(r3) #r[1]=c2
+ #mul_add_c(a[2],b[0],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r0
+ #mul_add_c(a[1],b[1],c3,c1,c2);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r11
+ #mul_add_c(a[0],b[2],c3,c1,c2);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r11
+ $ST r12,`2*$BNSZ`(r3) #r[2]=c3
+ #mul_add_c(a[0],b[3],c1,c2,c3);
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r0
+ #mul_add_c(a[1],b[2],c1,c2,c3);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r12
+ #mul_add_c(a[2],b[1],c1,c2,c3);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r12
+ #mul_add_c(a[3],b[0],c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+ addze r12,r12
+ $ST r10,`3*$BNSZ`(r3) #r[3]=c1
+ #mul_add_c(a[3],b[1],c2,c3,c1);
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r0
+ #mul_add_c(a[2],b[2],c2,c3,c1);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r10
+ #mul_add_c(a[1],b[3],c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r8,r11
+ adde r12,r9,r12
+ addze r10,r10
+ $ST r11,`4*$BNSZ`(r3) #r[4]=c2
+ #mul_add_c(a[2],b[3],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r0
+ #mul_add_c(a[3],b[2],c3,c1,c2);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r8,r12
+ adde r10,r9,r10
+ addze r11,r11
+ $ST r12,`5*$BNSZ`(r3) #r[5]=c3
+ #mul_add_c(a[3],b[3],c1,c2,c3);
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r8,r10
+ adde r11,r9,r11
+
+ $ST r10,`6*$BNSZ`(r3) #r[6]=c1
+ $ST r11,`7*$BNSZ`(r3) #r[7]=c2
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
+.size .bn_mul_comba4,.-.bn_mul_comba4
+
+#
+# NOTE: The following label name should be changed to
+# "bn_mul_comba8" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_mul_comba8:
+#
+# Optimized version of the bn_mul_comba8 routine.
+#
+# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+# r3 contains r
+# r4 contains a
+# r5 contains b
+# r6, r7 are the 2 BN_ULONGs being multiplied.
+# r8, r9 are the results of the 32x32 giving 64 multiply.
+# r10, r11, r12 are the equivalents of c1, c2, and c3.
+#
+ xor r0,r0,r0 #r0=0. Used in addze below.
+
+ #mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD r6,`0*$BNSZ`(r4) #a[0]
+ $LD r7,`0*$BNSZ`(r5) #b[0]
+ $UMULL r10,r6,r7
+ $UMULH r11,r6,r7
+ $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
+ #mul_add_c(a[0],b[1],c2,c3,c1);
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ addze r12,r9 # since we didn't set r12 to zero before.
+ addze r10,r0
+ #mul_add_c(a[1],b[0],c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
+ #mul_add_c(a[2],b[0],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r0
+ #mul_add_c(a[1],b[1],c3,c1,c2);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[0],b[2],c3,c1,c2);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
+ #mul_add_c(a[0],b[3],c1,c2,c3);
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r0
+ #mul_add_c(a[1],b[2],c1,c2,c3);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+
+ #mul_add_c(a[2],b[1],c1,c2,c3);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[3],b[0],c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
+ #mul_add_c(a[4],b[0],c2,c3,c1);
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r0
+ #mul_add_c(a[3],b[1],c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[2],b[2],c2,c3,c1);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[1],b[3],c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[0],b[4],c2,c3,c1);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
+ #mul_add_c(a[0],b[5],c3,c1,c2);
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r0
+ #mul_add_c(a[1],b[4],c3,c1,c2);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[2],b[3],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[3],b[2],c3,c1,c2);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[4],b[1],c3,c1,c2);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[5],b[0],c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
+ #mul_add_c(a[6],b[0],c1,c2,c3);
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r0
+ #mul_add_c(a[5],b[1],c1,c2,c3);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[4],b[2],c1,c2,c3);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[3],b[3],c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[2],b[4],c1,c2,c3);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[1],b[5],c1,c2,c3);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[0],b[6],c1,c2,c3);
+ $LD r6,`0*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ $ST r10,`6*$BNSZ`(r3) #r[6]=c1;
+ #mul_add_c(a[0],b[7],c2,c3,c1);
+ $LD r7,`7*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r0
+ #mul_add_c(a[1],b[6],c2,c3,c1);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[2],b[5],c2,c3,c1);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[3],b[4],c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[4],b[3],c2,c3,c1);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[5],b[2],c2,c3,c1);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[6],b[1],c2,c3,c1);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[7],b[0],c2,c3,c1);
+ $LD r6,`7*$BNSZ`(r4)
+ $LD r7,`0*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`7*$BNSZ`(r3) #r[7]=c2;
+ #mul_add_c(a[7],b[1],c3,c1,c2);
+ $LD r7,`1*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r0
+ #mul_add_c(a[6],b[2],c3,c1,c2);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[5],b[3],c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[4],b[4],c3,c1,c2);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[3],b[5],c3,c1,c2);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[2],b[6],c3,c1,c2);
+ $LD r6,`2*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[1],b[7],c3,c1,c2);
+ $LD r6,`1*$BNSZ`(r4)
+ $LD r7,`7*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ $ST r12,`8*$BNSZ`(r3) #r[8]=c3;
+ #mul_add_c(a[2],b[7],c1,c2,c3);
+ $LD r6,`2*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r0
+ #mul_add_c(a[3],b[6],c1,c2,c3);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[4],b[5],c1,c2,c3);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[5],b[4],c1,c2,c3);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[6],b[3],c1,c2,c3);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[7],b[2],c1,c2,c3);
+ $LD r6,`7*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ $ST r10,`9*$BNSZ`(r3) #r[9]=c1;
+ #mul_add_c(a[7],b[3],c2,c3,c1);
+ $LD r7,`3*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r0
+ #mul_add_c(a[6],b[4],c2,c3,c1);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[5],b[5],c2,c3,c1);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[4],b[6],c2,c3,c1);
+ $LD r6,`4*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ #mul_add_c(a[3],b[7],c2,c3,c1);
+ $LD r6,`3*$BNSZ`(r4)
+ $LD r7,`7*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`10*$BNSZ`(r3) #r[10]=c2;
+ #mul_add_c(a[4],b[7],c3,c1,c2);
+ $LD r6,`4*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r0
+ #mul_add_c(a[5],b[6],c3,c1,c2);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[6],b[5],c3,c1,c2);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ #mul_add_c(a[7],b[4],c3,c1,c2);
+ $LD r6,`7*$BNSZ`(r4)
+ $LD r7,`4*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ addze r11,r11
+ $ST r12,`11*$BNSZ`(r3) #r[11]=c3;
+ #mul_add_c(a[7],b[5],c1,c2,c3);
+ $LD r7,`5*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r0
+ #mul_add_c(a[6],b[6],c1,c2,c3);
+ $LD r6,`6*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ #mul_add_c(a[5],b[7],c1,c2,c3);
+ $LD r6,`5*$BNSZ`(r4)
+ $LD r7,`7*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r10,r10,r8
+ adde r11,r11,r9
+ addze r12,r12
+ $ST r10,`12*$BNSZ`(r3) #r[12]=c1;
+ #mul_add_c(a[6],b[7],c2,c3,c1);
+ $LD r6,`6*$BNSZ`(r4)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r0
+ #mul_add_c(a[7],b[6],c2,c3,c1);
+ $LD r6,`7*$BNSZ`(r4)
+ $LD r7,`6*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r11,r11,r8
+ adde r12,r12,r9
+ addze r10,r10
+ $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
+ #mul_add_c(a[7],b[7],c3,c1,c2);
+ $LD r7,`7*$BNSZ`(r5)
+ $UMULL r8,r6,r7
+ $UMULH r9,r6,r7
+ addc r12,r12,r8
+ adde r10,r10,r9
+ $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
+ $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
+.size .bn_mul_comba8,.-.bn_mul_comba8
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sub_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+#
+.align 4
+.bn_sub_words:
+#
+# Handcoded version of bn_sub_words
+#
+#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+#
+# r3 = r
+# r4 = a
+# r5 = b
+# r6 = n
+#
+# Note: No loop unrolling done since this is not a performance
+# critical loop.
+
+ xor r0,r0,r0 #set r0 = 0
+#
+# check for r6 = 0 AND set carry bit.
+#
+ subfc. r7,r0,r6 # If r6 is 0 then result is 0.
+ # if r6 > 0 then result !=0
+ # In either case carry bit is set.
+ beq Lppcasm_sub_adios
+ addi r4,r4,-$BNSZ
+ addi r3,r3,-$BNSZ
+ addi r5,r5,-$BNSZ
+ mtctr r6
+Lppcasm_sub_mainloop:
+ $LDU r7,$BNSZ(r4)
+ $LDU r8,$BNSZ(r5)
+ subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
+ # if carry = 1 this is r7-r8. Else it
+ # is r7-r8 -1 as we need.
+ $STU r6,$BNSZ(r3)
+ bdnz Lppcasm_sub_mainloop
+Lppcasm_sub_adios:
+ subfze r3,r0 # if carry bit is set then r3 = 0 else -1
+ andi. r3,r3,1 # keep only last bit.
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
+.size .bn_sub_words,.-.bn_sub_words
+
+#
+# NOTE: The following label name should be changed to
+# "bn_add_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_add_words:
+#
+# Handcoded version of bn_add_words
+#
+#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+#
+# r3 = r
+# r4 = a
+# r5 = b
+# r6 = n
+#
+# Note: No loop unrolling done since this is not a performance
+# critical loop.
+
+ xor r0,r0,r0
+#
+# check for r6 = 0. Is this needed?
+#
+ addic. r6,r6,0 #test r6 and clear carry bit.
+ beq Lppcasm_add_adios
+ addi r4,r4,-$BNSZ
+ addi r3,r3,-$BNSZ
+ addi r5,r5,-$BNSZ
+ mtctr r6
+Lppcasm_add_mainloop:
+ $LDU r7,$BNSZ(r4)
+ $LDU r8,$BNSZ(r5)
+ adde r8,r7,r8
+ $STU r8,$BNSZ(r3)
+ bdnz Lppcasm_add_mainloop
+Lppcasm_add_adios:
+ addze r3,r0 #return carry bit.
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
+.size .bn_add_words,.-.bn_add_words
+
+#
+# NOTE: The following label name should be changed to
+# "bn_div_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_div_words:
+#
+# This is a cleaned up version of code generated by
+# the AIX compiler. The only optimization is to use
+# the PPC instruction to count leading zeros instead
+# of call to num_bits_word. Since this was compiled
+# only at level -O2 we can possibly squeeze it more?
+#
+# r3 = h
+# r4 = l
+# r5 = d
+
+ $UCMPI 0,r5,0 # compare r5 and 0
+ bne Lppcasm_div1 # proceed if d!=0
+ li r3,-1 # d=0 return -1
+ blr
+Lppcasm_div1:
+ xor r0,r0,r0 #r0=0
+ li r8,$BITS
+ $CNTLZ. r7,r5 #r7 = num leading 0s in d.
+ beq Lppcasm_div2 #proceed if no leading zeros
+ subf r8,r7,r8 #r8 = BN_num_bits_word(d)
+ $SHR. r9,r3,r8 #are there any bits above r8'th?
+ $TR 16,r9,r0 #if there're, signal to dump core...
+Lppcasm_div2:
+ $UCMP 0,r3,r5 #h>=d?
+ blt Lppcasm_div3 #goto Lppcasm_div3 if not
+ subf r3,r5,r3 #h-=d ;
+Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
+ cmpi 0,0,r7,0 # is (i == 0)?
+ beq Lppcasm_div4
+ $SHL r3,r3,r7 # h = (h<< i)
+ $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
+ $SHL r5,r5,r7 # d<<=i
+ or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
+ $SHL r4,r4,r7 # l <<=i
+Lppcasm_div4:
+ $SHRI r9,r5,`$BITS/2` # r9 = dh
+ # dl will be computed when needed
+ # as it saves registers.
+ li r6,2 #r6=2
+ mtctr r6 #counter will be in count.
+Lppcasm_divouterloop:
+ $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
+ $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
+ # compute here for innerloop.
+ $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
+ bne Lppcasm_div5 # goto Lppcasm_div5 if not
+
+ li r8,-1
+ $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
+ b Lppcasm_div6
+Lppcasm_div5:
+ $UDIV r8,r3,r9 #q = h/dh
+Lppcasm_div6:
+ $UMULL r12,r9,r8 #th = q*dh
+ $CLRU r10,r5,`$BITS/2` #r10=dl
+ $UMULL r6,r8,r10 #tl = q*dl
+
+Lppcasm_divinnerloop:
+ subf r10,r12,r3 #t = h -th
+ $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
+ addic. r7,r7,0 #test if r7 == 0. used below.
+ # now want to compute
+ # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
+ # the following 2 instructions do that
+ $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
+ or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
+ $UCMP cr1,r6,r7 # compare (tl <= r7)
+ bne Lppcasm_divinnerexit
+ ble cr1,Lppcasm_divinnerexit
+ addi r8,r8,-1 #q--
+ subf r12,r9,r12 #th -=dh
+ $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
+ subf r6,r10,r6 #tl -=dl
+ b Lppcasm_divinnerloop
+Lppcasm_divinnerexit:
+ $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
+ $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
+ $UCMP cr1,r4,r11 # compare l and tl
+ add r12,r12,r10 # th+=t
+ bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
+ addi r12,r12,1 # th++
+Lppcasm_div7:
+ subf r11,r11,r4 #r11=l-tl
+ $UCMP cr1,r3,r12 #compare h and th
+ bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
+ addi r8,r8,-1 # q--
+ add r3,r5,r3 # h+=d
+Lppcasm_div8:
+ subf r12,r12,r3 #r12 = h-th
+ $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
+ # want to compute
+ # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
+ # the following 2 instructions will do this.
+ $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
+ $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
+ bdz Lppcasm_div9 #if (count==0) break ;
+ $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
+ b Lppcasm_divouterloop
+Lppcasm_div9:
+ or r3,r8,r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
+.size .bn_div_words,.-.bn_div_words
+
+#
+# NOTE: The following label name should be changed to
+# "bn_sqr_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+.align 4
+.bn_sqr_words:
+#
+# Optimized version of bn_sqr_words
+#
+# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
+#
+# r3 = r
+# r4 = a
+# r5 = n
+#
+# r6 = a[i].
+# r7,r8 = product.
+#
+# No unrolling done here. Not performance critical.
+
+ addic. r5,r5,0 #test r5.
+ beq Lppcasm_sqr_adios
+ addi r4,r4,-$BNSZ
+ addi r3,r3,-$BNSZ
+ mtctr r5
+Lppcasm_sqr_mainloop:
+ #sqr(r[0],r[1],a[0]);
+ $LDU r6,$BNSZ(r4)
+ $UMULL r7,r6,r6
+ $UMULH r8,r6,r6
+ $STU r7,$BNSZ(r3)
+ $STU r8,$BNSZ(r3)
+ bdnz Lppcasm_sqr_mainloop
+Lppcasm_sqr_adios:
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
+.size .bn_sqr_words,.-.bn_sqr_words
+
+#
+# NOTE: The following label name should be changed to
+# "bn_mul_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_mul_words:
+#
+# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+#
+# r3 = rp
+# r4 = ap
+# r5 = num
+# r6 = w
+ xor r0,r0,r0
+ xor r12,r12,r12 # used for carry
+ rlwinm. r7,r5,30,2,31 # num >> 2
+ beq Lppcasm_mw_REM
+ mtctr r7
+Lppcasm_mw_LOOP:
+ #mul(rp[0],ap[0],w,c1);
+ $LD r8,`0*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ addc r9,r9,r12
+ #addze r10,r10 #carry is NOT ignored.
+ #will be taken care of
+ #in second spin below
+ #using adde.
+ $ST r9,`0*$BNSZ`(r3)
+ #mul(rp[1],ap[1],w,c1);
+ $LD r8,`1*$BNSZ`(r4)
+ $UMULL r11,r6,r8
+ $UMULH r12,r6,r8
+ adde r11,r11,r10
+ #addze r12,r12
+ $ST r11,`1*$BNSZ`(r3)
+ #mul(rp[2],ap[2],w,c1);
+ $LD r8,`2*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ adde r9,r9,r12
+ #addze r10,r10
+ $ST r9,`2*$BNSZ`(r3)
+ #mul_add(rp[3],ap[3],w,c1);
+ $LD r8,`3*$BNSZ`(r4)
+ $UMULL r11,r6,r8
+ $UMULH r12,r6,r8
+ adde r11,r11,r10
+ addze r12,r12 #this spin we collect carry into
+ #r12
+ $ST r11,`3*$BNSZ`(r3)
+
+ addi r3,r3,`4*$BNSZ`
+ addi r4,r4,`4*$BNSZ`
+ bdnz Lppcasm_mw_LOOP
+
+Lppcasm_mw_REM:
+ andi. r5,r5,0x3
+ beq Lppcasm_mw_OVER
+ #mul(rp[0],ap[0],w,c1);
+ $LD r8,`0*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ addc r9,r9,r12
+ addze r10,r10
+ $ST r9,`0*$BNSZ`(r3)
+ addi r12,r10,0
+
+ addi r5,r5,-1
+ cmpli 0,0,r5,0
+ beq Lppcasm_mw_OVER
+
+
+ #mul(rp[1],ap[1],w,c1);
+ $LD r8,`1*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ addc r9,r9,r12
+ addze r10,r10
+ $ST r9,`1*$BNSZ`(r3)
+ addi r12,r10,0
+
+ addi r5,r5,-1
+ cmpli 0,0,r5,0
+ beq Lppcasm_mw_OVER
+
+ #mul_add(rp[2],ap[2],w,c1);
+ $LD r8,`2*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ addc r9,r9,r12
+ addze r10,r10
+ $ST r9,`2*$BNSZ`(r3)
+ addi r12,r10,0
+
+Lppcasm_mw_OVER:
+ addi r3,r12,0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
+.size bn_mul_words,.-bn_mul_words
+
+#
+# NOTE: The following label name should be changed to
+# "bn_mul_add_words" i.e. remove the first dot
+# for the gcc compiler. This should be automatically
+# done in the build
+#
+
+.align 4
+.bn_mul_add_words:
+#
+# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+#
+# r3 = rp
+# r4 = ap
+# r5 = num
+# r6 = w
+#
+# empirical evidence suggests that unrolled version performs best!!
+#
+ xor r0,r0,r0 #r0 = 0
+ xor r12,r12,r12 #r12 = 0 . used for carry
+ rlwinm. r7,r5,30,2,31 # num >> 2
+ beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
+ mtctr r7
+Lppcasm_maw_mainloop:
+ #mul_add(rp[0],ap[0],w,c1);
+ $LD r8,`0*$BNSZ`(r4)
+ $LD r11,`0*$BNSZ`(r3)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ addc r9,r9,r12 #r12 is carry.
+ addze r10,r10
+ addc r9,r9,r11
+ #addze r10,r10
+ #the above instruction addze
+ #is NOT needed. Carry will NOT
+ #be ignored. It's not affected
+ #by multiply and will be collected
+ #in the next spin
+ $ST r9,`0*$BNSZ`(r3)
+
+ #mul_add(rp[1],ap[1],w,c1);
+ $LD r8,`1*$BNSZ`(r4)
+ $LD r9,`1*$BNSZ`(r3)
+ $UMULL r11,r6,r8
+ $UMULH r12,r6,r8
+ adde r11,r11,r10 #r10 is carry.
+ addze r12,r12
+ addc r11,r11,r9
+ #addze r12,r12
+ $ST r11,`1*$BNSZ`(r3)
+
+ #mul_add(rp[2],ap[2],w,c1);
+ $LD r8,`2*$BNSZ`(r4)
+ $UMULL r9,r6,r8
+ $LD r11,`2*$BNSZ`(r3)
+ $UMULH r10,r6,r8
+ adde r9,r9,r12
+ addze r10,r10
+ addc r9,r9,r11
+ #addze r10,r10
+ $ST r9,`2*$BNSZ`(r3)
+
+ #mul_add(rp[3],ap[3],w,c1);
+ $LD r8,`3*$BNSZ`(r4)
+ $UMULL r11,r6,r8
+ $LD r9,`3*$BNSZ`(r3)
+ $UMULH r12,r6,r8
+ adde r11,r11,r10
+ addze r12,r12
+ addc r11,r11,r9
+ addze r12,r12
+ $ST r11,`3*$BNSZ`(r3)
+ addi r3,r3,`4*$BNSZ`
+ addi r4,r4,`4*$BNSZ`
+ bdnz Lppcasm_maw_mainloop
+
+Lppcasm_maw_leftover:
+ andi. r5,r5,0x3
+ beq Lppcasm_maw_adios
+ addi r3,r3,-$BNSZ
+ addi r4,r4,-$BNSZ
+ #mul_add(rp[0],ap[0],w,c1);
+ mtctr r5
+ $LDU r8,$BNSZ(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ $LDU r11,$BNSZ(r3)
+ addc r9,r9,r11
+ addze r10,r10
+ addc r9,r9,r12
+ addze r12,r10
+ $ST r9,0(r3)
+
+ bdz Lppcasm_maw_adios
+ #mul_add(rp[1],ap[1],w,c1);
+ $LDU r8,$BNSZ(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ $LDU r11,$BNSZ(r3)
+ addc r9,r9,r11
+ addze r10,r10
+ addc r9,r9,r12
+ addze r12,r10
+ $ST r9,0(r3)
+
+ bdz Lppcasm_maw_adios
+ #mul_add(rp[2],ap[2],w,c1);
+ $LDU r8,$BNSZ(r4)
+ $UMULL r9,r6,r8
+ $UMULH r10,r6,r8
+ $LDU r11,$BNSZ(r3)
+ addc r9,r9,r11
+ addze r10,r10
+ addc r9,r9,r12
+ addze r12,r10
+ $ST r9,0(r3)
+
+Lppcasm_maw_adios:
+ addi r3,r12,0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
+.size .bn_mul_add_words,.-.bn_mul_add_words
+ .align 4
+EOF
+$data =~ s/\`([^\`]*)\`/eval $1/gem;
+print $data;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/ppc64-mont.pl b/openssl-1.1.0h/crypto/bn/asm/ppc64-mont.pl
new file mode 100644
index 0000000..1e19c95
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/ppc64-mont.pl
@@ -0,0 +1,1635 @@
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# December 2007
+
+# The reason for undertaken effort is basically following. Even though
+# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
+# performance was observed to be less than impressive, essentially as
+# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
+# Well, it's not surprising that IBM had to make some sacrifices to
+# boost the clock frequency that much, but no overall improvement?
+# Having observed how much difference did switching to FPU make on
+# UltraSPARC, playing same stunt on Power 6 appeared appropriate...
+# Unfortunately the resulting performance improvement is not as
+# impressive, ~30%, and in absolute terms is still very far from what
+# one would expect from 4.7GHz CPU. There is a chance that I'm doing
+# something wrong, but in the lack of assembler level micro-profiling
+# data or at least decent platform guide I can't tell... Or better
+# results might be achieved with VMX... Anyway, this module provides
+# *worse* performance on other PowerPC implementations, ~40-15% slower
+# on PPC970 depending on key length and ~40% slower on Power 5 for all
+# key lengths. As it's obviously inappropriate as "best all-round"
+# alternative, it has to be complemented with run-time CPU family
+# detection. Oh! It should also be noted that unlike other PowerPC
+# implementation IALU ppc-mont.pl module performs *suboptimaly* on
+# >=1024-bit key lengths on Power 6. It should also be noted that
+# *everything* said so far applies to 64-bit builds! As far as 32-bit
+# application executed on 64-bit CPU goes, this module is likely to
+# become preferred choice, because it's easy to adapt it for such
+# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
+
+# February 2008
+
+# Micro-profiling assisted optimization results in ~15% improvement
+# over original ppc64-mont.pl version, or overall ~50% improvement
+# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
+# Power 6 CPU, this module is 5-150% faster depending on key length,
+# [hereafter] more for longer keys. But if compared to ppc-mont.pl
+# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
+# in absolute terms, but it's apparently the way Power 6 is...
+
+# December 2009
+
+# Adapted for 32-bit build this module delivers 25-120%, yes, more
+# than *twice* for longer keys, performance improvement over 32-bit
+# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
+# even 64-bit integer operations and the trouble is that most PPC
+# operating systems don't preserve upper halves of general purpose
+# registers upon 32-bit signal delivery. They do preserve them upon
+# context switch, but not signalling:-( This means that asynchronous
+# signals have to be blocked upon entry to this subroutine. Signal
+# masking (and of course complementary unmasking) has quite an impact
+# on performance, naturally larger for shorter keys. It's so severe
+# that 512-bit key performance can be as low as 1/3 of expected one.
+# This is why this routine can be engaged for longer key operations
+# only on these OSes, see crypto/ppccap.c for further details. MacOS X
+# is an exception from this and doesn't require signal masking, and
+# that's where above improvement coefficients were collected. For
+# others alternative would be to break dependence on upper halves of
+# GPRs by sticking to 32-bit integer operations...
+
+# December 2012
+
+# Remove above mentioned dependence on GPRs' upper halves in 32-bit
+# build. No signal masking overhead, but integer instructions are
+# *more* numerous... It's still "universally" faster than 32-bit
+# ppc-mont.pl, but improvement coefficient is not as impressive
+# for longer keys...
+
+$flavour = shift;
+
+if ($flavour =~ /32/) {
+ $SIZE_T=4;
+ $RZONE= 224;
+ $fname= "bn_mul_mont_fpu64";
+
+ $STUX= "stwux"; # store indexed and update
+ $PUSH= "stw";
+ $POP= "lwz";
+} elsif ($flavour =~ /64/) {
+ $SIZE_T=8;
+ $RZONE= 288;
+ $fname= "bn_mul_mont_fpu64";
+
+ # same as above, but 64-bit mnemonics...
+ $STUX= "stdux"; # store indexed and update
+ $PUSH= "std";
+ $POP= "ld";
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=64; # padded frame header
+$TRANSFER=16*8;
+
+$carry="r0";
+$sp="r1";
+$toc="r2";
+$rp="r3"; $ovf="r3";
+$ap="r4";
+$bp="r5";
+$np="r6";
+$n0="r7";
+$num="r8";
+$rp="r9"; # $rp is reassigned
+$tp="r10";
+$j="r11";
+$i="r12";
+# non-volatile registers
+$c1="r19";
+$n1="r20";
+$a1="r21";
+$nap_d="r22"; # interleaved ap and np in double format
+$a0="r23"; # ap[0]
+$t0="r24"; # temporary registers
+$t1="r25";
+$t2="r26";
+$t3="r27";
+$t4="r28";
+$t5="r29";
+$t6="r30";
+$t7="r31";
+
+# PPC offers enough register bank capacity to unroll inner loops twice
+#
+# ..A3A2A1A0
+# dcba
+# -----------
+# A0a
+# A0b
+# A0c
+# A0d
+# A1a
+# A1b
+# A1c
+# A1d
+# A2a
+# A2b
+# A2c
+# A2d
+# A3a
+# A3b
+# A3c
+# A3d
+# ..a
+# ..b
+#
+$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
+$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
+$dota="f8"; $dotb="f9";
+$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
+$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
+$T0a="f24"; $T0b="f25";
+$T1a="f26"; $T1b="f27";
+$T2a="f28"; $T2b="f29";
+$T3a="f30"; $T3b="f31";
+
+# sp----------->+-------------------------------+
+# | saved sp |
+# +-------------------------------+
+# . .
+# +64 +-------------------------------+
+# | 16 gpr<->fpr transfer zone |
+# . .
+# . .
+# +16*8 +-------------------------------+
+# | __int64 tmp[-1] |
+# +-------------------------------+
+# | __int64 tmp[num] |
+# . .
+# . .
+# . .
+# +(num+1)*8 +-------------------------------+
+# | padding to 64 byte boundary |
+# . .
+# +X +-------------------------------+
+# | double nap_d[4*num] |
+# . .
+# . .
+# . .
+# +-------------------------------+
+# . .
+# -13*size_t +-------------------------------+
+# | 13 saved gpr, r19-r31 |
+# . .
+# . .
+# -12*8 +-------------------------------+
+# | 12 saved fpr, f20-f31 |
+# . .
+# . .
+# +-------------------------------+
+
+$code=<<___;
+.machine "any"
+.text
+
+.globl .$fname
+.align 5
+.$fname:
+ cmpwi $num,`3*8/$SIZE_T`
+ mr $rp,r3 ; $rp is reassigned
+ li r3,0 ; possible "not handled" return code
+ bltlr-
+ andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
+ bnelr-
+
+ slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
+ li $i,-4096
+ slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
+ add $tp,$tp,$num ; place for tp[num+1]
+ addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
+ subf $tp,$tp,$sp ; $sp-$tp
+ and $tp,$tp,$i ; minimize TLB usage
+ subf $tp,$sp,$tp ; $tp-$sp
+ mr $i,$sp
+ $STUX $sp,$sp,$tp ; alloca
+
+ $PUSH r19,`-12*8-13*$SIZE_T`($i)
+ $PUSH r20,`-12*8-12*$SIZE_T`($i)
+ $PUSH r21,`-12*8-11*$SIZE_T`($i)
+ $PUSH r22,`-12*8-10*$SIZE_T`($i)
+ $PUSH r23,`-12*8-9*$SIZE_T`($i)
+ $PUSH r24,`-12*8-8*$SIZE_T`($i)
+ $PUSH r25,`-12*8-7*$SIZE_T`($i)
+ $PUSH r26,`-12*8-6*$SIZE_T`($i)
+ $PUSH r27,`-12*8-5*$SIZE_T`($i)
+ $PUSH r28,`-12*8-4*$SIZE_T`($i)
+ $PUSH r29,`-12*8-3*$SIZE_T`($i)
+ $PUSH r30,`-12*8-2*$SIZE_T`($i)
+ $PUSH r31,`-12*8-1*$SIZE_T`($i)
+ stfd f20,`-12*8`($i)
+ stfd f21,`-11*8`($i)
+ stfd f22,`-10*8`($i)
+ stfd f23,`-9*8`($i)
+ stfd f24,`-8*8`($i)
+ stfd f25,`-7*8`($i)
+ stfd f26,`-6*8`($i)
+ stfd f27,`-5*8`($i)
+ stfd f28,`-4*8`($i)
+ stfd f29,`-3*8`($i)
+ stfd f30,`-2*8`($i)
+ stfd f31,`-1*8`($i)
+
+ addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
+ li $i,-64
+ add $nap_d,$tp,$num
+ and $nap_d,$nap_d,$i ; align to 64 bytes
+ ; nap_d is off by 1, because it's used with stfdu/lfdu
+ addi $nap_d,$nap_d,-8
+ srwi $j,$num,`3+1` ; counter register, num/2
+ addi $j,$j,-1
+ addi $tp,$sp,`$FRAME+$TRANSFER-8`
+ li $carry,0
+ mtctr $j
+___
+
+$code.=<<___ if ($SIZE_T==8);
+ ld $a0,0($ap) ; pull ap[0] value
+ ld $t3,0($bp) ; bp[0]
+ ld $n0,0($n0) ; pull n0[0] value
+
+ mulld $t7,$a0,$t3 ; ap[0]*bp[0]
+ ; transfer bp[0] to FPU as 4x16-bit values
+ extrdi $t0,$t3,16,48
+ extrdi $t1,$t3,16,32
+ extrdi $t2,$t3,16,16
+ extrdi $t3,$t3,16,0
+ std $t0,`$FRAME+0`($sp)
+ std $t1,`$FRAME+8`($sp)
+ std $t2,`$FRAME+16`($sp)
+ std $t3,`$FRAME+24`($sp)
+
+ mulld $t7,$t7,$n0 ; tp[0]*n0
+ ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
+ extrdi $t4,$t7,16,48
+ extrdi $t5,$t7,16,32
+ extrdi $t6,$t7,16,16
+ extrdi $t7,$t7,16,0
+ std $t4,`$FRAME+32`($sp)
+ std $t5,`$FRAME+40`($sp)
+ std $t6,`$FRAME+48`($sp)
+ std $t7,`$FRAME+56`($sp)
+
+ extrdi $t0,$a0,32,32 ; lwz $t0,4($ap)
+ extrdi $t1,$a0,32,0 ; lwz $t1,0($ap)
+ lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[1] as 32-bit word pair
+ lwz $t3,`8^$LITTLE_ENDIAN`($ap)
+ lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[0] as 32-bit word pair
+ lwz $t5,`0^$LITTLE_ENDIAN`($np)
+ lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[1] as 32-bit word pair
+ lwz $t7,`8^$LITTLE_ENDIAN`($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $a0,0($ap) ; pull ap[0,1] value
+ mr $n1,$n0
+ lwz $a1,4($ap)
+ li $c1,0
+ lwz $t1,0($bp) ; bp[0,1]
+ lwz $t3,4($bp)
+ lwz $n0,0($n1) ; pull n0[0,1] value
+ lwz $n1,4($n1)
+
+ mullw $t4,$a0,$t1 ; mulld ap[0]*bp[0]
+ mulhwu $t5,$a0,$t1
+ mullw $t6,$a1,$t1
+ mullw $t7,$a0,$t3
+ add $t5,$t5,$t6
+ add $t5,$t5,$t7
+ ; transfer bp[0] to FPU as 4x16-bit values
+ extrwi $t0,$t1,16,16
+ extrwi $t1,$t1,16,0
+ extrwi $t2,$t3,16,16
+ extrwi $t3,$t3,16,0
+ std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
+ std $t1,`$FRAME+8`($sp)
+ std $t2,`$FRAME+16`($sp)
+ std $t3,`$FRAME+24`($sp)
+
+ mullw $t0,$t4,$n0 ; mulld tp[0]*n0
+ mulhwu $t1,$t4,$n0
+ mullw $t2,$t5,$n0
+ mullw $t3,$t4,$n1
+ add $t1,$t1,$t2
+ add $t1,$t1,$t3
+ ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
+ extrwi $t4,$t0,16,16
+ extrwi $t5,$t0,16,0
+ extrwi $t6,$t1,16,16
+ extrwi $t7,$t1,16,0
+ std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
+ std $t5,`$FRAME+40`($sp)
+ std $t6,`$FRAME+48`($sp)
+ std $t7,`$FRAME+56`($sp)
+
+ mr $t0,$a0 ; lwz $t0,0($ap)
+ mr $t1,$a1 ; lwz $t1,4($ap)
+ lwz $t2,8($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
+ lfd $ba,`$FRAME+0`($sp)
+ lfd $bb,`$FRAME+8`($sp)
+ lfd $bc,`$FRAME+16`($sp)
+ lfd $bd,`$FRAME+24`($sp)
+ lfd $na,`$FRAME+32`($sp)
+ lfd $nb,`$FRAME+40`($sp)
+ lfd $nc,`$FRAME+48`($sp)
+ lfd $nd,`$FRAME+56`($sp)
+ std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
+ std $t1,`$FRAME+72`($sp)
+ std $t2,`$FRAME+80`($sp)
+ std $t3,`$FRAME+88`($sp)
+ std $t4,`$FRAME+96`($sp)
+ std $t5,`$FRAME+104`($sp)
+ std $t6,`$FRAME+112`($sp)
+ std $t7,`$FRAME+120`($sp)
+ fcfid $ba,$ba
+ fcfid $bb,$bb
+ fcfid $bc,$bc
+ fcfid $bd,$bd
+ fcfid $na,$na
+ fcfid $nb,$nb
+ fcfid $nc,$nc
+ fcfid $nd,$nd
+
+ lfd $A0,`$FRAME+64`($sp)
+ lfd $A1,`$FRAME+72`($sp)
+ lfd $A2,`$FRAME+80`($sp)
+ lfd $A3,`$FRAME+88`($sp)
+ lfd $N0,`$FRAME+96`($sp)
+ lfd $N1,`$FRAME+104`($sp)
+ lfd $N2,`$FRAME+112`($sp)
+ lfd $N3,`$FRAME+120`($sp)
+ fcfid $A0,$A0
+ fcfid $A1,$A1
+ fcfid $A2,$A2
+ fcfid $A3,$A3
+ fcfid $N0,$N0
+ fcfid $N1,$N1
+ fcfid $N2,$N2
+ fcfid $N3,$N3
+ addi $ap,$ap,16
+ addi $np,$np,16
+
+ fmul $T1a,$A1,$ba
+ fmul $T1b,$A1,$bb
+ stfd $A0,8($nap_d) ; save a[j] in double format
+ stfd $A1,16($nap_d)
+ fmul $T2a,$A2,$ba
+ fmul $T2b,$A2,$bb
+ stfd $A2,24($nap_d) ; save a[j+1] in double format
+ stfd $A3,32($nap_d)
+ fmul $T3a,$A3,$ba
+ fmul $T3b,$A3,$bb
+ stfd $N0,40($nap_d) ; save n[j] in double format
+ stfd $N1,48($nap_d)
+ fmul $T0a,$A0,$ba
+ fmul $T0b,$A0,$bb
+ stfd $N2,56($nap_d) ; save n[j+1] in double format
+ stfdu $N3,64($nap_d)
+
+ fmadd $T1a,$A0,$bc,$T1a
+ fmadd $T1b,$A0,$bd,$T1b
+ fmadd $T2a,$A1,$bc,$T2a
+ fmadd $T2b,$A1,$bd,$T2b
+ fmadd $T3a,$A2,$bc,$T3a
+ fmadd $T3b,$A2,$bd,$T3b
+ fmul $dota,$A3,$bc
+ fmul $dotb,$A3,$bd
+
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+
+ fctid $T0a,$T0a
+ fctid $T0b,$T0b
+ fctid $T1a,$T1a
+ fctid $T1b,$T1b
+ fctid $T2a,$T2a
+ fctid $T2b,$T2b
+ fctid $T3a,$T3a
+ fctid $T3b,$T3b
+
+ stfd $T0a,`$FRAME+0`($sp)
+ stfd $T0b,`$FRAME+8`($sp)
+ stfd $T1a,`$FRAME+16`($sp)
+ stfd $T1b,`$FRAME+24`($sp)
+ stfd $T2a,`$FRAME+32`($sp)
+ stfd $T2b,`$FRAME+40`($sp)
+ stfd $T3a,`$FRAME+48`($sp)
+ stfd $T3b,`$FRAME+56`($sp)
+
+.align 5
+L1st:
+___
+$code.=<<___ if ($SIZE_T==8);
+ lwz $t0,`4^$LITTLE_ENDIAN`($ap) ; load a[j] as 32-bit word pair
+ lwz $t1,`0^$LITTLE_ENDIAN`($ap)
+ lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[j+1] as 32-bit word pair
+ lwz $t3,`8^$LITTLE_ENDIAN`($ap)
+ lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[j] as 32-bit word pair
+ lwz $t5,`0^$LITTLE_ENDIAN`($np)
+ lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[j+1] as 32-bit word pair
+ lwz $t7,`8^$LITTLE_ENDIAN`($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t1,4($ap)
+ lwz $t2,8($ap)
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
+ std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
+ std $t1,`$FRAME+72`($sp)
+ std $t2,`$FRAME+80`($sp)
+ std $t3,`$FRAME+88`($sp)
+ std $t4,`$FRAME+96`($sp)
+ std $t5,`$FRAME+104`($sp)
+ std $t6,`$FRAME+112`($sp)
+ std $t7,`$FRAME+120`($sp)
+___
+if ($SIZE_T==8 or $flavour =~ /osx/) {
+$code.=<<___;
+ ld $t0,`$FRAME+0`($sp)
+ ld $t1,`$FRAME+8`($sp)
+ ld $t2,`$FRAME+16`($sp)
+ ld $t3,`$FRAME+24`($sp)
+ ld $t4,`$FRAME+32`($sp)
+ ld $t5,`$FRAME+40`($sp)
+ ld $t6,`$FRAME+48`($sp)
+ ld $t7,`$FRAME+56`($sp)
+___
+} else {
+$code.=<<___;
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
+___
+}
+$code.=<<___;
+ lfd $A0,`$FRAME+64`($sp)
+ lfd $A1,`$FRAME+72`($sp)
+ lfd $A2,`$FRAME+80`($sp)
+ lfd $A3,`$FRAME+88`($sp)
+ lfd $N0,`$FRAME+96`($sp)
+ lfd $N1,`$FRAME+104`($sp)
+ lfd $N2,`$FRAME+112`($sp)
+ lfd $N3,`$FRAME+120`($sp)
+ fcfid $A0,$A0
+ fcfid $A1,$A1
+ fcfid $A2,$A2
+ fcfid $A3,$A3
+ fcfid $N0,$N0
+ fcfid $N1,$N1
+ fcfid $N2,$N2
+ fcfid $N3,$N3
+ addi $ap,$ap,16
+ addi $np,$np,16
+
+ fmul $T1a,$A1,$ba
+ fmul $T1b,$A1,$bb
+ fmul $T2a,$A2,$ba
+ fmul $T2b,$A2,$bb
+ stfd $A0,8($nap_d) ; save a[j] in double format
+ stfd $A1,16($nap_d)
+ fmul $T3a,$A3,$ba
+ fmul $T3b,$A3,$bb
+ fmadd $T0a,$A0,$ba,$dota
+ fmadd $T0b,$A0,$bb,$dotb
+ stfd $A2,24($nap_d) ; save a[j+1] in double format
+ stfd $A3,32($nap_d)
+___
+if ($SIZE_T==8 or $flavour =~ /osx/) {
+$code.=<<___;
+ fmadd $T1a,$A0,$bc,$T1a
+ fmadd $T1b,$A0,$bd,$T1b
+ fmadd $T2a,$A1,$bc,$T2a
+ fmadd $T2b,$A1,$bd,$T2b
+ stfd $N0,40($nap_d) ; save n[j] in double format
+ stfd $N1,48($nap_d)
+ fmadd $T3a,$A2,$bc,$T3a
+ fmadd $T3b,$A2,$bd,$T3b
+ add $t0,$t0,$carry ; can not overflow
+ fmul $dota,$A3,$bc
+ fmul $dotb,$A3,$bd
+ stfd $N2,56($nap_d) ; save n[j+1] in double format
+ stfdu $N3,64($nap_d)
+ srdi $carry,$t0,16
+ add $t1,$t1,$carry
+ srdi $carry,$t1,16
+
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ insrdi $t0,$t1,16,32
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ add $t2,$t2,$carry
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ srdi $carry,$t2,16
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+ insrdi $t0,$t2,16,16
+ add $t3,$t3,$carry
+ srdi $carry,$t3,16
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ insrdi $t0,$t3,16,0 ; 0..63 bits
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ add $t4,$t4,$carry
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ srdi $carry,$t4,16
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+ add $t5,$t5,$carry
+ srdi $carry,$t5,16
+ insrdi $t4,$t5,16,32
+
+ fctid $T0a,$T0a
+ fctid $T0b,$T0b
+ add $t6,$t6,$carry
+ fctid $T1a,$T1a
+ fctid $T1b,$T1b
+ srdi $carry,$t6,16
+ fctid $T2a,$T2a
+ fctid $T2b,$T2b
+ insrdi $t4,$t6,16,16
+ fctid $T3a,$T3a
+ fctid $T3b,$T3b
+ add $t7,$t7,$carry
+ insrdi $t4,$t7,16,0 ; 64..127 bits
+ srdi $carry,$t7,16 ; upper 33 bits
+
+ stfd $T0a,`$FRAME+0`($sp)
+ stfd $T0b,`$FRAME+8`($sp)
+ stfd $T1a,`$FRAME+16`($sp)
+ stfd $T1b,`$FRAME+24`($sp)
+ stfd $T2a,`$FRAME+32`($sp)
+ stfd $T2b,`$FRAME+40`($sp)
+ stfd $T3a,`$FRAME+48`($sp)
+ stfd $T3b,`$FRAME+56`($sp)
+ std $t0,8($tp) ; tp[j-1]
+ stdu $t4,16($tp) ; tp[j]
+___
+} else {
+$code.=<<___;
+ fmadd $T1a,$A0,$bc,$T1a
+ fmadd $T1b,$A0,$bd,$T1b
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ fmadd $T2a,$A1,$bc,$T2a
+ fmadd $T2b,$A1,$bd,$T2b
+ stfd $N0,40($nap_d) ; save n[j] in double format
+ stfd $N1,48($nap_d)
+ srwi $c1,$t1,16
+ insrwi $carry,$t1,16,0
+ fmadd $T3a,$A2,$bc,$T3a
+ fmadd $T3b,$A2,$bd,$T3b
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ fmul $dota,$A3,$bc
+ fmul $dotb,$A3,$bd
+ stfd $N2,56($nap_d) ; save n[j+1] in double format
+ stfdu $N3,64($nap_d)
+ insrwi $t0,$t2,16,0 ; 0..31 bits
+ srwi $c1,$t3,16
+ insrwi $carry,$t3,16,0
+
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ srwi $c1,$t5,16
+ insrwi $carry,$t5,16,0
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+ insrwi $t4,$t6,16,0 ; 32..63 bits
+ srwi $c1,$t7,16
+ insrwi $carry,$t7,16,0
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ stw $t0,12($tp) ; tp[j-1]
+ stw $t4,8($tp)
+ srwi $c1,$t3,16
+ insrwi $carry,$t3,16,0
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+ insrwi $t2,$t6,16,0 ; 64..95 bits
+ srwi $c1,$t7,16
+ insrwi $carry,$t7,16,0
+
+ fctid $T0a,$T0a
+ fctid $T0b,$T0b
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ fctid $T1a,$T1a
+ fctid $T1b,$T1b
+ srwi $c1,$t1,16
+ insrwi $carry,$t1,16,0
+ fctid $T2a,$T2a
+ fctid $T2b,$T2b
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ fctid $T3a,$T3a
+ fctid $T3b,$T3b
+ insrwi $t0,$t4,16,0 ; 96..127 bits
+ srwi $c1,$t5,16
+ insrwi $carry,$t5,16,0
+
+ stfd $T0a,`$FRAME+0`($sp)
+ stfd $T0b,`$FRAME+8`($sp)
+ stfd $T1a,`$FRAME+16`($sp)
+ stfd $T1b,`$FRAME+24`($sp)
+ stfd $T2a,`$FRAME+32`($sp)
+ stfd $T2b,`$FRAME+40`($sp)
+ stfd $T3a,`$FRAME+48`($sp)
+ stfd $T3b,`$FRAME+56`($sp)
+ stw $t2,20($tp) ; tp[j]
+ stwu $t0,16($tp)
+___
+}
+$code.=<<___;
+ bdnz L1st
+
+ fctid $dota,$dota
+ fctid $dotb,$dotb
+___
+if ($SIZE_T==8 or $flavour =~ /osx/) {
+$code.=<<___;
+ ld $t0,`$FRAME+0`($sp)
+ ld $t1,`$FRAME+8`($sp)
+ ld $t2,`$FRAME+16`($sp)
+ ld $t3,`$FRAME+24`($sp)
+ ld $t4,`$FRAME+32`($sp)
+ ld $t5,`$FRAME+40`($sp)
+ ld $t6,`$FRAME+48`($sp)
+ ld $t7,`$FRAME+56`($sp)
+ stfd $dota,`$FRAME+64`($sp)
+ stfd $dotb,`$FRAME+72`($sp)
+
+ add $t0,$t0,$carry ; can not overflow
+ srdi $carry,$t0,16
+ add $t1,$t1,$carry
+ srdi $carry,$t1,16
+ insrdi $t0,$t1,16,32
+ add $t2,$t2,$carry
+ srdi $carry,$t2,16
+ insrdi $t0,$t2,16,16
+ add $t3,$t3,$carry
+ srdi $carry,$t3,16
+ insrdi $t0,$t3,16,0 ; 0..63 bits
+ add $t4,$t4,$carry
+ srdi $carry,$t4,16
+ add $t5,$t5,$carry
+ srdi $carry,$t5,16
+ insrdi $t4,$t5,16,32
+ add $t6,$t6,$carry
+ srdi $carry,$t6,16
+ insrdi $t4,$t6,16,16
+ add $t7,$t7,$carry
+ insrdi $t4,$t7,16,0 ; 64..127 bits
+ srdi $carry,$t7,16 ; upper 33 bits
+ ld $t6,`$FRAME+64`($sp)
+ ld $t7,`$FRAME+72`($sp)
+
+ std $t0,8($tp) ; tp[j-1]
+ stdu $t4,16($tp) ; tp[j]
+
+ add $t6,$t6,$carry ; can not overflow
+ srdi $carry,$t6,16
+ add $t7,$t7,$carry
+ insrdi $t6,$t7,48,0
+ srdi $ovf,$t7,48
+ std $t6,8($tp) ; tp[num-1]
+___
+} else {
+$code.=<<___;
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
+ stfd $dota,`$FRAME+64`($sp)
+ stfd $dotb,`$FRAME+72`($sp)
+
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ insrwi $carry,$t1,16,0
+ srwi $c1,$t1,16
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ insrwi $t0,$t2,16,0 ; 0..31 bits
+ insrwi $carry,$t3,16,0
+ srwi $c1,$t3,16
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ insrwi $carry,$t5,16,0
+ srwi $c1,$t5,16
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ insrwi $t4,$t6,16,0 ; 32..63 bits
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ stw $t0,12($tp) ; tp[j-1]
+ stw $t4,8($tp)
+
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
+
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ insrwi $carry,$t3,16,0
+ srwi $c1,$t3,16
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ insrwi $t2,$t6,16,0 ; 64..95 bits
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ insrwi $carry,$t1,16,0
+ srwi $c1,$t1,16
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ insrwi $t0,$t4,16,0 ; 96..127 bits
+ insrwi $carry,$t5,16,0
+ srwi $c1,$t5,16
+ stw $t2,20($tp) ; tp[j]
+ stwu $t0,16($tp)
+
+ lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
+
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+
+ insrwi $t6,$t4,16,0
+ srwi $t4,$t4,16
+ insrwi $t4,$t5,16,0
+ srwi $ovf,$t5,16
+ stw $t6,12($tp) ; tp[num-1]
+ stw $t4,8($tp)
+___
+}
+$code.=<<___;
+ slwi $t7,$num,2
+ subf $nap_d,$t7,$nap_d ; rewind pointer
+
+ li $i,8 ; i=1
+.align 5
+Louter:
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ li $carry,0
+ mtctr $j
+___
+$code.=<<___ if ($SIZE_T==8);
+ ldx $t3,$bp,$i ; bp[i]
+
+ ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
+ mulld $t7,$a0,$t3 ; ap[0]*bp[i]
+ add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
+ ; transfer bp[i] to FPU as 4x16-bit values
+ extrdi $t0,$t3,16,48
+ extrdi $t1,$t3,16,32
+ extrdi $t2,$t3,16,16
+ extrdi $t3,$t3,16,0
+ std $t0,`$FRAME+0`($sp)
+ std $t1,`$FRAME+8`($sp)
+ std $t2,`$FRAME+16`($sp)
+ std $t3,`$FRAME+24`($sp)
+
+ mulld $t7,$t7,$n0 ; tp[0]*n0
+ ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
+ extrdi $t4,$t7,16,48
+ extrdi $t5,$t7,16,32
+ extrdi $t6,$t7,16,16
+ extrdi $t7,$t7,16,0
+ std $t4,`$FRAME+32`($sp)
+ std $t5,`$FRAME+40`($sp)
+ std $t6,`$FRAME+48`($sp)
+ std $t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+ add $t0,$bp,$i
+ li $c1,0
+ lwz $t1,0($t0) ; bp[i,i+1]
+ lwz $t3,4($t0)
+
+ mullw $t4,$a0,$t1 ; ap[0]*bp[i]
+ lwz $t0,`$FRAME+$TRANSFER+8+4`($sp) ; tp[0]
+ mulhwu $t5,$a0,$t1
+ lwz $t2,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
+ mullw $t6,$a1,$t1
+ mullw $t7,$a0,$t3
+ add $t5,$t5,$t6
+ add $t5,$t5,$t7
+ addc $t4,$t4,$t0 ; ap[0]*bp[i]+tp[0]
+ adde $t5,$t5,$t2
+ ; transfer bp[i] to FPU as 4x16-bit values
+ extrwi $t0,$t1,16,16
+ extrwi $t1,$t1,16,0
+ extrwi $t2,$t3,16,16
+ extrwi $t3,$t3,16,0
+ std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
+ std $t1,`$FRAME+8`($sp)
+ std $t2,`$FRAME+16`($sp)
+ std $t3,`$FRAME+24`($sp)
+
+ mullw $t0,$t4,$n0 ; mulld tp[0]*n0
+ mulhwu $t1,$t4,$n0
+ mullw $t2,$t5,$n0
+ mullw $t3,$t4,$n1
+ add $t1,$t1,$t2
+ add $t1,$t1,$t3
+ ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
+ extrwi $t4,$t0,16,16
+ extrwi $t5,$t0,16,0
+ extrwi $t6,$t1,16,16
+ extrwi $t7,$t1,16,0
+ std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
+ std $t5,`$FRAME+40`($sp)
+ std $t6,`$FRAME+48`($sp)
+ std $t7,`$FRAME+56`($sp)
+___
+$code.=<<___;
+ lfd $A0,8($nap_d) ; load a[j] in double format
+ lfd $A1,16($nap_d)
+ lfd $A2,24($nap_d) ; load a[j+1] in double format
+ lfd $A3,32($nap_d)
+ lfd $N0,40($nap_d) ; load n[j] in double format
+ lfd $N1,48($nap_d)
+ lfd $N2,56($nap_d) ; load n[j+1] in double format
+ lfdu $N3,64($nap_d)
+
+ lfd $ba,`$FRAME+0`($sp)
+ lfd $bb,`$FRAME+8`($sp)
+ lfd $bc,`$FRAME+16`($sp)
+ lfd $bd,`$FRAME+24`($sp)
+ lfd $na,`$FRAME+32`($sp)
+ lfd $nb,`$FRAME+40`($sp)
+ lfd $nc,`$FRAME+48`($sp)
+ lfd $nd,`$FRAME+56`($sp)
+
+ fcfid $ba,$ba
+ fcfid $bb,$bb
+ fcfid $bc,$bc
+ fcfid $bd,$bd
+ fcfid $na,$na
+ fcfid $nb,$nb
+ fcfid $nc,$nc
+ fcfid $nd,$nd
+
+ fmul $T1a,$A1,$ba
+ fmul $T1b,$A1,$bb
+ fmul $T2a,$A2,$ba
+ fmul $T2b,$A2,$bb
+ fmul $T3a,$A3,$ba
+ fmul $T3b,$A3,$bb
+ fmul $T0a,$A0,$ba
+ fmul $T0b,$A0,$bb
+
+ fmadd $T1a,$A0,$bc,$T1a
+ fmadd $T1b,$A0,$bd,$T1b
+ fmadd $T2a,$A1,$bc,$T2a
+ fmadd $T2b,$A1,$bd,$T2b
+ fmadd $T3a,$A2,$bc,$T3a
+ fmadd $T3b,$A2,$bd,$T3b
+ fmul $dota,$A3,$bc
+ fmul $dotb,$A3,$bd
+
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ lfd $A0,8($nap_d) ; load a[j] in double format
+ lfd $A1,16($nap_d)
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ lfd $A2,24($nap_d) ; load a[j+1] in double format
+ lfd $A3,32($nap_d)
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+
+ fctid $T0a,$T0a
+ fctid $T0b,$T0b
+ fctid $T1a,$T1a
+ fctid $T1b,$T1b
+ fctid $T2a,$T2a
+ fctid $T2b,$T2b
+ fctid $T3a,$T3a
+ fctid $T3b,$T3b
+
+ stfd $T0a,`$FRAME+0`($sp)
+ stfd $T0b,`$FRAME+8`($sp)
+ stfd $T1a,`$FRAME+16`($sp)
+ stfd $T1b,`$FRAME+24`($sp)
+ stfd $T2a,`$FRAME+32`($sp)
+ stfd $T2b,`$FRAME+40`($sp)
+ stfd $T3a,`$FRAME+48`($sp)
+ stfd $T3b,`$FRAME+56`($sp)
+
+.align 5
+Linner:
+ fmul $T1a,$A1,$ba
+ fmul $T1b,$A1,$bb
+ fmul $T2a,$A2,$ba
+ fmul $T2b,$A2,$bb
+ lfd $N0,40($nap_d) ; load n[j] in double format
+ lfd $N1,48($nap_d)
+ fmul $T3a,$A3,$ba
+ fmul $T3b,$A3,$bb
+ fmadd $T0a,$A0,$ba,$dota
+ fmadd $T0b,$A0,$bb,$dotb
+ lfd $N2,56($nap_d) ; load n[j+1] in double format
+ lfdu $N3,64($nap_d)
+
+ fmadd $T1a,$A0,$bc,$T1a
+ fmadd $T1b,$A0,$bd,$T1b
+ fmadd $T2a,$A1,$bc,$T2a
+ fmadd $T2b,$A1,$bd,$T2b
+ lfd $A0,8($nap_d) ; load a[j] in double format
+ lfd $A1,16($nap_d)
+ fmadd $T3a,$A2,$bc,$T3a
+ fmadd $T3b,$A2,$bd,$T3b
+ fmul $dota,$A3,$bc
+ fmul $dotb,$A3,$bd
+ lfd $A2,24($nap_d) ; load a[j+1] in double format
+ lfd $A3,32($nap_d)
+___
+if ($SIZE_T==8 or $flavour =~ /osx/) {
+$code.=<<___;
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ ld $t0,`$FRAME+0`($sp)
+ ld $t1,`$FRAME+8`($sp)
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ ld $t2,`$FRAME+16`($sp)
+ ld $t3,`$FRAME+24`($sp)
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ add $t0,$t0,$carry ; can not overflow
+ ld $t4,`$FRAME+32`($sp)
+ ld $t5,`$FRAME+40`($sp)
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+ srdi $carry,$t0,16
+ add $t1,$t1,$carry
+ srdi $carry,$t1,16
+ ld $t6,`$FRAME+48`($sp)
+ ld $t7,`$FRAME+56`($sp)
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ insrdi $t0,$t1,16,32
+ ld $t1,8($tp) ; tp[j]
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ add $t2,$t2,$carry
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ srdi $carry,$t2,16
+ insrdi $t0,$t2,16,16
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+ add $t3,$t3,$carry
+ ldu $t2,16($tp) ; tp[j+1]
+ srdi $carry,$t3,16
+ insrdi $t0,$t3,16,0 ; 0..63 bits
+ add $t4,$t4,$carry
+
+ fctid $T0a,$T0a
+ fctid $T0b,$T0b
+ srdi $carry,$t4,16
+ fctid $T1a,$T1a
+ fctid $T1b,$T1b
+ add $t5,$t5,$carry
+ fctid $T2a,$T2a
+ fctid $T2b,$T2b
+ srdi $carry,$t5,16
+ insrdi $t4,$t5,16,32
+ fctid $T3a,$T3a
+ fctid $T3b,$T3b
+ add $t6,$t6,$carry
+ srdi $carry,$t6,16
+ insrdi $t4,$t6,16,16
+
+ stfd $T0a,`$FRAME+0`($sp)
+ stfd $T0b,`$FRAME+8`($sp)
+ add $t7,$t7,$carry
+ addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
+ stfd $T1a,`$FRAME+16`($sp)
+ stfd $T1b,`$FRAME+24`($sp)
+ insrdi $t4,$t7,16,0 ; 64..127 bits
+ srdi $carry,$t7,16 ; upper 33 bits
+ stfd $T2a,`$FRAME+32`($sp)
+ stfd $T2b,`$FRAME+40`($sp)
+ adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
+ stfd $T3a,`$FRAME+48`($sp)
+ stfd $T3b,`$FRAME+56`($sp)
+ addze $carry,$carry
+ std $t3,-16($tp) ; tp[j-1]
+ std $t5,-8($tp) ; tp[j]
+___
+} else {
+$code.=<<___;
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
+ srwi $c1,$t1,16
+ insrwi $carry,$t1,16,0
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ insrwi $t0,$t2,16,0 ; 0..31 bits
+ srwi $c1,$t3,16
+ insrwi $carry,$t3,16,0
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ lwz $t2,12($tp) ; tp[j]
+ lwz $t3,8($tp)
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+ srwi $c1,$t5,16
+ insrwi $carry,$t5,16,0
+
+ fctid $T0a,$T0a
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ fctid $T0b,$T0b
+ insrwi $t4,$t6,16,0 ; 32..63 bits
+ srwi $c1,$t7,16
+ insrwi $carry,$t7,16,0
+ fctid $T1a,$T1a
+ addc $t0,$t0,$t2
+ adde $t4,$t4,$t3
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
+ fctid $T1b,$T1b
+ addze $carry,$carry
+ addze $c1,$c1
+ stw $t0,4($tp) ; tp[j-1]
+ stw $t4,0($tp)
+ fctid $T2a,$T2a
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
+ fctid $T2b,$T2b
+ srwi $c1,$t3,16
+ insrwi $carry,$t3,16,0
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
+ fctid $T3a,$T3a
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
+ fctid $T3b,$T3b
+
+ insrwi $t2,$t6,16,0 ; 64..95 bits
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ lwz $t6,20($tp)
+ lwzu $t7,16($tp)
+ addc $t0,$t0,$carry
+ stfd $T0a,`$FRAME+0`($sp)
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ stfd $T0b,`$FRAME+8`($sp)
+ insrwi $carry,$t1,16,0
+ srwi $c1,$t1,16
+ addc $t4,$t4,$carry
+ stfd $T1a,`$FRAME+16`($sp)
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ insrwi $t0,$t4,16,0 ; 96..127 bits
+ stfd $T1b,`$FRAME+24`($sp)
+ insrwi $carry,$t5,16,0
+ srwi $c1,$t5,16
+
+ addc $t2,$t2,$t6
+ stfd $T2a,`$FRAME+32`($sp)
+ adde $t0,$t0,$t7
+ stfd $T2b,`$FRAME+40`($sp)
+ addze $carry,$carry
+ stfd $T3a,`$FRAME+48`($sp)
+ addze $c1,$c1
+ stfd $T3b,`$FRAME+56`($sp)
+ stw $t2,-4($tp) ; tp[j]
+ stw $t0,-8($tp)
+___
+}
+$code.=<<___;
+ bdnz Linner
+
+ fctid $dota,$dota
+ fctid $dotb,$dotb
+___
+if ($SIZE_T==8 or $flavour =~ /osx/) {
+$code.=<<___;
+ ld $t0,`$FRAME+0`($sp)
+ ld $t1,`$FRAME+8`($sp)
+ ld $t2,`$FRAME+16`($sp)
+ ld $t3,`$FRAME+24`($sp)
+ ld $t4,`$FRAME+32`($sp)
+ ld $t5,`$FRAME+40`($sp)
+ ld $t6,`$FRAME+48`($sp)
+ ld $t7,`$FRAME+56`($sp)
+ stfd $dota,`$FRAME+64`($sp)
+ stfd $dotb,`$FRAME+72`($sp)
+
+ add $t0,$t0,$carry ; can not overflow
+ srdi $carry,$t0,16
+ add $t1,$t1,$carry
+ srdi $carry,$t1,16
+ insrdi $t0,$t1,16,32
+ add $t2,$t2,$carry
+ ld $t1,8($tp) ; tp[j]
+ srdi $carry,$t2,16
+ insrdi $t0,$t2,16,16
+ add $t3,$t3,$carry
+ ldu $t2,16($tp) ; tp[j+1]
+ srdi $carry,$t3,16
+ insrdi $t0,$t3,16,0 ; 0..63 bits
+ add $t4,$t4,$carry
+ srdi $carry,$t4,16
+ add $t5,$t5,$carry
+ srdi $carry,$t5,16
+ insrdi $t4,$t5,16,32
+ add $t6,$t6,$carry
+ srdi $carry,$t6,16
+ insrdi $t4,$t6,16,16
+ add $t7,$t7,$carry
+ insrdi $t4,$t7,16,0 ; 64..127 bits
+ srdi $carry,$t7,16 ; upper 33 bits
+ ld $t6,`$FRAME+64`($sp)
+ ld $t7,`$FRAME+72`($sp)
+
+ addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
+ adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
+ addze $carry,$carry
+
+ std $t3,-16($tp) ; tp[j-1]
+ std $t5,-8($tp) ; tp[j]
+
+ add $carry,$carry,$ovf ; comsume upmost overflow
+ add $t6,$t6,$carry ; can not overflow
+ srdi $carry,$t6,16
+ add $t7,$t7,$carry
+ insrdi $t6,$t7,48,0
+ srdi $ovf,$t7,48
+ std $t6,0($tp) ; tp[num-1]
+___
+} else {
+$code.=<<___;
+ lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
+ lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
+ lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
+ lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
+ lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
+ lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
+ stfd $dota,`$FRAME+64`($sp)
+ stfd $dotb,`$FRAME+72`($sp)
+
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ insrwi $carry,$t1,16,0
+ srwi $c1,$t1,16
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ insrwi $t0,$t2,16,0 ; 0..31 bits
+ lwz $t2,12($tp) ; tp[j]
+ insrwi $carry,$t3,16,0
+ srwi $c1,$t3,16
+ lwz $t3,8($tp)
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ insrwi $carry,$t5,16,0
+ srwi $c1,$t5,16
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ insrwi $t4,$t6,16,0 ; 32..63 bits
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+
+ addc $t0,$t0,$t2
+ adde $t4,$t4,$t3
+ addze $carry,$carry
+ addze $c1,$c1
+ stw $t0,4($tp) ; tp[j-1]
+ stw $t4,0($tp)
+
+ lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
+ lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
+ lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
+ lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
+ lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
+ lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
+ lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
+ lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
+
+ addc $t2,$t2,$carry
+ adde $t3,$t3,$c1
+ srwi $carry,$t2,16
+ insrwi $carry,$t3,16,0
+ srwi $c1,$t3,16
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ srwi $carry,$t6,16
+ insrwi $t2,$t6,16,0 ; 64..95 bits
+ lwz $t6,20($tp)
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ lwzu $t7,16($tp)
+ addc $t0,$t0,$carry
+ adde $t1,$t1,$c1
+ srwi $carry,$t0,16
+ insrwi $carry,$t1,16,0
+ srwi $c1,$t1,16
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+ srwi $carry,$t4,16
+ insrwi $t0,$t4,16,0 ; 96..127 bits
+ insrwi $carry,$t5,16,0
+ srwi $c1,$t5,16
+
+ addc $t2,$t2,$t6
+ adde $t0,$t0,$t7
+ lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
+ lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
+ addze $carry,$carry
+ addze $c1,$c1
+ lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
+ lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
+
+ addc $t6,$t6,$carry
+ adde $t7,$t7,$c1
+ stw $t2,-4($tp) ; tp[j]
+ stw $t0,-8($tp)
+ addc $t6,$t6,$ovf
+ addze $t7,$t7
+ srwi $carry,$t6,16
+ insrwi $carry,$t7,16,0
+ srwi $c1,$t7,16
+ addc $t4,$t4,$carry
+ adde $t5,$t5,$c1
+
+ insrwi $t6,$t4,16,0
+ srwi $t4,$t4,16
+ insrwi $t4,$t5,16,0
+ srwi $ovf,$t5,16
+ stw $t6,4($tp) ; tp[num-1]
+ stw $t4,0($tp)
+___
+}
+$code.=<<___;
+ slwi $t7,$num,2
+ addi $i,$i,8
+ subf $nap_d,$t7,$nap_d ; rewind pointer
+ cmpw $i,$num
+ blt- Louter
+___
+
+$code.=<<___ if ($SIZE_T==8);
+ subf $np,$num,$np ; rewind np
+ addi $j,$j,1 ; restore counter
+ subfc $i,$i,$i ; j=0 and "clear" XER[CA]
+ addi $tp,$sp,`$FRAME+$TRANSFER+8`
+ addi $t4,$sp,`$FRAME+$TRANSFER+16`
+ addi $t5,$np,8
+ addi $t6,$rp,8
+ mtctr $j
+
+.align 4
+Lsub: ldx $t0,$tp,$i
+ ldx $t1,$np,$i
+ ldx $t2,$t4,$i
+ ldx $t3,$t5,$i
+ subfe $t0,$t1,$t0 ; tp[j]-np[j]
+ subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
+ stdx $t0,$rp,$i
+ stdx $t2,$t6,$i
+ addi $i,$i,16
+ bdnz Lsub
+
+ li $i,0
+ subfe $ovf,$i,$ovf ; handle upmost overflow bit
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
+ addi $t7,$ap,8
+ mtctr $j
+
+.align 4
+Lcopy: ; copy or in-place refresh
+ ldx $t0,$ap,$i
+ ldx $t1,$t7,$i
+ std $i,8($nap_d) ; zap nap_d
+ std $i,16($nap_d)
+ std $i,24($nap_d)
+ std $i,32($nap_d)
+ std $i,40($nap_d)
+ std $i,48($nap_d)
+ std $i,56($nap_d)
+ stdu $i,64($nap_d)
+ stdx $t0,$rp,$i
+ stdx $t1,$t6,$i
+ stdx $i,$tp,$i ; zap tp at once
+ stdx $i,$t4,$i
+ addi $i,$i,16
+ bdnz Lcopy
+___
+$code.=<<___ if ($SIZE_T==4);
+ subf $np,$num,$np ; rewind np
+ addi $j,$j,1 ; restore counter
+ subfc $i,$i,$i ; j=0 and "clear" XER[CA]
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ addi $np,$np,-4
+ addi $rp,$rp,-4
+ addi $ap,$sp,`$FRAME+$TRANSFER+4`
+ mtctr $j
+
+.align 4
+Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order
+ lwz $t1,8($tp)
+ lwz $t2,20($tp)
+ lwzu $t3,16($tp)
+ lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
+ lwz $t5,8($np)
+ lwz $t6,12($np)
+ lwzu $t7,16($np)
+ subfe $t4,$t4,$t0 ; tp[j]-np[j]
+ stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
+ subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
+ stw $t1,8($ap)
+ subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
+ stw $t2,12($ap)
+ subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
+ stwu $t3,16($ap)
+ stw $t4,4($rp)
+ stw $t5,8($rp)
+ stw $t6,12($rp)
+ stwu $t7,16($rp)
+ bdnz Lsub
+
+ li $i,0
+ subfe $ovf,$i,$ovf ; handle upmost overflow bit
+ addi $tp,$sp,`$FRAME+$TRANSFER+4`
+ subf $rp,$num,$rp ; rewind rp
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ mtctr $j
+
+.align 4
+Lcopy: ; copy or in-place refresh
+ lwz $t0,4($ap)
+ lwz $t1,8($ap)
+ lwz $t2,12($ap)
+ lwzu $t3,16($ap)
+ std $i,8($nap_d) ; zap nap_d
+ std $i,16($nap_d)
+ std $i,24($nap_d)
+ std $i,32($nap_d)
+ std $i,40($nap_d)
+ std $i,48($nap_d)
+ std $i,56($nap_d)
+ stdu $i,64($nap_d)
+ stw $t0,4($rp)
+ stw $t1,8($rp)
+ stw $t2,12($rp)
+ stwu $t3,16($rp)
+ std $i,8($tp) ; zap tp at once
+ stdu $i,16($tp)
+ bdnz Lcopy
+___
+
+$code.=<<___;
+ $POP $i,0($sp)
+ li r3,1 ; signal "handled"
+ $POP r19,`-12*8-13*$SIZE_T`($i)
+ $POP r20,`-12*8-12*$SIZE_T`($i)
+ $POP r21,`-12*8-11*$SIZE_T`($i)
+ $POP r22,`-12*8-10*$SIZE_T`($i)
+ $POP r23,`-12*8-9*$SIZE_T`($i)
+ $POP r24,`-12*8-8*$SIZE_T`($i)
+ $POP r25,`-12*8-7*$SIZE_T`($i)
+ $POP r26,`-12*8-6*$SIZE_T`($i)
+ $POP r27,`-12*8-5*$SIZE_T`($i)
+ $POP r28,`-12*8-4*$SIZE_T`($i)
+ $POP r29,`-12*8-3*$SIZE_T`($i)
+ $POP r30,`-12*8-2*$SIZE_T`($i)
+ $POP r31,`-12*8-1*$SIZE_T`($i)
+ lfd f20,`-12*8`($i)
+ lfd f21,`-11*8`($i)
+ lfd f22,`-10*8`($i)
+ lfd f23,`-9*8`($i)
+ lfd f24,`-8*8`($i)
+ lfd f25,`-7*8`($i)
+ lfd f26,`-6*8`($i)
+ lfd f27,`-5*8`($i)
+ lfd f28,`-4*8`($i)
+ lfd f29,`-3*8`($i)
+ lfd f30,`-2*8`($i)
+ lfd f31,`-1*8`($i)
+ mr $sp,$i
+ blr
+ .long 0
+ .byte 0,12,4,0,0x8c,13,6,0
+ .long 0
+.size .$fname,.-.$fname
+
+.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/rsaz-avx2.pl b/openssl-1.1.0h/crypto/bn/asm/rsaz-avx2.pl
new file mode 100755
index 0000000..46d746b
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/rsaz-avx2.pl
@@ -0,0 +1,1967 @@
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+##############################################################################
+# #
+# Copyright (c) 2012, Intel Corporation #
+# #
+# All rights reserved. #
+# #
+# Redistribution and use in source and binary forms, with or without #
+# modification, are permitted provided that the following conditions are #
+# met: #
+# #
+# * Redistributions of source code must retain the above copyright #
+# notice, this list of conditions and the following disclaimer. #
+# #
+# * Redistributions in binary form must reproduce the above copyright #
+# notice, this list of conditions and the following disclaimer in the #
+# documentation and/or other materials provided with the #
+# distribution. #
+# #
+# * Neither the name of the Intel Corporation nor the names of its #
+# contributors may be used to endorse or promote products derived from #
+# this software without specific prior written permission. #
+# #
+# #
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
+# #
+##############################################################################
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Corporation, Israel Development Center, Haifa, Israel #
+# (2) University of Haifa, Israel #
+##############################################################################
+# Reference: #
+# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
+# Exponentiation, Using Advanced Vector Instructions Architectures", #
+# F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
+# pp. 119?135, 2012. Springer-Verlag Berlin Heidelberg 2012 #
+# [2] S. Gueron: "Efficient Software Implementations of Modular #
+# Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
+# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE #
+# Proceedings of 9th International Conference on Information Technology: #
+# New Generations (ITNG 2012), pp.821-823 (2012) #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
+# resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
+# on AVX2 capable x86_64 platforms", #
+# http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
+##############################################################################
+#
+# +13% improvement over original submission by <appro@openssl.org>
+#
+# rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
+# 2.3GHz Haswell 621 765/+23% 1113/+79%
+# 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63%
+#
+# (*) if system doesn't support AVX2, for reference purposes;
+# (**) scaled to 2.3GHz to simplify comparison;
+# (***) scalar AD*X code is faster than AVX2 and is preferred code
+# path for Broadwell;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+ $addx = ($1>=2.23);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+ $addx = ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+ $addx = ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $avx = ($ver>=3.0) + ($ver>=3.01);
+ $addx = ($ver>=3.03);
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT = *OUT;
+
+if ($avx>1) {{{
+{ # void AMS_WW(
+my $rp="%rdi"; # BN_ULONG *rp,
+my $ap="%rsi"; # const BN_ULONG *ap,
+my $np="%rdx"; # const BN_ULONG *np,
+my $n0="%ecx"; # const BN_ULONG n0,
+my $rep="%r8d"; # int repeat);
+
+# The registers that hold the accumulated redundant result
+# The AMM works on 1024 bit operands, and redundant word size is 29
+# Therefore: ceil(1024/29)/4 = 9
+my $ACC0="%ymm0";
+my $ACC1="%ymm1";
+my $ACC2="%ymm2";
+my $ACC3="%ymm3";
+my $ACC4="%ymm4";
+my $ACC5="%ymm5";
+my $ACC6="%ymm6";
+my $ACC7="%ymm7";
+my $ACC8="%ymm8";
+my $ACC9="%ymm9";
+# Registers that hold the broadcasted words of bp, currently used
+my $B1="%ymm10";
+my $B2="%ymm11";
+# Registers that hold the broadcasted words of Y, currently used
+my $Y1="%ymm12";
+my $Y2="%ymm13";
+# Helper registers
+my $TEMP1="%ymm14";
+my $AND_MASK="%ymm15";
+# alu registers that hold the first words of the ACC
+my $r0="%r9";
+my $r1="%r10";
+my $r2="%r11";
+my $r3="%r12";
+
+my $i="%r14d"; # loop counter
+my $tmp = "%r15";
+
+my $FrameSize=32*18+32*8; # place for A^2 and 2*A
+
+my $aap=$r0;
+my $tp0="%rbx";
+my $tp1=$r3;
+my $tpa=$tmp;
+
+$np="%r13"; # reassigned argument
+
+$code.=<<___;
+.text
+
+.globl rsaz_1024_sqr_avx2
+.type rsaz_1024_sqr_avx2,\@function,5
+.align 64
+rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
+ lea (%rsp), %rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -0xa8(%rsp),%rsp
+ vmovaps %xmm6,-0xd8(%rax)
+ vmovaps %xmm7,-0xc8(%rax)
+ vmovaps %xmm8,-0xb8(%rax)
+ vmovaps %xmm9,-0xa8(%rax)
+ vmovaps %xmm10,-0x98(%rax)
+ vmovaps %xmm11,-0x88(%rax)
+ vmovaps %xmm12,-0x78(%rax)
+ vmovaps %xmm13,-0x68(%rax)
+ vmovaps %xmm14,-0x58(%rax)
+ vmovaps %xmm15,-0x48(%rax)
+.Lsqr_1024_body:
+___
+$code.=<<___;
+ mov %rax,%rbp
+ mov %rdx, $np # reassigned argument
+ sub \$$FrameSize, %rsp
+ mov $np, $tmp
+ sub \$-128, $rp # size optimization
+ sub \$-128, $ap
+ sub \$-128, $np
+
+ and \$4095, $tmp # see if $np crosses page
+ add \$32*10, $tmp
+ shr \$12, $tmp
+ vpxor $ACC9,$ACC9,$ACC9
+ jz .Lsqr_1024_no_n_copy
+
+ # unaligned 256-bit load that crosses page boundary can
+ # cause >2x performance degradation here, so if $np does
+ # cross page boundary, copy it to stack and make sure stack
+ # frame doesn't...
+ sub \$32*10,%rsp
+ vmovdqu 32*0-128($np), $ACC0
+ and \$-2048, %rsp
+ vmovdqu 32*1-128($np), $ACC1
+ vmovdqu 32*2-128($np), $ACC2
+ vmovdqu 32*3-128($np), $ACC3
+ vmovdqu 32*4-128($np), $ACC4
+ vmovdqu 32*5-128($np), $ACC5
+ vmovdqu 32*6-128($np), $ACC6
+ vmovdqu 32*7-128($np), $ACC7
+ vmovdqu 32*8-128($np), $ACC8
+ lea $FrameSize+128(%rsp),$np
+ vmovdqu $ACC0, 32*0-128($np)
+ vmovdqu $ACC1, 32*1-128($np)
+ vmovdqu $ACC2, 32*2-128($np)
+ vmovdqu $ACC3, 32*3-128($np)
+ vmovdqu $ACC4, 32*4-128($np)
+ vmovdqu $ACC5, 32*5-128($np)
+ vmovdqu $ACC6, 32*6-128($np)
+ vmovdqu $ACC7, 32*7-128($np)
+ vmovdqu $ACC8, 32*8-128($np)
+ vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
+
+.Lsqr_1024_no_n_copy:
+ and \$-1024, %rsp
+
+ vmovdqu 32*1-128($ap), $ACC1
+ vmovdqu 32*2-128($ap), $ACC2
+ vmovdqu 32*3-128($ap), $ACC3
+ vmovdqu 32*4-128($ap), $ACC4
+ vmovdqu 32*5-128($ap), $ACC5
+ vmovdqu 32*6-128($ap), $ACC6
+ vmovdqu 32*7-128($ap), $ACC7
+ vmovdqu 32*8-128($ap), $ACC8
+
+ lea 192(%rsp), $tp0 # 64+128=192
+ vmovdqu .Land_mask(%rip), $AND_MASK
+ jmp .LOOP_GRANDE_SQR_1024
+
+.align 32
+.LOOP_GRANDE_SQR_1024:
+ lea 32*18+128(%rsp), $aap # size optimization
+ lea 448(%rsp), $tp1 # 64+128+256=448
+
+ # the squaring is performed as described in Variant B of
+ # "Speeding up Big-Number Squaring", so start by calculating
+ # the A*2=A+A vector
+ vpaddq $ACC1, $ACC1, $ACC1
+ vpbroadcastq 32*0-128($ap), $B1
+ vpaddq $ACC2, $ACC2, $ACC2
+ vmovdqa $ACC1, 32*0-128($aap)
+ vpaddq $ACC3, $ACC3, $ACC3
+ vmovdqa $ACC2, 32*1-128($aap)
+ vpaddq $ACC4, $ACC4, $ACC4
+ vmovdqa $ACC3, 32*2-128($aap)
+ vpaddq $ACC5, $ACC5, $ACC5
+ vmovdqa $ACC4, 32*3-128($aap)
+ vpaddq $ACC6, $ACC6, $ACC6
+ vmovdqa $ACC5, 32*4-128($aap)
+ vpaddq $ACC7, $ACC7, $ACC7
+ vmovdqa $ACC6, 32*5-128($aap)
+ vpaddq $ACC8, $ACC8, $ACC8
+ vmovdqa $ACC7, 32*6-128($aap)
+ vpxor $ACC9, $ACC9, $ACC9
+ vmovdqa $ACC8, 32*7-128($aap)
+
+ vpmuludq 32*0-128($ap), $B1, $ACC0
+ vpbroadcastq 32*1-128($ap), $B2
+ vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
+ vpmuludq $B1, $ACC1, $ACC1
+ vmovdqu $ACC9, 32*10-448($tp1)
+ vpmuludq $B1, $ACC2, $ACC2
+ vmovdqu $ACC9, 32*11-448($tp1)
+ vpmuludq $B1, $ACC3, $ACC3
+ vmovdqu $ACC9, 32*12-448($tp1)
+ vpmuludq $B1, $ACC4, $ACC4
+ vmovdqu $ACC9, 32*13-448($tp1)
+ vpmuludq $B1, $ACC5, $ACC5
+ vmovdqu $ACC9, 32*14-448($tp1)
+ vpmuludq $B1, $ACC6, $ACC6
+ vmovdqu $ACC9, 32*15-448($tp1)
+ vpmuludq $B1, $ACC7, $ACC7
+ vmovdqu $ACC9, 32*16-448($tp1)
+ vpmuludq $B1, $ACC8, $ACC8
+ vpbroadcastq 32*2-128($ap), $B1
+ vmovdqu $ACC9, 32*17-448($tp1)
+
+ mov $ap, $tpa
+ mov \$4, $i
+ jmp .Lsqr_entry_1024
+___
+$TEMP0=$Y1;
+$TEMP2=$Y2;
+$code.=<<___;
+.align 32
+.LOOP_SQR_1024:
+ vpbroadcastq 32*1-128($tpa), $B2
+ vpmuludq 32*0-128($ap), $B1, $ACC0
+ vpaddq 32*0-192($tp0), $ACC0, $ACC0
+ vpmuludq 32*0-128($aap), $B1, $ACC1
+ vpaddq 32*1-192($tp0), $ACC1, $ACC1
+ vpmuludq 32*1-128($aap), $B1, $ACC2
+ vpaddq 32*2-192($tp0), $ACC2, $ACC2
+ vpmuludq 32*2-128($aap), $B1, $ACC3
+ vpaddq 32*3-192($tp0), $ACC3, $ACC3
+ vpmuludq 32*3-128($aap), $B1, $ACC4
+ vpaddq 32*4-192($tp0), $ACC4, $ACC4
+ vpmuludq 32*4-128($aap), $B1, $ACC5
+ vpaddq 32*5-192($tp0), $ACC5, $ACC5
+ vpmuludq 32*5-128($aap), $B1, $ACC6
+ vpaddq 32*6-192($tp0), $ACC6, $ACC6
+ vpmuludq 32*6-128($aap), $B1, $ACC7
+ vpaddq 32*7-192($tp0), $ACC7, $ACC7
+ vpmuludq 32*7-128($aap), $B1, $ACC8
+ vpbroadcastq 32*2-128($tpa), $B1
+ vpaddq 32*8-192($tp0), $ACC8, $ACC8
+.Lsqr_entry_1024:
+ vmovdqu $ACC0, 32*0-192($tp0)
+ vmovdqu $ACC1, 32*1-192($tp0)
+
+ vpmuludq 32*1-128($ap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC2, $ACC2
+ vpmuludq 32*1-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC3, $ACC3
+ vpmuludq 32*2-128($aap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC4, $ACC4
+ vpmuludq 32*3-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq 32*4-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC6, $ACC6
+ vpmuludq 32*5-128($aap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC7, $ACC7
+ vpmuludq 32*6-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq 32*7-128($aap), $B2, $ACC0
+ vpbroadcastq 32*3-128($tpa), $B2
+ vpaddq 32*9-192($tp0), $ACC0, $ACC0
+
+ vmovdqu $ACC2, 32*2-192($tp0)
+ vmovdqu $ACC3, 32*3-192($tp0)
+
+ vpmuludq 32*2-128($ap), $B1, $TEMP2
+ vpaddq $TEMP2, $ACC4, $ACC4
+ vpmuludq 32*2-128($aap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq 32*3-128($aap), $B1, $TEMP1
+ vpaddq $TEMP1, $ACC6, $ACC6
+ vpmuludq 32*4-128($aap), $B1, $TEMP2
+ vpaddq $TEMP2, $ACC7, $ACC7
+ vpmuludq 32*5-128($aap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq 32*6-128($aap), $B1, $TEMP1
+ vpaddq $TEMP1, $ACC0, $ACC0
+ vpmuludq 32*7-128($aap), $B1, $ACC1
+ vpbroadcastq 32*4-128($tpa), $B1
+ vpaddq 32*10-448($tp1), $ACC1, $ACC1
+
+ vmovdqu $ACC4, 32*4-192($tp0)
+ vmovdqu $ACC5, 32*5-192($tp0)
+
+ vpmuludq 32*3-128($ap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC6, $ACC6
+ vpmuludq 32*3-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC7, $ACC7
+ vpmuludq 32*4-128($aap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC8, $ACC8
+ vpmuludq 32*5-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpmuludq 32*6-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpmuludq 32*7-128($aap), $B2, $ACC2
+ vpbroadcastq 32*5-128($tpa), $B2
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
+
+ vmovdqu $ACC6, 32*6-192($tp0)
+ vmovdqu $ACC7, 32*7-192($tp0)
+
+ vpmuludq 32*4-128($ap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq 32*4-128($aap), $B1, $TEMP1
+ vpaddq $TEMP1, $ACC0, $ACC0
+ vpmuludq 32*5-128($aap), $B1, $TEMP2
+ vpaddq $TEMP2, $ACC1, $ACC1
+ vpmuludq 32*6-128($aap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC2, $ACC2
+ vpmuludq 32*7-128($aap), $B1, $ACC3
+ vpbroadcastq 32*6-128($tpa), $B1
+ vpaddq 32*12-448($tp1), $ACC3, $ACC3
+
+ vmovdqu $ACC8, 32*8-192($tp0)
+ vmovdqu $ACC0, 32*9-192($tp0)
+ lea 8($tp0), $tp0
+
+ vpmuludq 32*5-128($ap), $B2, $TEMP2
+ vpaddq $TEMP2, $ACC1, $ACC1
+ vpmuludq 32*5-128($aap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC2, $ACC2
+ vpmuludq 32*6-128($aap), $B2, $TEMP1
+ vpaddq $TEMP1, $ACC3, $ACC3
+ vpmuludq 32*7-128($aap), $B2, $ACC4
+ vpbroadcastq 32*7-128($tpa), $B2
+ vpaddq 32*13-448($tp1), $ACC4, $ACC4
+
+ vmovdqu $ACC1, 32*10-448($tp1)
+ vmovdqu $ACC2, 32*11-448($tp1)
+
+ vpmuludq 32*6-128($ap), $B1, $TEMP0
+ vpaddq $TEMP0, $ACC3, $ACC3
+ vpmuludq 32*6-128($aap), $B1, $TEMP1
+ vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
+ vpaddq $TEMP1, $ACC4, $ACC4
+ vpmuludq 32*7-128($aap), $B1, $ACC5
+ vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
+ vpaddq 32*14-448($tp1), $ACC5, $ACC5
+
+ vmovdqu $ACC3, 32*12-448($tp1)
+ vmovdqu $ACC4, 32*13-448($tp1)
+ lea 8($tpa), $tpa
+
+ vpmuludq 32*7-128($ap), $B2, $TEMP0
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq 32*7-128($aap), $B2, $ACC6
+ vpaddq 32*15-448($tp1), $ACC6, $ACC6
+
+ vpmuludq 32*8-128($ap), $ACC0, $ACC7
+ vmovdqu $ACC5, 32*14-448($tp1)
+ vpaddq 32*16-448($tp1), $ACC7, $ACC7
+ vmovdqu $ACC6, 32*15-448($tp1)
+ vmovdqu $ACC7, 32*16-448($tp1)
+ lea 8($tp1), $tp1
+
+ dec $i
+ jnz .LOOP_SQR_1024
+___
+$ZERO = $ACC9;
+$TEMP0 = $B1;
+$TEMP2 = $B2;
+$TEMP3 = $Y1;
+$TEMP4 = $Y2;
+$code.=<<___;
+ # we need to fix indices 32-39 to avoid overflow
+ vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
+ vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
+ vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
+ lea 192(%rsp), $tp0 # 64+128=192
+
+ vpsrlq \$29, $ACC8, $TEMP1
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpxor $ZERO, $ZERO, $ZERO
+ vpermq \$0x93, $TEMP2, $TEMP2
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpblendd \$3, $TEMP2, $ZERO, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vmovdqu $ACC1, 32*9-192($tp0)
+ vmovdqu $ACC2, 32*10-192($tp0)
+
+ mov (%rsp), %rax
+ mov 8(%rsp), $r1
+ mov 16(%rsp), $r2
+ mov 24(%rsp), $r3
+ vmovdqu 32*1(%rsp), $ACC1
+ vmovdqu 32*2-192($tp0), $ACC2
+ vmovdqu 32*3-192($tp0), $ACC3
+ vmovdqu 32*4-192($tp0), $ACC4
+ vmovdqu 32*5-192($tp0), $ACC5
+ vmovdqu 32*6-192($tp0), $ACC6
+ vmovdqu 32*7-192($tp0), $ACC7
+
+ mov %rax, $r0
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+ vmovd %eax, $Y1
+
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ vpbroadcastq $Y1, $Y1
+ add %rax, $r0
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ shr \$29, $r0
+ add %rax, $r1
+ mov %rdx, %rax
+ imulq 16-128($np), %rax
+ add $r0, $r1
+ add %rax, $r2
+ imulq 24-128($np), %rdx
+ add %rdx, $r3
+
+ mov $r1, %rax
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ mov \$9, $i
+ jmp .LOOP_REDUCE_1024
+
+.align 32
+.LOOP_REDUCE_1024:
+ vmovd %eax, $Y2
+ vpbroadcastq $Y2, $Y2
+
+ vpmuludq 32*1-128($np), $Y1, $TEMP0
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ vpaddq $TEMP0, $ACC1, $ACC1
+ add %rax, $r1
+ vpmuludq 32*2-128($np), $Y1, $TEMP1
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ vpaddq $TEMP1, $ACC2, $ACC2
+ vpmuludq 32*3-128($np), $Y1, $TEMP2
+ .byte 0x67
+ add %rax, $r2
+ .byte 0x67
+ mov %rdx, %rax
+ imulq 16-128($np), %rax
+ shr \$29, $r1
+ vpaddq $TEMP2, $ACC3, $ACC3
+ vpmuludq 32*4-128($np), $Y1, $TEMP0
+ add %rax, $r3
+ add $r1, $r2
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpmuludq 32*5-128($np), $Y1, $TEMP1
+ mov $r2, %rax
+ imull $n0, %eax
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpmuludq 32*6-128($np), $Y1, $TEMP2
+ and \$0x1fffffff, %eax
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpmuludq 32*7-128($np), $Y1, $TEMP0
+ vpaddq $TEMP0, $ACC7, $ACC7
+ vpmuludq 32*8-128($np), $Y1, $TEMP1
+ vmovd %eax, $Y1
+ #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
+ vpaddq $TEMP1, $ACC8, $ACC8
+ #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
+ vpbroadcastq $Y1, $Y1
+
+ vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
+ vmovdqu 32*3-8-128($np), $TEMP1
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ vpaddq $TEMP2, $ACC1, $ACC1
+ vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
+ vmovdqu 32*4-8-128($np), $TEMP2
+ add %rax, $r2
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ vpaddq $TEMP0, $ACC2, $ACC2
+ add $r3, %rax
+ shr \$29, $r2
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ vmovdqu 32*5-8-128($np), $TEMP0
+ add $r2, %rax
+ vpaddq $TEMP1, $ACC3, $ACC3
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*6-8-128($np), $TEMP1
+ .byte 0x67
+ mov %rax, $r3
+ imull $n0, %eax
+ vpaddq $TEMP2, $ACC4, $ACC4
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
+ and \$0x1fffffff, %eax
+ vpaddq $TEMP0, $ACC5, $ACC5
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ vmovdqu 32*8-8-128($np), $TEMP0
+ vpaddq $TEMP1, $ACC6, $ACC6
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*9-8-128($np), $ACC9
+ vmovd %eax, $ACC0 # borrow ACC0 for Y2
+ imulq -128($np), %rax
+ vpaddq $TEMP2, $ACC7, $ACC7
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ vmovdqu 32*1-16-128($np), $TEMP1
+ vpbroadcastq $ACC0, $ACC0
+ vpaddq $TEMP0, $ACC8, $ACC8
+ vpmuludq $Y2, $ACC9, $ACC9
+ vmovdqu 32*2-16-128($np), $TEMP2
+ add %rax, $r3
+
+___
+($ACC0,$Y2)=($Y2,$ACC0);
+$code.=<<___;
+ vmovdqu 32*1-24-128($np), $ACC0
+ vpmuludq $Y1, $TEMP1, $TEMP1
+ vmovdqu 32*3-16-128($np), $TEMP0
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpmuludq $Y2, $ACC0, $ACC0
+ vpmuludq $Y1, $TEMP2, $TEMP2
+ .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
+ vpaddq $ACC1, $ACC0, $ACC0
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpmuludq $Y1, $TEMP0, $TEMP0
+ vmovdqu 32*5-16-128($np), $TEMP2
+ .byte 0x67
+ vmovq $ACC0, %rax
+ vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
+ vpaddq $TEMP0, $ACC3, $ACC3
+ vpmuludq $Y1, $TEMP1, $TEMP1
+ vmovdqu 32*6-16-128($np), $TEMP0
+ vpaddq $TEMP1, $ACC4, $ACC4
+ vpmuludq $Y1, $TEMP2, $TEMP2
+ vmovdqu 32*7-16-128($np), $TEMP1
+ vpaddq $TEMP2, $ACC5, $ACC5
+ vpmuludq $Y1, $TEMP0, $TEMP0
+ vmovdqu 32*8-16-128($np), $TEMP2
+ vpaddq $TEMP0, $ACC6, $ACC6
+ vpmuludq $Y1, $TEMP1, $TEMP1
+ shr \$29, $r3
+ vmovdqu 32*9-16-128($np), $TEMP0
+ add $r3, %rax
+ vpaddq $TEMP1, $ACC7, $ACC7
+ vpmuludq $Y1, $TEMP2, $TEMP2
+ #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
+ mov %rax, $r0
+ imull $n0, %eax
+ vpaddq $TEMP2, $ACC8, $ACC8
+ vpmuludq $Y1, $TEMP0, $TEMP0
+ and \$0x1fffffff, %eax
+ vmovd %eax, $Y1
+ vmovdqu 32*3-24-128($np), $TEMP2
+ .byte 0x67
+ vpaddq $TEMP0, $ACC9, $ACC9
+ vpbroadcastq $Y1, $Y1
+
+ vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
+ vmovdqu 32*4-24-128($np), $TEMP0
+ mov %rax, %rdx
+ imulq -128($np), %rax
+ mov 8(%rsp), $r1
+ vpaddq $TEMP1, $ACC2, $ACC1
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*5-24-128($np), $TEMP1
+ add %rax, $r0
+ mov %rdx, %rax
+ imulq 8-128($np), %rax
+ .byte 0x67
+ shr \$29, $r0
+ mov 16(%rsp), $r2
+ vpaddq $TEMP2, $ACC3, $ACC2
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ vmovdqu 32*6-24-128($np), $TEMP2
+ add %rax, $r1
+ mov %rdx, %rax
+ imulq 16-128($np), %rax
+ vpaddq $TEMP0, $ACC4, $ACC3
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ vmovdqu 32*7-24-128($np), $TEMP0
+ imulq 24-128($np), %rdx # future $r3
+ add %rax, $r2
+ lea ($r0,$r1), %rax
+ vpaddq $TEMP1, $ACC5, $ACC4
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vmovdqu 32*8-24-128($np), $TEMP1
+ mov %rax, $r1
+ imull $n0, %eax
+ vpmuludq $Y2, $TEMP0, $TEMP0
+ vpaddq $TEMP2, $ACC6, $ACC5
+ vmovdqu 32*9-24-128($np), $TEMP2
+ and \$0x1fffffff, %eax
+ vpaddq $TEMP0, $ACC7, $ACC6
+ vpmuludq $Y2, $TEMP1, $TEMP1
+ add 24(%rsp), %rdx
+ vpaddq $TEMP1, $ACC8, $ACC7
+ vpmuludq $Y2, $TEMP2, $TEMP2
+ vpaddq $TEMP2, $ACC9, $ACC8
+ vmovq $r3, $ACC9
+ mov %rdx, $r3
+
+ dec $i
+ jnz .LOOP_REDUCE_1024
+___
+($ACC0,$Y2)=($Y2,$ACC0);
+$code.=<<___;
+ lea 448(%rsp), $tp1 # size optimization
+ vpaddq $ACC9, $Y2, $ACC0
+ vpxor $ZERO, $ZERO, $ZERO
+
+ vpaddq 32*9-192($tp0), $ACC0, $ACC0
+ vpaddq 32*10-448($tp1), $ACC1, $ACC1
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
+ vpaddq 32*12-448($tp1), $ACC3, $ACC3
+ vpaddq 32*13-448($tp1), $ACC4, $ACC4
+ vpaddq 32*14-448($tp1), $ACC5, $ACC5
+ vpaddq 32*15-448($tp1), $ACC6, $ACC6
+ vpaddq 32*16-448($tp1), $ACC7, $ACC7
+ vpaddq 32*17-448($tp1), $ACC8, $ACC8
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+ vpermq \$0x93, $TEMP3, $TEMP3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vpaddq $TEMP4, $ACC4, $ACC4
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+ vpermq \$0x93, $TEMP3, $TEMP3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vmovdqu $ACC0, 32*0-128($rp)
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vmovdqu $ACC1, 32*1-128($rp)
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vmovdqu $ACC2, 32*2-128($rp)
+ vpaddq $TEMP4, $ACC4, $ACC4
+ vmovdqu $ACC3, 32*3-128($rp)
+___
+$TEMP5=$ACC0;
+$code.=<<___;
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vpaddq $TEMP4, $ACC8, $ACC8
+
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vmovdqu $ACC4, 32*4-128($rp)
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vmovdqu $ACC5, 32*5-128($rp)
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vmovdqu $ACC6, 32*6-128($rp)
+ vpaddq $TEMP4, $ACC8, $ACC8
+ vmovdqu $ACC7, 32*7-128($rp)
+ vmovdqu $ACC8, 32*8-128($rp)
+
+ mov $rp, $ap
+ dec $rep
+ jne .LOOP_GRANDE_SQR_1024
+
+ vzeroall
+ mov %rbp, %rax
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lsqr_1024_epilogue:
+ ret
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+___
+}
+
+{ # void AMM_WW(
+my $rp="%rdi"; # BN_ULONG *rp,
+my $ap="%rsi"; # const BN_ULONG *ap,
+my $bp="%rdx"; # const BN_ULONG *bp,
+my $np="%rcx"; # const BN_ULONG *np,
+my $n0="%r8d"; # unsigned int n0);
+
+# The registers that hold the accumulated redundant result
+# The AMM works on 1024 bit operands, and redundant word size is 29
+# Therefore: ceil(1024/29)/4 = 9
+my $ACC0="%ymm0";
+my $ACC1="%ymm1";
+my $ACC2="%ymm2";
+my $ACC3="%ymm3";
+my $ACC4="%ymm4";
+my $ACC5="%ymm5";
+my $ACC6="%ymm6";
+my $ACC7="%ymm7";
+my $ACC8="%ymm8";
+my $ACC9="%ymm9";
+
+# Registers that hold the broadcasted words of multiplier, currently used
+my $Bi="%ymm10";
+my $Yi="%ymm11";
+
+# Helper registers
+my $TEMP0=$ACC0;
+my $TEMP1="%ymm12";
+my $TEMP2="%ymm13";
+my $ZERO="%ymm14";
+my $AND_MASK="%ymm15";
+
+# alu registers that hold the first words of the ACC
+my $r0="%r9";
+my $r1="%r10";
+my $r2="%r11";
+my $r3="%r12";
+
+my $i="%r14d";
+my $tmp="%r15";
+
+$bp="%r13"; # reassigned argument
+
+$code.=<<___;
+.globl rsaz_1024_mul_avx2
+.type rsaz_1024_mul_avx2,\@function,5
+.align 64
+rsaz_1024_mul_avx2:
+ lea (%rsp), %rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ vzeroupper
+ lea -0xa8(%rsp),%rsp
+ vmovaps %xmm6,-0xd8(%rax)
+ vmovaps %xmm7,-0xc8(%rax)
+ vmovaps %xmm8,-0xb8(%rax)
+ vmovaps %xmm9,-0xa8(%rax)
+ vmovaps %xmm10,-0x98(%rax)
+ vmovaps %xmm11,-0x88(%rax)
+ vmovaps %xmm12,-0x78(%rax)
+ vmovaps %xmm13,-0x68(%rax)
+ vmovaps %xmm14,-0x58(%rax)
+ vmovaps %xmm15,-0x48(%rax)
+.Lmul_1024_body:
+___
+$code.=<<___;
+ mov %rax,%rbp
+ vzeroall
+ mov %rdx, $bp # reassigned argument
+ sub \$64,%rsp
+
+ # unaligned 256-bit load that crosses page boundary can
+ # cause severe performance degradation here, so if $ap does
+ # cross page boundary, swap it with $bp [meaning that caller
+ # is advised to lay down $ap and $bp next to each other, so
+ # that only one can cross page boundary].
+ .byte 0x67,0x67
+ mov $ap, $tmp
+ and \$4095, $tmp
+ add \$32*10, $tmp
+ shr \$12, $tmp
+ mov $ap, $tmp
+ cmovnz $bp, $ap
+ cmovnz $tmp, $bp
+
+ mov $np, $tmp
+ sub \$-128,$ap # size optimization
+ sub \$-128,$np
+ sub \$-128,$rp
+
+ and \$4095, $tmp # see if $np crosses page
+ add \$32*10, $tmp
+ .byte 0x67,0x67
+ shr \$12, $tmp
+ jz .Lmul_1024_no_n_copy
+
+ # unaligned 256-bit load that crosses page boundary can
+ # cause severe performance degradation here, so if $np does
+ # cross page boundary, copy it to stack and make sure stack
+ # frame doesn't...
+ sub \$32*10,%rsp
+ vmovdqu 32*0-128($np), $ACC0
+ and \$-512, %rsp
+ vmovdqu 32*1-128($np), $ACC1
+ vmovdqu 32*2-128($np), $ACC2
+ vmovdqu 32*3-128($np), $ACC3
+ vmovdqu 32*4-128($np), $ACC4
+ vmovdqu 32*5-128($np), $ACC5
+ vmovdqu 32*6-128($np), $ACC6
+ vmovdqu 32*7-128($np), $ACC7
+ vmovdqu 32*8-128($np), $ACC8
+ lea 64+128(%rsp),$np
+ vmovdqu $ACC0, 32*0-128($np)
+ vpxor $ACC0, $ACC0, $ACC0
+ vmovdqu $ACC1, 32*1-128($np)
+ vpxor $ACC1, $ACC1, $ACC1
+ vmovdqu $ACC2, 32*2-128($np)
+ vpxor $ACC2, $ACC2, $ACC2
+ vmovdqu $ACC3, 32*3-128($np)
+ vpxor $ACC3, $ACC3, $ACC3
+ vmovdqu $ACC4, 32*4-128($np)
+ vpxor $ACC4, $ACC4, $ACC4
+ vmovdqu $ACC5, 32*5-128($np)
+ vpxor $ACC5, $ACC5, $ACC5
+ vmovdqu $ACC6, 32*6-128($np)
+ vpxor $ACC6, $ACC6, $ACC6
+ vmovdqu $ACC7, 32*7-128($np)
+ vpxor $ACC7, $ACC7, $ACC7
+ vmovdqu $ACC8, 32*8-128($np)
+ vmovdqa $ACC0, $ACC8
+ vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
+.Lmul_1024_no_n_copy:
+ and \$-64,%rsp
+
+ mov ($bp), %rbx
+ vpbroadcastq ($bp), $Bi
+ vmovdqu $ACC0, (%rsp) # clear top of stack
+ xor $r0, $r0
+ .byte 0x67
+ xor $r1, $r1
+ xor $r2, $r2
+ xor $r3, $r3
+
+ vmovdqu .Land_mask(%rip), $AND_MASK
+ mov \$9, $i
+ vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
+ jmp .Loop_mul_1024
+
+.align 32
+.Loop_mul_1024:
+ vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
+ mov %rbx, %rax
+ imulq -128($ap), %rax
+ add $r0, %rax
+ mov %rbx, $r1
+ imulq 8-128($ap), $r1
+ add 8(%rsp), $r1
+
+ mov %rax, $r0
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ mov %rbx, $r2
+ imulq 16-128($ap), $r2
+ add 16(%rsp), $r2
+
+ mov %rbx, $r3
+ imulq 24-128($ap), $r3
+ add 24(%rsp), $r3
+ vpmuludq 32*1-128($ap),$Bi,$TEMP0
+ vmovd %eax, $Yi
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq 32*2-128($ap),$Bi,$TEMP1
+ vpbroadcastq $Yi, $Yi
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq 32*3-128($ap),$Bi,$TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq 32*4-128($ap),$Bi,$TEMP0
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq 32*5-128($ap),$Bi,$TEMP1
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq 32*6-128($ap),$Bi,$TEMP2
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq 32*7-128($ap),$Bi,$TEMP0
+ vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq 32*8-128($ap),$Bi,$TEMP1
+ vpbroadcastq 8($bp), $Bi
+ vpaddq $TEMP1,$ACC8,$ACC8
+
+ mov %rax,%rdx
+ imulq -128($np),%rax
+ add %rax,$r0
+ mov %rdx,%rax
+ imulq 8-128($np),%rax
+ add %rax,$r1
+ mov %rdx,%rax
+ imulq 16-128($np),%rax
+ add %rax,$r2
+ shr \$29, $r0
+ imulq 24-128($np),%rdx
+ add %rdx,$r3
+ add $r0, $r1
+
+ vpmuludq 32*1-128($np),$Yi,$TEMP2
+ vmovq $Bi, %rbx
+ vpaddq $TEMP2,$ACC1,$ACC1
+ vpmuludq 32*2-128($np),$Yi,$TEMP0
+ vpaddq $TEMP0,$ACC2,$ACC2
+ vpmuludq 32*3-128($np),$Yi,$TEMP1
+ vpaddq $TEMP1,$ACC3,$ACC3
+ vpmuludq 32*4-128($np),$Yi,$TEMP2
+ vpaddq $TEMP2,$ACC4,$ACC4
+ vpmuludq 32*5-128($np),$Yi,$TEMP0
+ vpaddq $TEMP0,$ACC5,$ACC5
+ vpmuludq 32*6-128($np),$Yi,$TEMP1
+ vpaddq $TEMP1,$ACC6,$ACC6
+ vpmuludq 32*7-128($np),$Yi,$TEMP2
+ vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3
+ vpaddq $TEMP2,$ACC7,$ACC7
+ vpmuludq 32*8-128($np),$Yi,$TEMP0
+ vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3
+ vpaddq $TEMP0,$ACC8,$ACC8
+
+ mov %rbx, %rax
+ imulq -128($ap),%rax
+ add %rax,$r1
+ vmovdqu -8+32*1-128($ap),$TEMP1
+ mov %rbx, %rax
+ imulq 8-128($ap),%rax
+ add %rax,$r2
+ vmovdqu -8+32*2-128($ap),$TEMP2
+
+ mov $r1, %rax
+ vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3
+ imull $n0, %eax
+ vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3
+ and \$0x1fffffff, %eax
+
+ imulq 16-128($ap),%rbx
+ add %rbx,$r3
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovd %eax, $Yi
+ vmovdqu -8+32*3-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC1,$ACC1
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpbroadcastq $Yi, $Yi
+ vmovdqu -8+32*4-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC2,$ACC2
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -8+32*5-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC3,$ACC3
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -8+32*6-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC4,$ACC4
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -8+32*7-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC5,$ACC5
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -8+32*8-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC6,$ACC6
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -8+32*9-128($ap),$ACC9
+ vpaddq $TEMP1,$ACC7,$ACC7
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpaddq $TEMP2,$ACC8,$ACC8
+ vpmuludq $Bi,$ACC9,$ACC9
+ vpbroadcastq 16($bp), $Bi
+
+ mov %rax,%rdx
+ imulq -128($np),%rax
+ add %rax,$r1
+ vmovdqu -8+32*1-128($np),$TEMP0
+ mov %rdx,%rax
+ imulq 8-128($np),%rax
+ add %rax,$r2
+ vmovdqu -8+32*2-128($np),$TEMP1
+ shr \$29, $r1
+ imulq 16-128($np),%rdx
+ add %rdx,$r3
+ add $r1, $r2
+
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovq $Bi, %rbx
+ vmovdqu -8+32*3-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -8+32*4-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -8+32*5-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -8+32*6-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -8+32*7-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -8+32*8-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -8+32*9-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vpaddq $TEMP2,$ACC9,$ACC9
+
+ vmovdqu -16+32*1-128($ap),$TEMP0
+ mov %rbx,%rax
+ imulq -128($ap),%rax
+ add $r2,%rax
+
+ vmovdqu -16+32*2-128($ap),$TEMP1
+ mov %rax,$r2
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ imulq 8-128($ap),%rbx
+ add %rbx,$r3
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovd %eax, $Yi
+ vmovdqu -16+32*3-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpbroadcastq $Yi, $Yi
+ vmovdqu -16+32*4-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -16+32*5-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -16+32*6-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -16+32*7-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -16+32*8-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -16+32*9-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpbroadcastq 24($bp), $Bi
+ vpaddq $TEMP2,$ACC9,$ACC9
+
+ vmovdqu -16+32*1-128($np),$TEMP0
+ mov %rax,%rdx
+ imulq -128($np),%rax
+ add %rax,$r2
+ vmovdqu -16+32*2-128($np),$TEMP1
+ imulq 8-128($np),%rdx
+ add %rdx,$r3
+ shr \$29, $r2
+
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovq $Bi, %rbx
+ vmovdqu -16+32*3-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -16+32*4-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -16+32*5-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -16+32*6-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -16+32*7-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -16+32*8-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -16+32*9-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -24+32*1-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -24+32*2-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC9,$ACC9
+
+ add $r2, $r3
+ imulq -128($ap),%rbx
+ add %rbx,$r3
+
+ mov $r3, %rax
+ imull $n0, %eax
+ and \$0x1fffffff, %eax
+
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovd %eax, $Yi
+ vmovdqu -24+32*3-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC1
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpbroadcastq $Yi, $Yi
+ vmovdqu -24+32*4-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC2,$ACC2
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -24+32*5-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC3
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -24+32*6-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC4
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vmovdqu -24+32*7-128($ap),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC5
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vmovdqu -24+32*8-128($ap),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC6
+ vpmuludq $Bi,$TEMP0,$TEMP0
+ vmovdqu -24+32*9-128($ap),$TEMP2
+ vpaddq $TEMP0,$ACC7,$ACC7
+ vpmuludq $Bi,$TEMP1,$TEMP1
+ vpaddq $TEMP1,$ACC8,$ACC8
+ vpmuludq $Bi,$TEMP2,$TEMP2
+ vpbroadcastq 32($bp), $Bi
+ vpaddq $TEMP2,$ACC9,$ACC9
+ add \$32, $bp # $bp++
+
+ vmovdqu -24+32*1-128($np),$TEMP0
+ imulq -128($np),%rax
+ add %rax,$r3
+ shr \$29, $r3
+
+ vmovdqu -24+32*2-128($np),$TEMP1
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovq $Bi, %rbx
+ vmovdqu -24+32*3-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
+ vpaddq $TEMP1,$ACC2,$ACC1
+ vmovdqu -24+32*4-128($np),$TEMP0
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -24+32*5-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC3,$ACC2
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -24+32*6-128($np),$TEMP2
+ vpaddq $TEMP0,$ACC4,$ACC3
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ vmovdqu -24+32*7-128($np),$TEMP0
+ vpaddq $TEMP1,$ACC5,$ACC4
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovdqu -24+32*8-128($np),$TEMP1
+ vpaddq $TEMP2,$ACC6,$ACC5
+ vpmuludq $Yi,$TEMP0,$TEMP0
+ vmovdqu -24+32*9-128($np),$TEMP2
+ mov $r3, $r0
+ vpaddq $TEMP0,$ACC7,$ACC6
+ vpmuludq $Yi,$TEMP1,$TEMP1
+ add (%rsp), $r0
+ vpaddq $TEMP1,$ACC8,$ACC7
+ vpmuludq $Yi,$TEMP2,$TEMP2
+ vmovq $r3, $TEMP1
+ vpaddq $TEMP2,$ACC9,$ACC8
+
+ dec $i
+ jnz .Loop_mul_1024
+___
+
+# (*) Original implementation was correcting ACC1-ACC3 for overflow
+# after 7 loop runs, or after 28 iterations, or 56 additions.
+# But as we underutilize resources, it's possible to correct in
+# each iteration with marginal performance loss. But then, as
+# we do it in each iteration, we can correct less digits, and
+# avoid performance penalties completely.
+
+$TEMP0 = $ACC9;
+$TEMP3 = $Bi;
+$TEMP4 = $Yi;
+$code.=<<___;
+ vpaddq (%rsp), $TEMP1, $ACC0
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vpaddq $TEMP4, $ACC4, $ACC4
+
+ vpsrlq \$29, $ACC0, $TEMP1
+ vpand $AND_MASK, $ACC0, $ACC0
+ vpsrlq \$29, $ACC1, $TEMP2
+ vpand $AND_MASK, $ACC1, $ACC1
+ vpsrlq \$29, $ACC2, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC2, $ACC2
+ vpsrlq \$29, $ACC3, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC3, $ACC3
+ vpermq \$0x93, $TEMP3, $TEMP3
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP4, $TEMP4
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC0, $ACC0
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC1, $ACC1
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC2, $ACC2
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
+ vpaddq $TEMP3, $ACC3, $ACC3
+ vpaddq $TEMP4, $ACC4, $ACC4
+
+ vmovdqu $ACC0, 0-128($rp)
+ vmovdqu $ACC1, 32-128($rp)
+ vmovdqu $ACC2, 64-128($rp)
+ vmovdqu $ACC3, 96-128($rp)
+___
+
+$TEMP5=$ACC0;
+$code.=<<___;
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vpaddq $TEMP4, $ACC8, $ACC8
+
+ vpsrlq \$29, $ACC4, $TEMP1
+ vpand $AND_MASK, $ACC4, $ACC4
+ vpsrlq \$29, $ACC5, $TEMP2
+ vpand $AND_MASK, $ACC5, $ACC5
+ vpsrlq \$29, $ACC6, $TEMP3
+ vpermq \$0x93, $TEMP1, $TEMP1
+ vpand $AND_MASK, $ACC6, $ACC6
+ vpsrlq \$29, $ACC7, $TEMP4
+ vpermq \$0x93, $TEMP2, $TEMP2
+ vpand $AND_MASK, $ACC7, $ACC7
+ vpsrlq \$29, $ACC8, $TEMP5
+ vpermq \$0x93, $TEMP3, $TEMP3
+ vpand $AND_MASK, $ACC8, $ACC8
+ vpermq \$0x93, $TEMP4, $TEMP4
+
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
+ vpermq \$0x93, $TEMP5, $TEMP5
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
+ vpaddq $TEMP0, $ACC4, $ACC4
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
+ vpaddq $TEMP1, $ACC5, $ACC5
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
+ vpaddq $TEMP2, $ACC6, $ACC6
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
+ vpaddq $TEMP3, $ACC7, $ACC7
+ vpaddq $TEMP4, $ACC8, $ACC8
+
+ vmovdqu $ACC4, 128-128($rp)
+ vmovdqu $ACC5, 160-128($rp)
+ vmovdqu $ACC6, 192-128($rp)
+ vmovdqu $ACC7, 224-128($rp)
+ vmovdqu $ACC8, 256-128($rp)
+ vzeroupper
+
+ mov %rbp, %rax
+___
+$code.=<<___ if ($win64);
+ movaps -0xd8(%rax),%xmm6
+ movaps -0xc8(%rax),%xmm7
+ movaps -0xb8(%rax),%xmm8
+ movaps -0xa8(%rax),%xmm9
+ movaps -0x98(%rax),%xmm10
+ movaps -0x88(%rax),%xmm11
+ movaps -0x78(%rax),%xmm12
+ movaps -0x68(%rax),%xmm13
+ movaps -0x58(%rax),%xmm14
+ movaps -0x48(%rax),%xmm15
+___
+$code.=<<___;
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp # restore %rsp
+.Lmul_1024_epilogue:
+ ret
+.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
+___
+}
+{
+my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
+my @T = map("%r$_",(8..11));
+
+$code.=<<___;
+.globl rsaz_1024_red2norm_avx2
+.type rsaz_1024_red2norm_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_red2norm_avx2:
+ sub \$-128,$inp # size optimization
+ xor %rax,%rax
+___
+
+for ($j=0,$i=0; $i<16; $i++) {
+ my $k=0;
+ while (29*$j<64*($i+1)) { # load data till boundary
+ $code.=" mov `8*$j-128`($inp), @T[0]\n";
+ $j++; $k++; push(@T,shift(@T));
+ }
+ $l=$k;
+ while ($k>1) { # shift loaded data but last value
+ $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
+ $k--;
+ }
+ $code.=<<___; # shift last value
+ mov @T[-1], @T[0]
+ shl \$`29*($j-1)`, @T[-1]
+ shr \$`-29*($j-1)`, @T[0]
+___
+ while ($l) { # accumulate all values
+ $code.=" add @T[-$l], %rax\n";
+ $l--;
+ }
+ $code.=<<___;
+ adc \$0, @T[0] # consume eventual carry
+ mov %rax, 8*$i($out)
+ mov @T[0], %rax
+___
+ push(@T,shift(@T));
+}
+$code.=<<___;
+ ret
+.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
+
+.globl rsaz_1024_norm2red_avx2
+.type rsaz_1024_norm2red_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_norm2red_avx2:
+ sub \$-128,$out # size optimization
+ mov ($inp),@T[0]
+ mov \$0x1fffffff,%eax
+___
+for ($j=0,$i=0; $i<16; $i++) {
+ $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
+ $code.=" xor @T[1],@T[1]\n" if ($i==15);
+ my $k=1;
+ while (29*($j+1)<64*($i+1)) {
+ $code.=<<___;
+ mov @T[0],@T[-$k]
+ shr \$`29*$j`,@T[-$k]
+ and %rax,@T[-$k] # &0x1fffffff
+ mov @T[-$k],`8*$j-128`($out)
+___
+ $j++; $k++;
+ }
+ $code.=<<___;
+ shrd \$`29*$j`,@T[1],@T[0]
+ and %rax,@T[0]
+ mov @T[0],`8*$j-128`($out)
+___
+ $j++;
+ push(@T,shift(@T));
+}
+$code.=<<___;
+ mov @T[0],`8*$j-128`($out) # zero
+ mov @T[0],`8*($j+1)-128`($out)
+ mov @T[0],`8*($j+2)-128`($out)
+ mov @T[0],`8*($j+3)-128`($out)
+ ret
+.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
+___
+}
+{
+my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
+
+$code.=<<___;
+.globl rsaz_1024_scatter5_avx2
+.type rsaz_1024_scatter5_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_scatter5_avx2:
+ vzeroupper
+ vmovdqu .Lscatter_permd(%rip),%ymm5
+ shl \$4,$power
+ lea ($out,$power),$out
+ mov \$9,%eax
+ jmp .Loop_scatter_1024
+
+.align 32
+.Loop_scatter_1024:
+ vmovdqu ($inp),%ymm0
+ lea 32($inp),$inp
+ vpermd %ymm0,%ymm5,%ymm0
+ vmovdqu %xmm0,($out)
+ lea 16*32($out),$out
+ dec %eax
+ jnz .Loop_scatter_1024
+
+ vzeroupper
+ ret
+.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
+
+.globl rsaz_1024_gather5_avx2
+.type rsaz_1024_gather5_avx2,\@abi-omnipotent
+.align 32
+rsaz_1024_gather5_avx2:
+ vzeroupper
+ mov %rsp,%r11
+___
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
+.LSEH_begin_rsaz_1024_gather5:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax)
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax)
+ .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax)
+ .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax)
+ .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax)
+ .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax)
+ .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax)
+ .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax)
+ .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+ lea -0x100(%rsp),%rsp
+ and \$-32, %rsp
+ lea .Linc(%rip), %r10
+ lea -128(%rsp),%rax # control u-op density
+
+ vmovd $power, %xmm4
+ vmovdqa (%r10),%ymm0
+ vmovdqa 32(%r10),%ymm1
+ vmovdqa 64(%r10),%ymm5
+ vpbroadcastd %xmm4,%ymm4
+
+ vpaddd %ymm5, %ymm0, %ymm2
+ vpcmpeqd %ymm4, %ymm0, %ymm0
+ vpaddd %ymm5, %ymm1, %ymm3
+ vpcmpeqd %ymm4, %ymm1, %ymm1
+ vmovdqa %ymm0, 32*0+128(%rax)
+ vpaddd %ymm5, %ymm2, %ymm0
+ vpcmpeqd %ymm4, %ymm2, %ymm2
+ vmovdqa %ymm1, 32*1+128(%rax)
+ vpaddd %ymm5, %ymm3, %ymm1
+ vpcmpeqd %ymm4, %ymm3, %ymm3
+ vmovdqa %ymm2, 32*2+128(%rax)
+ vpaddd %ymm5, %ymm0, %ymm2
+ vpcmpeqd %ymm4, %ymm0, %ymm0
+ vmovdqa %ymm3, 32*3+128(%rax)
+ vpaddd %ymm5, %ymm1, %ymm3
+ vpcmpeqd %ymm4, %ymm1, %ymm1
+ vmovdqa %ymm0, 32*4+128(%rax)
+ vpaddd %ymm5, %ymm2, %ymm8
+ vpcmpeqd %ymm4, %ymm2, %ymm2
+ vmovdqa %ymm1, 32*5+128(%rax)
+ vpaddd %ymm5, %ymm3, %ymm9
+ vpcmpeqd %ymm4, %ymm3, %ymm3
+ vmovdqa %ymm2, 32*6+128(%rax)
+ vpaddd %ymm5, %ymm8, %ymm10
+ vpcmpeqd %ymm4, %ymm8, %ymm8
+ vmovdqa %ymm3, 32*7+128(%rax)
+ vpaddd %ymm5, %ymm9, %ymm11
+ vpcmpeqd %ymm4, %ymm9, %ymm9
+ vpaddd %ymm5, %ymm10, %ymm12
+ vpcmpeqd %ymm4, %ymm10, %ymm10
+ vpaddd %ymm5, %ymm11, %ymm13
+ vpcmpeqd %ymm4, %ymm11, %ymm11
+ vpaddd %ymm5, %ymm12, %ymm14
+ vpcmpeqd %ymm4, %ymm12, %ymm12
+ vpaddd %ymm5, %ymm13, %ymm15
+ vpcmpeqd %ymm4, %ymm13, %ymm13
+ vpcmpeqd %ymm4, %ymm14, %ymm14
+ vpcmpeqd %ymm4, %ymm15, %ymm15
+
+ vmovdqa -32(%r10),%ymm7 # .Lgather_permd
+ lea 128($inp), $inp
+ mov \$9,$power
+
+.Loop_gather_1024:
+ vmovdqa 32*0-128($inp), %ymm0
+ vmovdqa 32*1-128($inp), %ymm1
+ vmovdqa 32*2-128($inp), %ymm2
+ vmovdqa 32*3-128($inp), %ymm3
+ vpand 32*0+128(%rax), %ymm0, %ymm0
+ vpand 32*1+128(%rax), %ymm1, %ymm1
+ vpand 32*2+128(%rax), %ymm2, %ymm2
+ vpor %ymm0, %ymm1, %ymm4
+ vpand 32*3+128(%rax), %ymm3, %ymm3
+ vmovdqa 32*4-128($inp), %ymm0
+ vmovdqa 32*5-128($inp), %ymm1
+ vpor %ymm2, %ymm3, %ymm5
+ vmovdqa 32*6-128($inp), %ymm2
+ vmovdqa 32*7-128($inp), %ymm3
+ vpand 32*4+128(%rax), %ymm0, %ymm0
+ vpand 32*5+128(%rax), %ymm1, %ymm1
+ vpand 32*6+128(%rax), %ymm2, %ymm2
+ vpor %ymm0, %ymm4, %ymm4
+ vpand 32*7+128(%rax), %ymm3, %ymm3
+ vpand 32*8-128($inp), %ymm8, %ymm0
+ vpor %ymm1, %ymm5, %ymm5
+ vpand 32*9-128($inp), %ymm9, %ymm1
+ vpor %ymm2, %ymm4, %ymm4
+ vpand 32*10-128($inp),%ymm10, %ymm2
+ vpor %ymm3, %ymm5, %ymm5
+ vpand 32*11-128($inp),%ymm11, %ymm3
+ vpor %ymm0, %ymm4, %ymm4
+ vpand 32*12-128($inp),%ymm12, %ymm0
+ vpor %ymm1, %ymm5, %ymm5
+ vpand 32*13-128($inp),%ymm13, %ymm1
+ vpor %ymm2, %ymm4, %ymm4
+ vpand 32*14-128($inp),%ymm14, %ymm2
+ vpor %ymm3, %ymm5, %ymm5
+ vpand 32*15-128($inp),%ymm15, %ymm3
+ lea 32*16($inp), $inp
+ vpor %ymm0, %ymm4, %ymm4
+ vpor %ymm1, %ymm5, %ymm5
+ vpor %ymm2, %ymm4, %ymm4
+ vpor %ymm3, %ymm5, %ymm5
+
+ vpor %ymm5, %ymm4, %ymm4
+ vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared
+ vpor %xmm4, %xmm5, %xmm5
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqu %ymm5,($out)
+ lea 32($out),$out
+ dec $power
+ jnz .Loop_gather_1024
+
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm0,($out)
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -0xa8(%r11),%xmm6
+ movaps -0x98(%r11),%xmm7
+ movaps -0x88(%r11),%xmm8
+ movaps -0x78(%r11),%xmm9
+ movaps -0x68(%r11),%xmm10
+ movaps -0x58(%r11),%xmm11
+ movaps -0x48(%r11),%xmm12
+ movaps -0x38(%r11),%xmm13
+ movaps -0x28(%r11),%xmm14
+ movaps -0x18(%r11),%xmm15
+.LSEH_end_rsaz_1024_gather5:
+___
+$code.=<<___;
+ lea (%r11),%rsp
+ ret
+.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
+___
+}
+
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl rsaz_avx2_eligible
+.type rsaz_avx2_eligible,\@abi-omnipotent
+.align 32
+rsaz_avx2_eligible:
+ mov OPENSSL_ia32cap_P+8(%rip),%eax
+___
+$code.=<<___ if ($addx);
+ mov \$`1<<8|1<<19`,%ecx
+ mov \$0,%edx
+ and %eax,%ecx
+ cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
+ cmove %edx,%eax
+___
+$code.=<<___;
+ and \$`1<<5`,%eax
+ shr \$5,%eax
+ ret
+.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.align 64
+.Land_mask:
+ .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
+.Lscatter_permd:
+ .long 0,2,4,6,7,7,7,7
+.Lgather_permd:
+ .long 0,7,1,7,2,7,3,7
+.Linc:
+ .long 0,0,0,0, 1,1,1,1
+ .long 2,2,2,2, 3,3,3,3
+ .long 4,4,4,4, 4,4,4,4
+.align 64
+___
+
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___
+.extern __imp_RtlVirtualUnwind
+.type rsaz_se_handler,\@abi-omnipotent
+.align 16
+rsaz_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 160($context),%rax # pull context->Rbp
+
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ mov %r15,240($context)
+ mov %r14,232($context)
+ mov %r13,224($context)
+ mov %r12,216($context)
+ mov %rbp,160($context)
+ mov %rbx,144($context)
+
+ lea -0xd8(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size rsaz_se_handler,.-rsaz_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_rsaz_1024_sqr_avx2
+ .rva .LSEH_end_rsaz_1024_sqr_avx2
+ .rva .LSEH_info_rsaz_1024_sqr_avx2
+
+ .rva .LSEH_begin_rsaz_1024_mul_avx2
+ .rva .LSEH_end_rsaz_1024_mul_avx2
+ .rva .LSEH_info_rsaz_1024_mul_avx2
+
+ .rva .LSEH_begin_rsaz_1024_gather5
+ .rva .LSEH_end_rsaz_1024_gather5
+ .rva .LSEH_info_rsaz_1024_gather5
+.section .xdata
+.align 8
+.LSEH_info_rsaz_1024_sqr_avx2:
+ .byte 9,0,0,0
+ .rva rsaz_se_handler
+ .rva .Lsqr_1024_body,.Lsqr_1024_epilogue
+.LSEH_info_rsaz_1024_mul_avx2:
+ .byte 9,0,0,0
+ .rva rsaz_se_handler
+ .rva .Lmul_1024_body,.Lmul_1024_epilogue
+.LSEH_info_rsaz_1024_gather5:
+ .byte 0x01,0x36,0x17,0x0b
+ .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
+ .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
+ .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
+ .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
+ .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
+ .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
+ .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
+ .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
+ .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
+ .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
+ .byte 0x00,0xb3,0x00,0x00 # set_frame r11
+___
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
+
+ s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
+ s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
+ print $_,"\n";
+}
+
+}}} else {{{
+print <<___; # assembler is too old
+.text
+
+.globl rsaz_avx2_eligible
+.type rsaz_avx2_eligible,\@abi-omnipotent
+rsaz_avx2_eligible:
+ xor %eax,%eax
+ ret
+.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.globl rsaz_1024_sqr_avx2
+.globl rsaz_1024_mul_avx2
+.globl rsaz_1024_norm2red_avx2
+.globl rsaz_1024_red2norm_avx2
+.globl rsaz_1024_scatter5_avx2
+.globl rsaz_1024_gather5_avx2
+.type rsaz_1024_sqr_avx2,\@abi-omnipotent
+rsaz_1024_sqr_avx2:
+rsaz_1024_mul_avx2:
+rsaz_1024_norm2red_avx2:
+rsaz_1024_red2norm_avx2:
+rsaz_1024_scatter5_avx2:
+rsaz_1024_gather5_avx2:
+ .byte 0x0f,0x0b # ud2
+ ret
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+___
+}}}
+
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/rsaz-x86_64.pl b/openssl-1.1.0h/crypto/bn/asm/rsaz-x86_64.pl
new file mode 100755
index 0000000..6f3b664
--- /dev/null
+++ b/openssl-1.1.0h/crypto/bn/asm/rsaz-x86_64.pl
@@ -0,0 +1,2358 @@
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+##############################################################################
+# #
+# Copyright (c) 2012, Intel Corporation #
+# #
+# All rights reserved. #
+# #
+# Redistribution and use in source and binary forms, with or without #
+# modification, are permitted provided that the following conditions are #
+# met: #
+# #
+# * Redistributions of source code must retain the above copyright #
+# notice, this list of conditions and the following disclaimer. #
+# #
+# * Redistributions in binary form must reproduce the above copyright #
+# notice, this list of conditions and the following disclaimer in the #
+# documentation and/or other materials provided with the #
+# distribution. #
+# #
+# * Neither the name of the Intel Corporation nor the names of its #
+# contributors may be used to endorse or promote products derived from #
+# this software without specific prior written permission. #
+# #
+# #
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
+# #
+##############################################################################
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Architecture Group, Microprocessor and Chipset Development, #
+# Israel Development Center, Haifa, Israel #
+# (2) University of Haifa #
+##############################################################################
+# Reference: #
+# [1] S. Gueron, "Efficient Software Implementations of Modular #
+# Exponentiation", http://eprint.iacr.org/2011/239 #
+# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
+# IEEE Proceedings of 9th International Conference on Information #
+# Technology: New Generations (ITNG 2012), 821-823 (2012). #
+# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
+# Journal of Cryptographic Engineering 2:31-43 (2012). #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
+# resistant 512-bit and 1024-bit modular exponentiation for optimizing #
+# RSA1024 and RSA2048 on x86_64 platforms", #
+# http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
+##############################################################################
+
+# While original submission covers 512- and 1024-bit exponentiation,
+# this module is limited to 512-bit version only (and as such
+# accelerates RSA1024 sign). This is because improvement for longer
+# keys is not high enough to justify the effort, highest measured
+# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
+# for the moment of this writing!] Nor does this module implement
+# "monolithic" complete exponentiation jumbo-subroutine, but adheres
+# to more modular mixture of C and assembly. And it's optimized even
+# for processors other than Intel Core family (see table below for
+# improvement coefficients).
+# <appro@openssl.org>
+#
+# RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
+# ----------------+---------------------------
+# Opteron +13% |+5% +20%
+# Bulldozer -0% |-1% +10%
+# P4 +11% |+7% +8%
+# Westmere +5% |+14% +17%
+# Sandy Bridge +2% |+12% +29%
+# Ivy Bridge +1% |+11% +35%
+# Haswell(**) -0% |+12% +39%
+# Atom +13% |+11% +4%
+# VIA Nano +70% |+9% +25%
+#
+# (*) rsax engine and fips numbers are presented for reference
+# purposes;
+# (**) MULX was attempted, but found to give only marginal improvement;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $addx = ($1>=12);
+}
+
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $addx = ($ver>=3.03);
+}
+
+($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
+{
+my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
+
+$code.=<<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl rsaz_512_sqr
+.type rsaz_512_sqr,\@function,5
+.align 32
+rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ subq \$128+24, %rsp
+.Lsqr_body:
+ movq $mod, %rbp # common argument
+ movq ($inp), %rdx
+ movq 8($inp), %rax
+ movq $n0, 128(%rsp)
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl \$0x80100,%r11d # check for MULX and ADO/CX
+ je .Loop_sqrx
+___
+$code.=<<___;
+ jmp .Loop_sqr
+
+.align 32
+.Loop_sqr:
+ movl $times,128+8(%rsp)
+#first iteration
+ movq %rdx, %rbx
+ mulq %rdx
+ movq %rax, %r8
+ movq 16($inp), %rax
+ movq %rdx, %r9
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 24($inp), %rax
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 32($inp), %rax
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 40($inp), %rax
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 48($inp), %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 56($inp), %rax
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ addq %rax, %r14
+ movq %rbx, %rax
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ addq %r8, %r8 #shlq \$1, %r8
+ movq %r9, %rcx
+ adcq %r9, %r9 #shld \$1, %r8, %r9
+
+ mulq %rax
+ movq %rax, (%rsp)
+ addq %rdx, %r8
+ adcq \$0, %r9
+
+ movq %r8, 8(%rsp)
+ shrq \$63, %rcx
+
+#second iteration
+ movq 8($inp), %r8
+ movq 16($inp), %rax
+ mulq %r8
+ addq %rax, %r10
+ movq 24($inp), %rax
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r11
+ movq 32($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r11
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r12
+ movq 40($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r12
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r13
+ movq 48($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r13
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r14
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r14
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r8
+ addq %rax, %r15
+ movq %r8, %rax
+ adcq \$0, %rdx
+ addq %rbx, %r15
+ movq %rdx, %r8
+ movq %r10, %rdx
+ adcq \$0, %r8
+
+ add %rdx, %rdx
+ lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
+ movq %r11, %rbx
+ adcq %r11, %r11 #shld \$1, %r10, %r11
+
+ mulq %rax
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq \$0, %r11
+
+ movq %r9, 16(%rsp)
+ movq %r10, 24(%rsp)
+ shrq \$63, %rbx
+
+#third iteration
+ movq 16($inp), %r9
+ movq 24($inp), %rax
+ mulq %r9
+ addq %rax, %r12
+ movq 32($inp), %rax
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ addq %rax, %r13
+ movq 40($inp), %rax
+ adcq \$0, %rdx
+ addq %rcx, %r13
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ addq %rax, %r14
+ movq 48($inp), %rax
+ adcq \$0, %rdx
+ addq %rcx, %r14
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ movq %r12, %r10
+ lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
+ addq %rax, %r15
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ addq %rcx, %r15
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r9
+ shrq \$63, %r10
+ addq %rax, %r8
+ movq %r9, %rax
+ adcq \$0, %rdx
+ addq %rcx, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ movq %r13, %rcx
+ leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
+
+ mulq %rax
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq \$0, %r13
+
+ movq %r11, 32(%rsp)
+ movq %r12, 40(%rsp)
+ shrq \$63, %rcx
+
+#fourth iteration
+ movq 24($inp), %r10
+ movq 32($inp), %rax
+ mulq %r10
+ addq %rax, %r14
+ movq 40($inp), %rax
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r10
+ addq %rax, %r15
+ movq 48($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r15
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r10
+ movq %r14, %r12
+ leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
+ addq %rax, %r8
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ addq %rbx, %r8
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r10
+ shrq \$63, %r12
+ addq %rax, %r9
+ movq %r10, %rax
+ adcq \$0, %rdx
+ addq %rbx, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ movq %r15, %rbx
+ leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
+
+ mulq %rax
+ addq %rax, %r13
+ adcq %rdx, %r14
+ adcq \$0, %r15
+
+ movq %r13, 48(%rsp)
+ movq %r14, 56(%rsp)
+ shrq \$63, %rbx
+
+#fifth iteration
+ movq 32($inp), %r11
+ movq 40($inp), %rax
+ mulq %r11
+ addq %rax, %r8
+ movq 48($inp), %rax
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r11
+ addq %rax, %r9
+ movq 56($inp), %rax
+ adcq \$0, %rdx
+ movq %r8, %r12
+ leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
+ addq %rcx, %r9
+ movq %rdx, %rcx
+ adcq \$0, %rcx
+
+ mulq %r11
+ shrq \$63, %r12
+ addq %rax, %r10
+ movq %r11, %rax
+ adcq \$0, %rdx
+ addq %rcx, %r10
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ movq %r9, %rcx
+ leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
+
+ mulq %rax
+ addq %rax, %r15
+ adcq %rdx, %r8
+ adcq \$0, %r9
+
+ movq %r15, 64(%rsp)
+ movq %r8, 72(%rsp)
+ shrq \$63, %rcx
+
+#sixth iteration
+ movq 40($inp), %r12
+ movq 48($inp), %rax
+ mulq %r12
+ addq %rax, %r10
+ movq 56($inp), %rax
+ movq %rdx, %rbx
+ adcq \$0, %rbx
+
+ mulq %r12
+ addq %rax, %r11
+ movq %r12, %rax
+ movq %r10, %r15
+ leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
+ adcq \$0, %rdx
+ shrq \$63, %r15
+ addq %rbx, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ movq %r11, %rbx
+ leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
+
+ mulq %rax
+ addq %rax, %r9
+ adcq %rdx, %r10
+ adcq \$0, %r11
+
+ movq %r9, 80(%rsp)
+ movq %r10, 88(%rsp)
+
+#seventh iteration
+ movq 48($inp), %r13
+ movq 56($inp), %rax
+ mulq %r13
+ addq %rax, %r12
+ movq %r13, %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ xorq %r14, %r14
+ shlq \$1, %rbx
+ adcq %r12, %r12 #shld \$1, %rbx, %r12
+ adcq %r13, %r13 #shld \$1, %r12, %r13
+ adcq %r14, %r14 #shld \$1, %r13, %r14
+
+ mulq %rax
+ addq %rax, %r11
+ adcq %rdx, %r12
+ adcq \$0, %r13
+
+ movq %r11, 96(%rsp)
+ movq %r12, 104(%rsp)
+
+#eighth iteration
+ movq 56($inp), %rax
+ mulq %rax
+ addq %rax, %r13
+ adcq \$0, %rdx
+
+ addq %rdx, %r14
+
+ movq %r13, 112(%rsp)
+ movq %r14, 120(%rsp)
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ movq %r8, %rdx
+ movq %r9, %rax
+ movl 128+8(%rsp), $times
+ movq $out, $inp
+
+ decl $times
+ jnz .Loop_sqr
+___
+if ($addx) {
+$code.=<<___;
+ jmp .Lsqr_tail
+
+.align 32
+.Loop_sqrx:
+ movl $times,128+8(%rsp)
+ movq $out, %xmm0 # off-load
+ movq %rbp, %xmm1 # off-load
+#first iteration
+ mulx %rax, %r8, %r9
+
+ mulx 16($inp), %rcx, %r10
+ xor %rbp, %rbp # cf=0, of=0
+
+ mulx 24($inp), %rax, %r11
+ adcx %rcx, %r9
+
+ mulx 32($inp), %rcx, %r12
+ adcx %rax, %r10
+
+ mulx 40($inp), %rax, %r13
+ adcx %rcx, %r11
+
+ .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
+ adcx %rax, %r12
+ adcx %rcx, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
+ adcx %rax, %r14
+ adcx %rbp, %r15 # %rbp is 0
+
+ mov %r9, %rcx
+ shld \$1, %r8, %r9
+ shl \$1, %r8
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rdx, %r8
+ mov 8($inp), %rdx
+ adcx %rbp, %r9
+
+ mov %rax, (%rsp)
+ mov %r8, 8(%rsp)
+
+#second iteration
+ mulx 16($inp), %rax, %rbx
+ adox %rax, %r10
+ adcx %rbx, %r11
+
+ .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
+ adox $out, %r11
+ adcx %r8, %r12
+
+ mulx 32($inp), %rax, %rbx
+ adox %rax, %r12
+ adcx %rbx, %r13
+
+ mulx 40($inp), $out, %r8
+ adox $out, %r13
+ adcx %r8, %r14
+
+ .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
+ adox %rax, %r14
+ adcx %rbx, %r15
+
+ .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
+ adox $out, %r15
+ adcx %rbp, %r8
+ adox %rbp, %r8
+
+ mov %r11, %rbx
+ shld \$1, %r10, %r11
+ shld \$1, %rcx, %r10
+
+ xor %ebp,%ebp
+ mulx %rdx, %rax, %rcx
+ mov 16($inp), %rdx
+ adcx %rax, %r9
+ adcx %rcx, %r10
+ adcx %rbp, %r11
+
+ mov %r9, 16(%rsp)
+ .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
+
+#third iteration
+ .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
+ adox $out, %r12
+ adcx %r9, %r13
+
+ mulx 32($inp), %rax, %rcx
+ adox %rax, %r13
+ adcx %rcx, %r14
+
+ mulx 40($inp), $out, %r9
+ adox $out, %r14
+ adcx %r9, %r15
+
+ .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
+ adox %rax, %r15
+ adcx %rcx, %r8
+
+ .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
+ adox $out, %r8
+ adcx %rbp, %r9
+ adox %rbp, %r9
+
+ mov %r13, %rcx
+ shld \$1, %r12, %r13
+ shld \$1, %rbx, %r12
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r11
+ adcx %rdx, %r12
+ mov 24($inp), %rdx
+ adcx %rbp, %r13
+
+ mov %r11, 32(%rsp)
+ .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
+
+#fourth iteration
+ .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
+ adox %rax, %r14
+ adcx %rbx, %r15
+
+ mulx 40($inp), $out, %r10
+ adox $out, %r15
+ adcx %r10, %r8
+
+ mulx 48($inp), %rax, %rbx
+ adox %rax, %r8
+ adcx %rbx, %r9
+
+ mulx 56($inp), $out, %r10
+ adox $out, %r9
+ adcx %rbp, %r10
+ adox %rbp, %r10
+
+ .byte 0x66
+ mov %r15, %rbx
+ shld \$1, %r14, %r15
+ shld \$1, %rcx, %r14
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r13
+ adcx %rdx, %r14
+ mov 32($inp), %rdx
+ adcx %rbp, %r15
+
+ mov %r13, 48(%rsp)
+ mov %r14, 56(%rsp)
+
+#fifth iteration
+ .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
+ adox $out, %r8
+ adcx %r11, %r9
+
+ mulx 48($inp), %rax, %rcx
+ adox %rax, %r9
+ adcx %rcx, %r10
+
+ mulx 56($inp), $out, %r11
+ adox $out, %r10
+ adcx %rbp, %r11
+ adox %rbp, %r11
+
+ mov %r9, %rcx
+ shld \$1, %r8, %r9
+ shld \$1, %rbx, %r8
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r15
+ adcx %rdx, %r8
+ mov 40($inp), %rdx
+ adcx %rbp, %r9
+
+ mov %r15, 64(%rsp)
+ mov %r8, 72(%rsp)
+
+#sixth iteration
+ .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
+ adox %rax, %r10
+ adcx %rbx, %r11
+
+ .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
+ adox $out, %r11
+ adcx %rbp, %r12
+ adox %rbp, %r12
+
+ mov %r11, %rbx
+ shld \$1, %r10, %r11
+ shld \$1, %rcx, %r10
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r9
+ adcx %rdx, %r10
+ mov 48($inp), %rdx
+ adcx %rbp, %r11
+
+ mov %r9, 80(%rsp)
+ mov %r10, 88(%rsp)
+
+#seventh iteration
+ .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
+ adox %rax, %r12
+ adox %rbp, %r13
+
+ xor %r14, %r14
+ shld \$1, %r13, %r14
+ shld \$1, %r12, %r13
+ shld \$1, %rbx, %r12
+
+ xor %ebp, %ebp
+ mulx %rdx, %rax, %rdx
+ adcx %rax, %r11
+ adcx %rdx, %r12
+ mov 56($inp), %rdx
+ adcx %rbp, %r13
+
+ .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
+ .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
+
+#eighth iteration
+ mulx %rdx, %rax, %rdx
+ adox %rax, %r13
+ adox %rbp, %rdx
+
+ .byte 0x66
+ add %rdx, %r14
+
+ movq %r13, 112(%rsp)
+ movq %r14, 120(%rsp)
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq 128(%rsp), %rdx # pull $n0
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ movq %r8, %rdx
+ movq %r9, %rax
+ movl 128+8(%rsp), $times
+ movq $out, $inp
+
+ decl $times
+ jnz .Loop_sqrx
+
+.Lsqr_tail:
+___
+}
+$code.=<<___;
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lsqr_epilogue:
+ ret
+.size rsaz_512_sqr,.-rsaz_512_sqr
+___
+}
+{
+my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
+$code.=<<___;
+.globl rsaz_512_mul
+.type rsaz_512_mul,\@function,5
+.align 32
+rsaz_512_mul:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ subq \$128+24, %rsp
+.Lmul_body:
+ movq $out, %xmm0 # off-load arguments
+ movq $mod, %xmm1
+ movq $n0, 128(%rsp)
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl \$0x80100,%r11d # check for MULX and ADO/CX
+ je .Lmulx
+___
+$code.=<<___;
+ movq ($bp), %rbx # pass b[0]
+ movq $bp, %rbp # pass argument
+ call __rsaz_512_mul
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lmul_tail
+
+.align 32
+.Lmulx:
+ movq $bp, %rbp # pass argument
+ movq ($bp), %rdx # pass b[0]
+ call __rsaz_512_mulx
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq 128(%rsp), %rdx # pull $n0
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+.Lmul_tail:
+___
+$code.=<<___;
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_epilogue:
+ ret
+.size rsaz_512_mul,.-rsaz_512_mul
+___
+}
+{
+my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+$code.=<<___;
+.globl rsaz_512_mul_gather4
+.type rsaz_512_mul_gather4,\@function,6
+.align 32
+rsaz_512_mul_gather4:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ subq \$`128+24+($win64?0xb0:0)`, %rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0xa0(%rsp)
+ movaps %xmm7,0xb0(%rsp)
+ movaps %xmm8,0xc0(%rsp)
+ movaps %xmm9,0xd0(%rsp)
+ movaps %xmm10,0xe0(%rsp)
+ movaps %xmm11,0xf0(%rsp)
+ movaps %xmm12,0x100(%rsp)
+ movaps %xmm13,0x110(%rsp)
+ movaps %xmm14,0x120(%rsp)
+ movaps %xmm15,0x130(%rsp)
+___
+$code.=<<___;
+.Lmul_gather4_body:
+ movd $pwr,%xmm8
+ movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
+ movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
+
+ pshufd \$0,%xmm8,%xmm8 # broadcast $power
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $power
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+ movdqa %xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+ pcmpeqd %xmm8,%xmm7
+
+ movdqa 16*0($bp),%xmm8
+ movdqa 16*1($bp),%xmm9
+ movdqa 16*2($bp),%xmm10
+ movdqa 16*3($bp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4($bp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5($bp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6($bp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7($bp),%xmm15
+ leaq 128($bp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl \$0x80100,%r11d # check for MULX and ADO/CX
+ je .Lmulx_gather
+___
+$code.=<<___;
+ movq %xmm8,%rbx
+
+ movq $n0, 128(%rsp) # off-load arguments
+ movq $out, 128+8(%rsp)
+ movq $mod, 128+16(%rsp)
+
+ movq ($ap), %rax
+ movq 8($ap), %rcx
+ mulq %rbx # 0 iteration
+ movq %rax, (%rsp)
+ movq %rcx, %rax
+ movq %rdx, %r8
+
+ mulq %rbx
+ addq %rax, %r8
+ movq 16($ap), %rax
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 24($ap), %rax
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 32($ap), %rax
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 40($ap), %rax
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 48($ap), %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 56($ap), %rax
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ addq %rax, %r14
+ movq ($ap), %rax
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 8(%rsp), %rdi
+ movl \$7, %ecx
+ jmp .Loop_mul_gather
+
+.align 32
+.Loop_mul_gather:
+ movdqa 16*0(%rbp),%xmm8
+ movdqa 16*1(%rbp),%xmm9
+ movdqa 16*2(%rbp),%xmm10
+ movdqa 16*3(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7(%rbp),%xmm15
+ leaq 128(%rbp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,%rbx
+
+ mulq %rbx
+ addq %rax, %r8
+ movq 8($ap), %rax
+ movq %r8, (%rdi)
+ movq %rdx, %r8
+ adcq \$0, %r8
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 16($ap), %rax
+ adcq \$0, %rdx
+ addq %r9, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 24($ap), %rax
+ adcq \$0, %rdx
+ addq %r10, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 32($ap), %rax
+ adcq \$0, %rdx
+ addq %r11, %r10
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 40($ap), %rax
+ adcq \$0, %rdx
+ addq %r12, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 48($ap), %rax
+ adcq \$0, %rdx
+ addq %r13, %r12
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r14
+ movq 56($ap), %rax
+ adcq \$0, %rdx
+ addq %r14, %r13
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ addq %rax, %r15
+ movq ($ap), %rax
+ adcq \$0, %rdx
+ addq %r15, %r14
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 8(%rdi), %rdi
+
+ decl %ecx
+ jnz .Loop_mul_gather
+
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, 32(%rdi)
+ movq %r13, 40(%rdi)
+ movq %r14, 48(%rdi)
+ movq %r15, 56(%rdi)
+
+ movq 128+8(%rsp), $out
+ movq 128+16(%rsp), %rbp
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lmul_gather_tail
+
+.align 32
+.Lmulx_gather:
+ movq %xmm8,%rdx
+
+ mov $n0, 128(%rsp) # off-load arguments
+ mov $out, 128+8(%rsp)
+ mov $mod, 128+16(%rsp)
+
+ mulx ($ap), %rbx, %r8 # 0 iteration
+ mov %rbx, (%rsp)
+ xor %edi, %edi # cf=0, of=0
+
+ mulx 8($ap), %rax, %r9
+
+ mulx 16($ap), %rbx, %r10
+ adcx %rax, %r8
+
+ mulx 24($ap), %rax, %r11
+ adcx %rbx, %r9
+
+ mulx 32($ap), %rbx, %r12
+ adcx %rax, %r10
+
+ mulx 40($ap), %rax, %r13
+ adcx %rbx, %r11
+
+ mulx 48($ap), %rbx, %r14
+ adcx %rax, %r12
+
+ mulx 56($ap), %rax, %r15
+ adcx %rbx, %r13
+ adcx %rax, %r14
+ .byte 0x67
+ mov %r8, %rbx
+ adcx %rdi, %r15 # %rdi is 0
+
+ mov \$-7, %rcx
+ jmp .Loop_mulx_gather
+
+.align 32
+.Loop_mulx_gather:
+ movdqa 16*0(%rbp),%xmm8
+ movdqa 16*1(%rbp),%xmm9
+ movdqa 16*2(%rbp),%xmm10
+ movdqa 16*3(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7(%rbp),%xmm15
+ leaq 128(%rbp), %rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,%rdx
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
+ adcx %rax, %rbx
+ adox %r9, %r8
+
+ mulx 8($ap), %rax, %r9
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ mulx 16($ap), %rax, %r10
+ adcx %rax, %r9
+ adox %r11, %r10
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
+ adcx %rax, %r10
+ adox %r12, %r11
+
+ mulx 32($ap), %rax, %r12
+ adcx %rax, %r11
+ adox %r13, %r12
+
+ mulx 40($ap), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
+ adcx %rax, %r13
+ .byte 0x67
+ adox %r15, %r14
+
+ mulx 56($ap), %rax, %r15
+ mov %rbx, 64(%rsp,%rcx,8)
+ adcx %rax, %r14
+ adox %rdi, %r15
+ mov %r8, %rbx
+ adcx %rdi, %r15 # cf=0
+
+ inc %rcx # of=0
+ jnz .Loop_mulx_gather
+
+ mov %r8, 64(%rsp)
+ mov %r9, 64+8(%rsp)
+ mov %r10, 64+16(%rsp)
+ mov %r11, 64+24(%rsp)
+ mov %r12, 64+32(%rsp)
+ mov %r13, 64+40(%rsp)
+ mov %r14, 64+48(%rsp)
+ mov %r15, 64+56(%rsp)
+
+ mov 128(%rsp), %rdx # pull arguments
+ mov 128+8(%rsp), $out
+ mov 128+16(%rsp), %rbp
+
+ mov (%rsp), %r8
+ mov 8(%rsp), %r9
+ mov 16(%rsp), %r10
+ mov 24(%rsp), %r11
+ mov 32(%rsp), %r12
+ mov 40(%rsp), %r13
+ mov 48(%rsp), %r14
+ mov 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+
+.Lmul_gather_tail:
+___
+$code.=<<___;
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ leaq 128+24+48(%rsp), %rax
+___
+$code.=<<___ if ($win64);
+ movaps 0xa0-0xc8(%rax),%xmm6
+ movaps 0xb0-0xc8(%rax),%xmm7
+ movaps 0xc0-0xc8(%rax),%xmm8
+ movaps 0xd0-0xc8(%rax),%xmm9
+ movaps 0xe0-0xc8(%rax),%xmm10
+ movaps 0xf0-0xc8(%rax),%xmm11
+ movaps 0x100-0xc8(%rax),%xmm12
+ movaps 0x110-0xc8(%rax),%xmm13
+ movaps 0x120-0xc8(%rax),%xmm14
+ movaps 0x130-0xc8(%rax),%xmm15
+ lea 0xb0(%rax),%rax
+___
+$code.=<<___;
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_gather4_epilogue:
+ ret
+.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
+___
+}
+{
+my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+$code.=<<___;
+.globl rsaz_512_mul_scatter4
+.type rsaz_512_mul_scatter4,\@function,6
+.align 32
+rsaz_512_mul_scatter4:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov $pwr, $pwr
+ subq \$128+24, %rsp
+.Lmul_scatter4_body:
+ leaq ($tbl,$pwr,8), $tbl
+ movq $out, %xmm0 # off-load arguments
+ movq $mod, %xmm1
+ movq $tbl, %xmm2
+ movq $n0, 128(%rsp)
+
+ movq $out, %rbp
+___
+$code.=<<___ if ($addx);
+ movl \$0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl \$0x80100,%r11d # check for MULX and ADO/CX
+ je .Lmulx_scatter
+___
+$code.=<<___;
+ movq ($out),%rbx # pass b[0]
+ call __rsaz_512_mul
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lmul_scatter_tail
+
+.align 32
+.Lmulx_scatter:
+ movq ($out), %rdx # pass b[0]
+ call __rsaz_512_mulx
+
+ movq %xmm0, $out
+ movq %xmm1, %rbp
+
+ movq 128(%rsp), %rdx # pull $n0
+ movq (%rsp), %r8
+ movq 8(%rsp), %r9
+ movq 16(%rsp), %r10
+ movq 24(%rsp), %r11
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ movq 48(%rsp), %r14
+ movq 56(%rsp), %r15
+
+ call __rsaz_512_reducex
+
+.Lmul_scatter_tail:
+___
+$code.=<<___;
+ addq 64(%rsp), %r8
+ adcq 72(%rsp), %r9
+ adcq 80(%rsp), %r10
+ adcq 88(%rsp), %r11
+ adcq 96(%rsp), %r12
+ adcq 104(%rsp), %r13
+ adcq 112(%rsp), %r14
+ adcq 120(%rsp), %r15
+ movq %xmm2, $inp
+ sbbq %rcx, %rcx
+
+ call __rsaz_512_subtract
+
+ movq %r8, 128*0($inp) # scatter
+ movq %r9, 128*1($inp)
+ movq %r10, 128*2($inp)
+ movq %r11, 128*3($inp)
+ movq %r12, 128*4($inp)
+ movq %r13, 128*5($inp)
+ movq %r14, 128*6($inp)
+ movq %r15, 128*7($inp)
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_scatter4_epilogue:
+ ret
+.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
+___
+}
+{
+my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
+$code.=<<___;
+.globl rsaz_512_mul_by_one
+.type rsaz_512_mul_by_one,\@function,4
+.align 32
+rsaz_512_mul_by_one:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ subq \$128+24, %rsp
+.Lmul_by_one_body:
+___
+$code.=<<___ if ($addx);
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+___
+$code.=<<___;
+ movq $mod, %rbp # reassign argument
+ movq $n0, 128(%rsp)
+
+ movq ($inp), %r8
+ pxor %xmm0, %xmm0
+ movq 8($inp), %r9
+ movq 16($inp), %r10
+ movq 24($inp), %r11
+ movq 32($inp), %r12
+ movq 40($inp), %r13
+ movq 48($inp), %r14
+ movq 56($inp), %r15
+
+ movdqa %xmm0, (%rsp)
+ movdqa %xmm0, 16(%rsp)
+ movdqa %xmm0, 32(%rsp)
+ movdqa %xmm0, 48(%rsp)
+ movdqa %xmm0, 64(%rsp)
+ movdqa %xmm0, 80(%rsp)
+ movdqa %xmm0, 96(%rsp)
+___
+$code.=<<___ if ($addx);
+ andl \$0x80100,%eax
+ cmpl \$0x80100,%eax # check for MULX and ADO/CX
+ je .Lby_one_callx
+___
+$code.=<<___;
+ call __rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+ jmp .Lby_one_tail
+.align 32
+.Lby_one_callx:
+ movq 128(%rsp), %rdx # pull $n0
+ call __rsaz_512_reducex
+.Lby_one_tail:
+___
+$code.=<<___;
+ movq %r8, ($out)
+ movq %r9, 8($out)
+ movq %r10, 16($out)
+ movq %r11, 24($out)
+ movq %r12, 32($out)
+ movq %r13, 40($out)
+ movq %r14, 48($out)
+ movq %r15, 56($out)
+
+ leaq 128+24+48(%rsp), %rax
+ movq -48(%rax), %r15
+ movq -40(%rax), %r14
+ movq -32(%rax), %r13
+ movq -24(%rax), %r12
+ movq -16(%rax), %rbp
+ movq -8(%rax), %rbx
+ leaq (%rax), %rsp
+.Lmul_by_one_epilogue:
+ ret
+.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
+___
+}
+{ # __rsaz_512_reduce
+ #
+ # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
+ # output: %r8-%r15
+ # clobbers: everything except %rbp and %rdi
+$code.=<<___;
+.type __rsaz_512_reduce,\@abi-omnipotent
+.align 32
+__rsaz_512_reduce:
+ movq %r8, %rbx
+ imulq 128+8(%rsp), %rbx
+ movq 0(%rbp), %rax
+ movl \$8, %ecx
+ jmp .Lreduction_loop
+
+.align 32
+.Lreduction_loop:
+ mulq %rbx
+ movq 8(%rbp), %rax
+ negq %r8
+ movq %rdx, %r8
+ adcq \$0, %r8
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 16(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r9, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 24(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r10, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 32(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r11, %r10
+ movq 128+8(%rsp), %rsi
+ #movq %rdx, %r11
+ #adcq \$0, %r11
+ adcq \$0, %rdx
+ movq %rdx, %r11
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 40(%rbp), %rax
+ adcq \$0, %rdx
+ imulq %r8, %rsi
+ addq %r12, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 48(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r13, %r12
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r14
+ movq 56(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r14, %r13
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ movq %rsi, %rbx
+ addq %rax, %r15
+ movq 0(%rbp), %rax
+ adcq \$0, %rdx
+ addq %r15, %r14
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ decl %ecx
+ jne .Lreduction_loop
+
+ ret
+.size __rsaz_512_reduce,.-__rsaz_512_reduce
+___
+}
+if ($addx) {
+ # __rsaz_512_reducex
+ #
+ # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
+ # output: %r8-%r15
+ # clobbers: everything except %rbp and %rdi
+$code.=<<___;
+.type __rsaz_512_reducex,\@abi-omnipotent
+.align 32
+__rsaz_512_reducex:
+ #movq 128+8(%rsp), %rdx # pull $n0
+ imulq %r8, %rdx
+ xorq %rsi, %rsi # cf=0,of=0
+ movl \$8, %ecx
+ jmp .Lreduction_loopx
+
+.align 32
+.Lreduction_loopx:
+ mov %r8, %rbx
+ mulx 0(%rbp), %rax, %r8
+ adcx %rbx, %rax
+ adox %r9, %r8
+
+ mulx 8(%rbp), %rax, %r9
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ mulx 16(%rbp), %rbx, %r10
+ adcx %rbx, %r9
+ adox %r11, %r10
+
+ mulx 24(%rbp), %rbx, %r11
+ adcx %rbx, %r10
+ adox %r12, %r11
+
+ .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
+ mov %rdx, %rax
+ mov %r8, %rdx
+ adcx %rbx, %r11
+ adox %r13, %r12
+
+ mulx 128+8(%rsp), %rbx, %rdx
+ mov %rax, %rdx
+
+ mulx 40(%rbp), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
+ adcx %rax, %r13
+ adox %r15, %r14
+
+ mulx 56(%rbp), %rax, %r15
+ mov %rbx, %rdx
+ adcx %rax, %r14
+ adox %rsi, %r15 # %rsi is 0
+ adcx %rsi, %r15 # cf=0
+
+ decl %ecx # of=0
+ jne .Lreduction_loopx
+
+ ret
+.size __rsaz_512_reducex,.-__rsaz_512_reducex
+___
+}
+{ # __rsaz_512_subtract
+ # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
+ # output:
+ # clobbers: everything but %rdi, %rsi and %rbp
+$code.=<<___;
+.type __rsaz_512_subtract,\@abi-omnipotent
+.align 32
+__rsaz_512_subtract:
+ movq %r8, ($out)
+ movq %r9, 8($out)
+ movq %r10, 16($out)
+ movq %r11, 24($out)
+ movq %r12, 32($out)
+ movq %r13, 40($out)
+ movq %r14, 48($out)
+ movq %r15, 56($out)
+
+ movq 0($mod), %r8
+ movq 8($mod), %r9
+ negq %r8
+ notq %r9
+ andq %rcx, %r8
+ movq 16($mod), %r10
+ andq %rcx, %r9
+ notq %r10
+ movq 24($mod), %r11
+ andq %rcx, %r10
+ notq %r11
+ movq 32($mod), %r12
+ andq %rcx, %r11
+ notq %r12
+ movq 40($mod), %r13
+ andq %rcx, %r12
+ notq %r13
+ movq 48($mod), %r14
+ andq %rcx, %r13
+ notq %r14
+ movq 56($mod), %r15
+ andq %rcx, %r14
+ notq %r15
+ andq %rcx, %r15
+
+ addq ($out), %r8
+ adcq 8($out), %r9
+ adcq 16($out), %r10
+ adcq 24($out), %r11
+ adcq 32($out), %r12
+ adcq 40($out), %r13
+ adcq 48($out), %r14
+ adcq 56($out), %r15
+
+ movq %r8, ($out)
+ movq %r9, 8($out)
+ movq %r10, 16($out)
+ movq %r11, 24($out)
+ movq %r12, 32($out)
+ movq %r13, 40($out)
+ movq %r14, 48($out)
+ movq %r15, 56($out)
+
+ ret
+.size __rsaz_512_subtract,.-__rsaz_512_subtract
+___
+}
+{ # __rsaz_512_mul
+ #
+ # input: %rsi - ap, %rbp - bp
+ # output:
+ # clobbers: everything
+my ($ap,$bp) = ("%rsi","%rbp");
+$code.=<<___;
+.type __rsaz_512_mul,\@abi-omnipotent
+.align 32
+__rsaz_512_mul:
+ leaq 8(%rsp), %rdi
+
+ movq ($ap), %rax
+ mulq %rbx
+ movq %rax, (%rdi)
+ movq 8($ap), %rax
+ movq %rdx, %r8
+
+ mulq %rbx
+ addq %rax, %r8
+ movq 16($ap), %rax
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 24($ap), %rax
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 32($ap), %rax
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 40($ap), %rax
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 48($ap), %rax
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 56($ap), %rax
+ movq %rdx, %r14
+ adcq \$0, %r14
+
+ mulq %rbx
+ addq %rax, %r14
+ movq ($ap), %rax
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 8($bp), $bp
+ leaq 8(%rdi), %rdi
+
+ movl \$7, %ecx
+ jmp .Loop_mul
+
+.align 32
+.Loop_mul:
+ movq ($bp), %rbx
+ mulq %rbx
+ addq %rax, %r8
+ movq 8($ap), %rax
+ movq %r8, (%rdi)
+ movq %rdx, %r8
+ adcq \$0, %r8
+
+ mulq %rbx
+ addq %rax, %r9
+ movq 16($ap), %rax
+ adcq \$0, %rdx
+ addq %r9, %r8
+ movq %rdx, %r9
+ adcq \$0, %r9
+
+ mulq %rbx
+ addq %rax, %r10
+ movq 24($ap), %rax
+ adcq \$0, %rdx
+ addq %r10, %r9
+ movq %rdx, %r10
+ adcq \$0, %r10
+
+ mulq %rbx
+ addq %rax, %r11
+ movq 32($ap), %rax
+ adcq \$0, %rdx
+ addq %r11, %r10
+ movq %rdx, %r11
+ adcq \$0, %r11
+
+ mulq %rbx
+ addq %rax, %r12
+ movq 40($ap), %rax
+ adcq \$0, %rdx
+ addq %r12, %r11
+ movq %rdx, %r12
+ adcq \$0, %r12
+
+ mulq %rbx
+ addq %rax, %r13
+ movq 48($ap), %rax
+ adcq \$0, %rdx
+ addq %r13, %r12
+ movq %rdx, %r13
+ adcq \$0, %r13
+
+ mulq %rbx
+ addq %rax, %r14
+ movq 56($ap), %rax
+ adcq \$0, %rdx
+ addq %r14, %r13
+ movq %rdx, %r14
+ leaq 8($bp), $bp
+ adcq \$0, %r14
+
+ mulq %rbx
+ addq %rax, %r15
+ movq ($ap), %rax
+ adcq \$0, %rdx
+ addq %r15, %r14
+ movq %rdx, %r15
+ adcq \$0, %r15
+
+ leaq 8(%rdi), %rdi
+
+ decl %ecx
+ jnz .Loop_mul
+
+ movq %r8, (%rdi)
+ movq %r9, 8(%rdi)
+ movq %r10, 16(%rdi)
+ movq %r11, 24(%rdi)
+ movq %r12, 32(%rdi)
+ movq %r13, 40(%rdi)
+ movq %r14, 48(%rdi)
+ movq %r15, 56(%rdi)
+
+ ret
+.size __rsaz_512_mul,.-__rsaz_512_mul
+___
+}
+if ($addx) {
+ # __rsaz_512_mulx
+ #
+ # input: %rsi - ap, %rbp - bp
+ # output:
+ # clobbers: everything
+my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
+$code.=<<___;
+.type __rsaz_512_mulx,\@abi-omnipotent
+.align 32
+__rsaz_512_mulx:
+ mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
+ mov \$-6, %rcx
+
+ mulx 8($ap), %rax, %r9
+ movq %rbx, 8(%rsp)
+
+ mulx 16($ap), %rbx, %r10
+ adc %rax, %r8
+
+ mulx 24($ap), %rax, %r11
+ adc %rbx, %r9
+
+ mulx 32($ap), %rbx, %r12
+ adc %rax, %r10
+
+ mulx 40($ap), %rax, %r13
+ adc %rbx, %r11
+
+ mulx 48($ap), %rbx, %r14
+ adc %rax, %r12
+
+ mulx 56($ap), %rax, %r15
+ mov 8($bp), %rdx
+ adc %rbx, %r13
+ adc %rax, %r14
+ adc \$0, %r15
+
+ xor $zero, $zero # cf=0,of=0
+ jmp .Loop_mulx
+
+.align 32
+.Loop_mulx:
+ movq %r8, %rbx
+ mulx ($ap), %rax, %r8
+ adcx %rax, %rbx
+ adox %r9, %r8
+
+ mulx 8($ap), %rax, %r9
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ mulx 16($ap), %rax, %r10
+ adcx %rax, %r9
+ adox %r11, %r10
+
+ mulx 24($ap), %rax, %r11
+ adcx %rax, %r10
+ adox %r12, %r11
+
+ .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
+ adcx %rax, %r11
+ adox %r13, %r12
+
+ mulx 40($ap), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ mulx 48($ap), %rax, %r14
+ adcx %rax, %r13
+ adox %r15, %r14
+
+ mulx 56($ap), %rax, %r15
+ movq 64($bp,%rcx,8), %rdx
+ movq %rbx, 8+64-8(%rsp,%rcx,8)
+ adcx %rax, %r14
+ adox $zero, %r15
+ adcx $zero, %r15 # cf=0
+
+ inc %rcx # of=0
+ jnz .Loop_mulx
+
+ movq %r8, %rbx
+ mulx ($ap), %rax, %r8
+ adcx %rax, %rbx
+ adox %r9, %r8
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
+ adcx %rax, %r8
+ adox %r10, %r9
+
+ .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
+ adcx %rax, %r9
+ adox %r11, %r10
+
+ mulx 24($ap), %rax, %r11
+ adcx %rax, %r10
+ adox %r12, %r11
+
+ mulx 32($ap), %rax, %r12
+ adcx %rax, %r11
+ adox %r13, %r12
+
+ mulx 40($ap), %rax, %r13
+ adcx %rax, %r12
+ adox %r14, %r13
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
+ adcx %rax, %r13
+ adox %r15, %r14
+
+ .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
+ adcx %rax, %r14
+ adox $zero, %r15
+ adcx $zero, %r15
+
+ mov %rbx, 8+64-8(%rsp)
+ mov %r8, 8+64(%rsp)
+ mov %r9, 8+64+8(%rsp)
+ mov %r10, 8+64+16(%rsp)
+ mov %r11, 8+64+24(%rsp)
+ mov %r12, 8+64+32(%rsp)
+ mov %r13, 8+64+40(%rsp)
+ mov %r14, 8+64+48(%rsp)
+ mov %r15, 8+64+56(%rsp)
+
+ ret
+.size __rsaz_512_mulx,.-__rsaz_512_mulx
+___
+}
+{
+my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
+$code.=<<___;
+.globl rsaz_512_scatter4
+.type rsaz_512_scatter4,\@abi-omnipotent
+.align 16
+rsaz_512_scatter4:
+ leaq ($out,$power,8), $out
+ movl \$8, %r9d
+ jmp .Loop_scatter
+.align 16
+.Loop_scatter:
+ movq ($inp), %rax
+ leaq 8($inp), $inp
+ movq %rax, ($out)
+ leaq 128($out), $out
+ decl %r9d
+ jnz .Loop_scatter
+ ret
+.size rsaz_512_scatter4,.-rsaz_512_scatter4
+
+.globl rsaz_512_gather4
+.type rsaz_512_gather4,\@abi-omnipotent
+.align 16
+rsaz_512_gather4:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_rsaz_512_gather4:
+ .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
+ .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
+ .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
+ .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
+ .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
+ .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
+ .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
+ .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
+ .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
+ .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
+ .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
+___
+$code.=<<___;
+ movd $power,%xmm8
+ movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
+ movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
+
+ pshufd \$0,%xmm8,%xmm8 # broadcast $power
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $power
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+ movdqa %xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+ paddd %xmm`$i`,%xmm`$i+1`
+ pcmpeqd %xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+ pcmpeqd %xmm8,%xmm7
+ movl \$8, %r9d
+ jmp .Loop_gather
+.align 16
+.Loop_gather:
+ movdqa 16*0($inp),%xmm8
+ movdqa 16*1($inp),%xmm9
+ movdqa 16*2($inp),%xmm10
+ movdqa 16*3($inp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 16*4($inp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 16*5($inp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 16*6($inp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 16*7($inp),%xmm15
+ leaq 128($inp), $inp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd \$0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,($out)
+ leaq 8($out), $out
+ decl %r9d
+ jnz .Loop_gather
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ add \$0xa8,%rsp
+___
+$code.=<<___;
+ ret
+.LSEH_end_rsaz_512_gather4:
+.size rsaz_512_gather4,.-rsaz_512_gather4
+
+.align 64
+.Linc:
+ .long 0,0, 1,1
+ .long 2,2, 2,2
+___
+}
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 128+24+48(%rax),%rax
+
+ lea .Lmul_gather4_epilogue(%rip),%rbx
+ cmp %r10,%rbx
+ jne .Lse_not_in_mul_gather4
+
+ lea 0xb0(%rax),%rax
+
+ lea -48-0xa8(%rax),%rsi
+ lea 512($context),%rdi
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lse_not_in_mul_gather4:
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_rsaz_512_sqr
+ .rva .LSEH_end_rsaz_512_sqr
+ .rva .LSEH_info_rsaz_512_sqr
+
+ .rva .LSEH_begin_rsaz_512_mul
+ .rva .LSEH_end_rsaz_512_mul
+ .rva .LSEH_info_rsaz_512_mul
+
+ .rva .LSEH_begin_rsaz_512_mul_gather4
+ .rva .LSEH_end_rsaz_512_mul_gather4
+ .rva .LSEH_info_rsaz_512_mul_gather4
+
+ .rva .LSEH_begin_rsaz_512_mul_scatter4
+ .rva .LSEH_end_rsaz_512_mul_scatter4
+ .rva .LSEH_info_rsaz_512_mul_scatter4
+
+ .rva .LSEH_begin_rsaz_512_mul_by_one
+ .rva .LSEH_end_rsaz_512_mul_by_one
+ .rva .LSEH_info_rsaz_512_mul_by_one
+
+ .rva .LSEH_begin_rsaz_512_gather4
+ .rva .LSEH_end_rsaz_512_gather4
+ .rva .LSEH_info_rsaz_512_gather4
+
+.section .xdata
+.align 8
+.LSEH_info_rsaz_512_sqr:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul_gather4:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul_scatter4:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_mul_by_one:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
+.LSEH_info_rsaz_512_gather4:
+ .byte 0x01,0x46,0x16,0x00
+ .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
+ .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
+ .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
+ .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
+ .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
+ .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
+ .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
+ .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
+ .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
+ .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
+ .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/openssl-1.1.0h/crypto/bn/asm/s390x-gf2m.pl b/openssl-1.1.0h/crypto/bn/asm/s390x-gf2m.pl
new file mode 100644