From 535bc3d6e20975d23a7b0f82939a3e028481867d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?= Date: Thu, 9 Feb 2023 19:53:30 +0100 Subject: [PATCH 1/2] initial bring-over of "Intel Intelligent Storage Acceleration Library" port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Fülöp --- .../icp/gcm-simd/isa-l_crypto-ported/LICENSE | 26 + .../icp/gcm-simd/isa-l_crypto-ported/README | 18 + .../gcm-simd/isa-l_crypto-ported/gcm128_sse.S | 31 + .../gcm-simd/isa-l_crypto-ported/gcm256_sse.S | 31 + .../isa-l_crypto-ported/gcm_defines.S | 295 +++ .../gcm-simd/isa-l_crypto-ported/gcm_sse.S | 2153 ++++++++++++++++ .../gcm-simd/isa-l_crypto-ported/reg_sizes.S | 224 ++ contrib/icp/gcm-simd/isa-l_crypto/LICENSE | 26 + contrib/icp/gcm-simd/isa-l_crypto/README | 10 + .../icp/gcm-simd/isa-l_crypto/gcm128_sse.asm | 31 + .../icp/gcm-simd/isa-l_crypto/gcm256_sse.asm | 31 + .../icp/gcm-simd/isa-l_crypto/gcm_defines.asm | 291 +++ contrib/icp/gcm-simd/isa-l_crypto/gcm_sse.asm | 2171 +++++++++++++++++ .../icp/gcm-simd/isa-l_crypto/reg_sizes.asm | 459 ++++ .../asm-x86_64/modes/THIRDPARTYLICENSE.intel | 26 + .../modes/THIRDPARTYLICENSE.intel.descrip | 1 + .../icp/asm-x86_64/modes/isalc_gcm128_sse.S | 31 + .../icp/asm-x86_64/modes/isalc_gcm256_sse.S | 31 + .../icp/asm-x86_64/modes/isalc_gcm_defines.S | 293 +++ module/icp/asm-x86_64/modes/isalc_gcm_sse.S | 2150 ++++++++++++++++ module/icp/asm-x86_64/modes/isalc_reg_sizes.S | 221 ++ 21 files changed, 8550 insertions(+) create mode 100644 contrib/icp/gcm-simd/isa-l_crypto-ported/LICENSE create mode 100644 contrib/icp/gcm-simd/isa-l_crypto-ported/README create mode 100644 contrib/icp/gcm-simd/isa-l_crypto-ported/gcm128_sse.S create mode 100644 contrib/icp/gcm-simd/isa-l_crypto-ported/gcm256_sse.S create mode 100644 contrib/icp/gcm-simd/isa-l_crypto-ported/gcm_defines.S create mode 100644 contrib/icp/gcm-simd/isa-l_crypto-ported/gcm_sse.S create mode 100644 contrib/icp/gcm-simd/isa-l_crypto-ported/reg_sizes.S create mode 100644 contrib/icp/gcm-simd/isa-l_crypto/LICENSE create mode 100644 contrib/icp/gcm-simd/isa-l_crypto/README create mode 100644 contrib/icp/gcm-simd/isa-l_crypto/gcm128_sse.asm create mode 100644 contrib/icp/gcm-simd/isa-l_crypto/gcm256_sse.asm create mode 100644 contrib/icp/gcm-simd/isa-l_crypto/gcm_defines.asm create mode 100644 contrib/icp/gcm-simd/isa-l_crypto/gcm_sse.asm create mode 100644 contrib/icp/gcm-simd/isa-l_crypto/reg_sizes.asm create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.intel create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.intel.descrip create mode 100644 module/icp/asm-x86_64/modes/isalc_gcm128_sse.S create mode 100644 module/icp/asm-x86_64/modes/isalc_gcm256_sse.S create mode 100644 module/icp/asm-x86_64/modes/isalc_gcm_defines.S create mode 100644 module/icp/asm-x86_64/modes/isalc_gcm_sse.S create mode 100644 module/icp/asm-x86_64/modes/isalc_reg_sizes.S diff --git a/contrib/icp/gcm-simd/isa-l_crypto-ported/LICENSE b/contrib/icp/gcm-simd/isa-l_crypto-ported/LICENSE new file mode 100644 index 000000000000..ecebef110b46 --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto-ported/LICENSE @@ -0,0 +1,26 @@ + Copyright(c) 2011-2017 Intel Corporation All rights reserved. 
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/icp/gcm-simd/isa-l_crypto-ported/README b/contrib/icp/gcm-simd/isa-l_crypto-ported/README
new file mode 100644
index 000000000000..219d427c845e
--- /dev/null
+++ b/contrib/icp/gcm-simd/isa-l_crypto-ported/README
@@ -0,0 +1,18 @@
+This directory contains the ported "Intel(R) Intelligent Storage Acceleration
+Library Crypto Version" [1] GCM x86-64 assembly files [2]. They were adapted
+for the GNU assembler and translated to AT&T syntax. The latter was necessary to
+support LLVM clang's integrated assembler. It was verified that the ported
+versions still pass the GCM tests in the isa-l_crypto source tree. The original
+files can be found in the isa-l_crypto directory one level up.
+
+The ported assembler files were then further adapted to be used within the
+ICP.
+
+The main purpose of including these files (and the original ones) here is to
+serve as a reference if upstream changes need to be applied to the files
+included and modified in the ICP. They could also be used by other projects
+depending on the GNU or LLVM assemblers as a starting point.
+
+
+[1] https://github.com/intel/isa-l_crypto
+[2] https://github.com/intel/isa-l_crypto/tree/v2.24.0/aes
diff --git a/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm128_sse.S b/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm128_sse.S
new file mode 100644
index 000000000000..6b6422291dc2
--- /dev/null
+++ b/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm128_sse.S
@@ -0,0 +1,31 @@
+//#######################################################################
+// Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, +// DATA, OR PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//####################################################################### + +#define GCM128_MODE 1 +#include "gcm_sse_att.S" diff --git a/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm256_sse.S b/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm256_sse.S new file mode 100644 index 000000000000..31781f598ced --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm256_sse.S @@ -0,0 +1,31 @@ +////////////////////////////////////////////////////////////////////////// +// Copyright(c) 2011-2016 Intel Corporation All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES// LOSS OF USE, +// DATA, OR PROFITS// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+////////////////////////////////////////////////////////////////////////// + +#define GCM256_MODE 1 +#include "gcm_sse_att.S" diff --git a/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm_defines.S b/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm_defines.S new file mode 100644 index 000000000000..12a74bbe084a --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm_defines.S @@ -0,0 +1,295 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright(c) 2011-2016 Intel Corporation All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES// LOSS OF USE, +// DATA, OR PROFITS// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef GCM_DEFINES_ASM_INCLUDED +#define GCM_DEFINES_ASM_INCLUDED + +// +// Authors: +// Erdinc Ozturk +// Vinodh Gopal +// James Guilford + +// Port to GNU as and translation to GNU as att-syntax +// Copyright(c) 2023 Attila Fülöp + +//////////// + +.section .rodata + +.balign 16 +POLY: .quad 0x0000000000000001, 0xC200000000000000 + +// unused for sse +.balign 64 +POLY2: .quad 0x00000001C2000000, 0xC200000000000000 + .quad 0x00000001C2000000, 0xC200000000000000 + .quad 0x00000001C2000000, 0xC200000000000000 + .quad 0x00000001C2000000, 0xC200000000000000 +.balign 16 +TWOONE: .quad 0x0000000000000001, 0x0000000100000000 + +// order of these constants should not change. 
+// more specifically, ALL_F should follow SHIFT_MASK, and ZERO should +// follow ALL_F + +.balign 64 +SHUF_MASK: .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 + .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 + .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 + .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 + +SHIFT_MASK: .quad 0x0706050403020100, 0x0f0e0d0c0b0a0908 +ALL_F: .quad 0xffffffffffffffff, 0xffffffffffffffff +ZERO: .quad 0x0000000000000000, 0x0000000000000000 // unused for sse +ONE: .quad 0x0000000000000001, 0x0000000000000000 +TWO: .quad 0x0000000000000002, 0x0000000000000000 // unused for sse +ONEf: .quad 0x0000000000000000, 0x0100000000000000 +TWOf: .quad 0x0000000000000000, 0x0200000000000000 // unused for sse + +// Below unused for sse +.balign 64 +ddq_add_1234: + .quad 0x0000000000000001, 0x0000000000000000 + .quad 0x0000000000000002, 0x0000000000000000 + .quad 0x0000000000000003, 0x0000000000000000 + .quad 0x0000000000000004, 0x0000000000000000 + +.balign 64 +ddq_add_5678: + .quad 0x0000000000000005, 0x0000000000000000 + .quad 0x0000000000000006, 0x0000000000000000 + .quad 0x0000000000000007, 0x0000000000000000 + .quad 0x0000000000000008, 0x0000000000000000 + +.balign 64 +ddq_add_4444: + .quad 0x0000000000000004, 0x0000000000000000 + .quad 0x0000000000000004, 0x0000000000000000 + .quad 0x0000000000000004, 0x0000000000000000 + .quad 0x0000000000000004, 0x0000000000000000 + +.balign 64 +ddq_add_8888: + .quad 0x0000000000000008, 0x0000000000000000 + .quad 0x0000000000000008, 0x0000000000000000 + .quad 0x0000000000000008, 0x0000000000000000 + .quad 0x0000000000000008, 0x0000000000000000 + +.balign 64 +ddq_addbe_1234: + .quad 0x0000000000000000, 0x0100000000000000 + .quad 0x0000000000000000, 0x0200000000000000 + .quad 0x0000000000000000, 0x0300000000000000 + .quad 0x0000000000000000, 0x0400000000000000 + +.balign 64 +ddq_addbe_5678: + .quad 0x0000000000000000, 0x0500000000000000 + .quad 0x0000000000000000, 0x0600000000000000 + .quad 0x0000000000000000, 0x0700000000000000 + .quad 0x0000000000000000, 0x0800000000000000 + +.balign 64 +ddq_addbe_4444: + .quad 0x0000000000000000, 0x0400000000000000 + .quad 0x0000000000000000, 0x0400000000000000 + .quad 0x0000000000000000, 0x0400000000000000 + .quad 0x0000000000000000, 0x0400000000000000 + +.balign 64 +ddq_addbe_8888: + .quad 0x0000000000000000, 0x0800000000000000 + .quad 0x0000000000000000, 0x0800000000000000 + .quad 0x0000000000000000, 0x0800000000000000 + .quad 0x0000000000000000, 0x0800000000000000 + +.balign 64 +byte_len_to_mask_table: + .short 0x0000, 0x0001, 0x0003, 0x0007 + .short 0x000f, 0x001f, 0x003f, 0x007f + .short 0x00ff, 0x01ff, 0x03ff, 0x07ff + .short 0x0fff, 0x1fff, 0x3fff, 0x7fff + .short 0xffff + +.balign 64 +byte64_len_to_mask_table: + .quad 0x0000000000000000, 0x0000000000000001 + .quad 0x0000000000000003, 0x0000000000000007 + .quad 0x000000000000000f, 0x000000000000001f + .quad 0x000000000000003f, 0x000000000000007f + .quad 0x00000000000000ff, 0x00000000000001ff + .quad 0x00000000000003ff, 0x00000000000007ff + .quad 0x0000000000000fff, 0x0000000000001fff + .quad 0x0000000000003fff, 0x0000000000007fff + .quad 0x000000000000ffff, 0x000000000001ffff + .quad 0x000000000003ffff, 0x000000000007ffff + .quad 0x00000000000fffff, 0x00000000001fffff + .quad 0x00000000003fffff, 0x00000000007fffff + .quad 0x0000000000ffffff, 0x0000000001ffffff + .quad 0x0000000003ffffff, 0x0000000007ffffff + .quad 0x000000000fffffff, 0x000000001fffffff + .quad 0x000000003fffffff, 0x000000007fffffff + .quad 0x00000000ffffffff, 0x00000001ffffffff + 
.quad 0x00000003ffffffff, 0x00000007ffffffff + .quad 0x0000000fffffffff, 0x0000001fffffffff + .quad 0x0000003fffffffff, 0x0000007fffffffff + .quad 0x000000ffffffffff, 0x000001ffffffffff + .quad 0x000003ffffffffff, 0x000007ffffffffff + .quad 0x00000fffffffffff, 0x00001fffffffffff + .quad 0x00003fffffffffff, 0x00007fffffffffff + .quad 0x0000ffffffffffff, 0x0001ffffffffffff + .quad 0x0003ffffffffffff, 0x0007ffffffffffff + .quad 0x000fffffffffffff, 0x001fffffffffffff + .quad 0x003fffffffffffff, 0x007fffffffffffff + .quad 0x00ffffffffffffff, 0x01ffffffffffffff + .quad 0x03ffffffffffffff, 0x07ffffffffffffff + .quad 0x0fffffffffffffff, 0x1fffffffffffffff + .quad 0x3fffffffffffffff, 0x7fffffffffffffff + .quad 0xffffffffffffffff + +.balign 64 +mask_out_top_block: + .quad 0xffffffffffffffff, 0xffffffffffffffff + .quad 0xffffffffffffffff, 0xffffffffffffffff + .quad 0xffffffffffffffff, 0xffffffffffffffff + .quad 0x0000000000000000, 0x0000000000000000 + +.section .text + + +////define the fields of gcm_data struct +//typedef struct gcm_data +//{ +// u8 expanded_keys[16*15]// +// u8 shifted_hkey_1[16]// // store HashKey <<1 mod poly here +// u8 shifted_hkey_2[16]// // store HashKey^2 <<1 mod poly here +// u8 shifted_hkey_3[16]// // store HashKey^3 <<1 mod poly here +// u8 shifted_hkey_4[16]// // store HashKey^4 <<1 mod poly here +// u8 shifted_hkey_5[16]// // store HashKey^5 <<1 mod poly here +// u8 shifted_hkey_6[16]// // store HashKey^6 <<1 mod poly here +// u8 shifted_hkey_7[16]// // store HashKey^7 <<1 mod poly here +// u8 shifted_hkey_8[16]// // store HashKey^8 <<1 mod poly here +// u8 shifted_hkey_1_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_2_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_3_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_4_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_5_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_6_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_7_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_8_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +//} gcm_data// + +#ifndef GCM_KEYS_VAES_AVX512_INCLUDED +#define HashKey 16*15 // store HashKey <<1 mod poly here +#define HashKey_1 16*15 // store HashKey <<1 mod poly here +#define HashKey_2 16*16 // store HashKey^2 <<1 mod poly here +#define HashKey_3 16*17 // store HashKey^3 <<1 mod poly here +#define HashKey_4 16*18 // store HashKey^4 <<1 mod poly here +#define HashKey_5 16*19 // store HashKey^5 <<1 mod poly here +#define HashKey_6 16*20 // store HashKey^6 <<1 mod poly here +#define HashKey_7 16*21 // store HashKey^7 <<1 mod poly here +#define HashKey_8 16*22 // store HashKey^8 <<1 mod poly here +#define HashKey_k 16*23 // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) +#define HashKey_2_k 16*24 // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_3_k 16*25 // store XOR of High 64 
bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_4_k 16*26 // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_5_k 16*27 // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_6_k 16*28 // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_7_k 16*29 // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_8_k 16*30 // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +#endif + +#define AadHash 16*0 // store current Hash of data which has been input +#define AadLen 16*1 // store length of input data which will not be encrypted or decrypted +#define InLen (16*1)+8 // store length of input data which will be encrypted or decrypted +#define PBlockEncKey 16*2 // encryption key for the partial block at the end of the previous update +#define OrigIV 16*3 // input IV +#define CurCount 16*4 // Current counter for generation of encryption key +#define PBlockLen 16*5 // length of partial block at the end of the previous update + +.macro xmmreg name, num + .set xmm\name, %xmm\num +.endm + +#define arg(x) (STACK_OFFSET + 8*(x))(%r14) + + +#if __OUTPUT_FORMAT__ != elf64 +#define arg1 %rcx +#define arg2 %rdx +#define arg3 %r8 +#define arg4 %r9 +#define arg5 %rsi +#define arg6 (STACK_OFFSET + 8*6)(%r14) +#define arg7 (STACK_OFFSET + 8*7)(%r14) +#define arg8 (STACK_OFFSET + 8*8)(%r14) +#define arg9 (STACK_OFFSET + 8*9)(%r14) +#define arg10 (STACK_OFFSET + 8*10)(%r14) +#else +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx +#define arg4 %rcx +#define arg5 %r8 +#define arg6 %r9 +#define arg7 ((STACK_OFFSET) + 8*1)(%r14) +#define arg8 ((STACK_OFFSET) + 8*2)(%r14) +#define arg9 ((STACK_OFFSET) + 8*3)(%r14) +#define arg10 ((STACK_OFFSET) + 8*4)(%r14) +#endif + +#ifdef NT_LDST +#define NT_LD +#define NT_ST +#endif + +////// Use Non-temporal load/stor +#ifdef NT_LD +#define XLDR movntdqa +#define VXLDR vmovntdqa +#define VX512LDR vmovntdqa +#else +#define XLDR movdqu +#define VXLDR vmovdqu +#define VX512LDR vmovdqu8 +#endif + +////// Use Non-temporal load/stor +#ifdef NT_ST +#define XSTR movntdq +#define VXSTR vmovntdq +#define VX512STR vmovntdq +#else +#define XSTR movdqu +#define VXSTR vmovdqu +#define VX512STR vmovdqu8 +#endif + +#endif // GCM_DEFINES_ASM_INCLUDED diff --git a/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm_sse.S b/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm_sse.S new file mode 100644 index 000000000000..eec65600ddc6 --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto-ported/gcm_sse.S @@ -0,0 +1,2153 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright(c) 2011-2017 Intel Corporation All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES// LOSS OF USE, +// DATA, OR PROFITS// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// +// Authors: +// Erdinc Ozturk +// Vinodh Gopal +// James Guilford +// +// +// References: +// This code was derived and highly optimized from the code described in +// paper: +// Vinodh Gopal et. al. Optimized Galois-Counter-Mode +// Implementation on Intel Architecture Processors. August, 2010 +// +// For the shift-based reductions used in this code, we used the method +// described in paper: +// Shay Gueron, Michael E. Kounavis. Intel Carry-Less +// Multiplication Instruction and its Usage for Computing the GCM +// Mode. January, 2010. +// +// +// Assumptions: +// +// +// +// iv: +// 0 1 2 3 +// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Salt (From the SA) | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Initialization Vector | +// | (This is the sequence number from IPSec header) | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | 0x1 | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// +// +// +// AAD: +// AAD will be padded with 0 to the next 16byte multiple +// for example, assume AAD is a u32 vector +// +// if AAD is 8 bytes: +// AAD[3] = {A0, A1}; +// padded AAD in xmm register = {A1 A0 0 0} +// +// 0 1 2 3 +// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | SPI (A1) | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | 32-bit Sequence Number (A0) | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | 0x0 | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// +// AAD Format with 32-bit Sequence Number +// +// if AAD is 12 bytes: +// AAD[3] = {A0, A1, A2}; +// padded AAD in xmm register = {A2 A1 A0 0} +// +// 0 1 2 3 +// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | SPI (A2) | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | 64-bit Extended Sequence Number {A1,A0} | +// | | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | 0x0 | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// +// AAD Format with 64-bit Extended Sequence Number +// 
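+// As a plain-C illustration of the padding rule above (hypothetical helper,
+// not part of this file or of the isa-l_crypto API), a caller could pack a
+// short AAD into one zero-padded 16-byte block roughly like this:
+//
+//	#include <stdint.h>
+//	#include <string.h>
+//
+//	/* Pad aad (aad_len <= 16) into a zero-filled 16-byte block. */
+//	static inline void
+//	pad_aad_block(uint8_t block[16], const uint8_t *aad, size_t aad_len)
+//	{
+//		memset(block, 0, 16);		/* the 0x0 padding shown above */
+//		memcpy(block, aad, aad_len);	/* A0, A1[, A2] layout as shown */
+//	}
+//
+// The code below additionally byte-reflects such blocks (SHUF_MASK) before
+// feeding them to GHASH.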
+// +// aadLen: +// Must be a multiple of 4 bytes and from the definition of the spec. +// The code additionally supports any aadLen length. +// +// TLen: +// from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +// +// poly = x^128 + x^127 + x^126 + x^121 + 1 +// throughout the code, one tab and two tab indentations are used. one tab is +// for GHASH part, two tabs is for AES part. +// + +// Port to GNU as and translation to GNU as att-syntax +// Copyright(c) 2023 Attila Fülöp + +// .altmacro +.att_syntax prefix + +#include "../include/reg_sizes_att.S" +#include "gcm_defines_att.S" + +#if !defined(GCM128_MODE) && !defined(GCM256_MODE) +#error "No GCM mode selected for gcm_sse.S!" +#endif + +#if defined(FUNCT_EXTENSION) +#error "No support for non-temporal versions yet!" +#endif +#define _nt 1 + +#ifdef GCM128_MODE +#define FN_NAME(x,y) aes_gcm_ ## x ## _128 ## y ## sse +#define NROUNDS 9 +#endif + +#ifdef GCM256_MODE +#define FN_NAME(x,y) aes_gcm_ ## x ## _256 ## y ## sse +#define NROUNDS 13 +#endif + + +// need to push 5 registers into stack to maintain +#define STACK_OFFSET 8*5 + +#define TMP2 16*0 // Temporary storage for AES State 2 (State 1 is stored in an XMM register) +#define TMP3 16*1 // Temporary storage for AES State 3 +#define TMP4 16*2 // Temporary storage for AES State 4 +#define TMP5 16*3 // Temporary storage for AES State 5 +#define TMP6 16*4 // Temporary storage for AES State 6 +#define TMP7 16*5 // Temporary storage for AES State 7 +#define TMP8 16*6 // Temporary storage for AES State 8 + +#define LOCAL_STORAGE 16*7 + +#if __OUTPUT_FORMAT == win64 +#define XMM_STORAGE 16*10 +#else +#define XMM_STORAGE 0 +#endif + +#define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +//////////////////////////////////////////////////////////////// +// Utility Macros +//////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +// Input: A and B (128-bits each, bit-reflected) +// Output: C = A*B*x mod poly, (i.e. >>1 ) +// To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +// GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +//////////////////////////////////////////////////////////////////////////////// +.macro GHASH_MUL GH, HK, T1, T2, T3, T4, T5 + // \GH, \HK hold the values for the two operands which are carry-less + // multiplied. 
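+	// For reference, the Karatsuba step below computes, over GF(2)[x],
+	// with \GH = A1*x^64 + A0 and \HK = B1*x^64 + B0:
+	//
+	//	A*B = A1*B1*x^128
+	//	      ^ ((A1^A0)*(B1^B0) ^ A1*B1 ^ A0*B0)*x^64
+	//	      ^ A0*B0
+	//
+	// i.e. three PCLMULQDQ multiplications instead of four; the shift-based
+	// reduction that follows brings the 256-bit product back
+	// mod x^128 + x^127 + x^126 + x^121 + 1.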
+ //////////////////////////////////////////////////////////////////////// + // Karatsuba Method + movdqa \GH, \T1 + pshufd $0b01001110, \GH, \T2 + pshufd $0b01001110, \HK, \T3 + pxor \GH, \T2 // \T2 = (a1+a0) + pxor \HK, \T3 // \T3 = (b1+b0) + + pclmulqdq $0x11, \HK, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \HK, \GH // \GH = a0*b0 + pclmulqdq $0x00, \T3, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \GH, \T2 + pxor \T1, \T2 // \T2 = a0*b1+a1*b0 + + movdqa \T2, \T3 + pslldq $8, \T3 // shift-L \T3 2 DWs + psrldq $8, \T2 // shift-R \T2 2 DWs + pxor \T3, \GH + pxor \T2, \T1 // <\T1:\GH> holds the result of the carry-less multiplication of \GH by \HK + + + //first phase of the reduction + movdqa \GH, \T2 + movdqa \GH, \T3 + movdqa \GH, \T4 // move \GH into \T2, \T3, \T4 in order to perform the three shifts independently + + pslld $31, \T2 // packed right shifting << 31 + pslld $30, \T3 // packed right shifting shift << 30 + pslld $25, \T4 // packed right shifting shift << 25 + pxor \T3, \T2 // xor the shifted versions + pxor \T4, \T2 + + movdqa \T2, \T5 + psrldq $4, \T5 // shift-R \T5 1 DW + + pslldq $12, \T2 // shift-L \T2 3 DWs + pxor \T2, \GH // first phase of the reduction complete + //////////////////////////////////////////////////////////////////////// + + //second phase of the reduction + movdqa \GH, \T2 // make 3 copies of \GH (in in \T2, \T3, \T4) for doing three shift operations + movdqa \GH, \T3 + movdqa \GH, \T4 + + psrld $1, \T2 // packed left shifting >> 1 + psrld $2, \T3 // packed left shifting >> 2 + psrld $7, \T4 // packed left shifting >> 7 + pxor \T3, \T2 // xor the shifted versions + pxor \T4, \T2 + + pxor \T5, \T2 + pxor \T2, \GH + pxor \T1, \GH // the result is in \T1 + +.endm // GHASH_MUL + +//////////////////////////////////////////////////////////////////////////////// +// PRECOMPUTE: Precompute HashKey_{2..8} and HashKey{,_{2..8}}_k. +// HasKey_i_k holds XORed values of the low and high parts of the Haskey_i. 
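+// As an illustrative sketch only (hypothetical C names, not the ICP or
+// isa-l_crypto API), the table this macro fills corresponds to:
+//
+//	/* hkey[1] = HashKey<<1 mod poly, provided by the caller */
+//	for (int i = 2; i <= 8; i++)
+//		hkey[i] = ghash_mul(hkey[i - 1], hkey[1]);	/* HashKey^i<<1 mod poly */
+//	for (int i = 1; i <= 8; i++)
+//		hkey_k[i] = high64(hkey[i]) ^ low64(hkey[i]);	/* Karatsuba "_k" value */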
+//////////////////////////////////////////////////////////////////////////////// +.macro PRECOMPUTE GDATA, HK, T1, T2, T3, T4, T5, T6 + + movdqa \HK, \T4 + pshufd $0b01001110, \HK, \T1 + pxor \HK, \T1 + movdqu \T1, HashKey_k(\GDATA) + + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^2<<1 mod poly + movdqu \T4, HashKey_2(\GDATA) // [HashKey_2] = HashKey^2<<1 mod poly + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_2_k(\GDATA) + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^3<<1 mod poly + movdqu \T4, HashKey_3(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_3_k(\GDATA) + + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^4<<1 mod poly + movdqu \T4, HashKey_4(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_4_k(\GDATA) + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^5<<1 mod poly + movdqu \T4, HashKey_5(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_5_k(\GDATA) + + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^6<<1 mod poly + movdqu \T4, HashKey_6(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_6_k(\GDATA) + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^7<<1 mod poly + movdqu \T4, HashKey_7(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_7_k(\GDATA) + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^8<<1 mod poly + movdqu \T4, HashKey_8(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_8_k(\GDATA) + +.endm // PRECOMPUTE + + +//////////////////////////////////////////////////////////////////////////////// +// READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less +// than 16 bytes. +// Returns 0 if data has length 0. +// Input: The input data (INPUT), that data's length (LENGTH). +// Output: The packed xmm register (OUTPUT). +//////////////////////////////////////////////////////////////////////////////// +.macro READ_SMALL_DATA_INPUT OUTPUT, INPUT, LENGTH, \ + END_READ_LOCATION, COUNTER, TMP1 + + // clang compat: no local support + // LOCAL _byte_loop_1, _byte_loop_2, _done + + pxor \OUTPUT, \OUTPUT + mov \LENGTH, \COUNTER + mov \INPUT, \END_READ_LOCATION + add \LENGTH, \END_READ_LOCATION + xor \TMP1, \TMP1 + + + cmp $8, \COUNTER + jl _byte_loop_2_\@ + pinsrq $0, (\INPUT), \OUTPUT //Read in 8 bytes if they exists + je _done_\@ + + sub $8, \COUNTER + +_byte_loop_1_\@: //Read in data 1 byte at a time while data is left + shl $8, \TMP1 //This loop handles when 8 bytes were already read in + dec \END_READ_LOCATION + + //// mov BYTE(\TMP1), BYTE [\END_READ_LOCATION] + bytereg \TMP1 + movb (\END_READ_LOCATION), breg + dec \COUNTER + jg _byte_loop_1_\@ + pinsrq $1, \TMP1, \OUTPUT + jmp _done_\@ + +_byte_loop_2_\@: //Read in data 1 byte at a time while data is left + cmp $0, \COUNTER + je _done_\@ + shl $8, \TMP1 //This loop handles when no bytes were already read in + dec \END_READ_LOCATION + //// mov BYTE(\TMP1), BYTE [\END_READ_LOCATION] + bytereg \TMP1 + movb (\END_READ_LOCATION), breg + dec \COUNTER + jg _byte_loop_2_\@ + pinsrq $0, \TMP1, \OUTPUT +_done_\@: + +.endm // READ_SMALL_DATA_INPUT + + +//////////////////////////////////////////////////////////////////////////////// +// CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +// Input: The input data (A_IN), that data's length (A_LEN), and the hash key +// (HASH_KEY). 
+// Output: The hash of the data (AAD_HASH). +//////////////////////////////////////////////////////////////////////////////// +.macro CALC_AAD_HASH A_IN, A_LEN, AAD_HASH, HASH_KEY, XTMP1, XTMP2, XTMP3, \ + XTMP4, XTMP5, T1, T2, T3, T4, T5 + + // clang compat: no local support + // LOCAL _get_AAD_loop16, _get_small_AAD_block, _CALC_AAD_done + + mov \A_IN, \T1 // T1 = AAD + mov \A_LEN, \T2 // T2 = aadLen + pxor \AAD_HASH, \AAD_HASH + + cmp $16, \T2 + jl _get_small_AAD_block_\@ + +_get_AAD_loop16_\@: + + movdqu (\T1), \XTMP1 + //byte-reflect the AAD data + pshufb SHUF_MASK(%rip), \XTMP1 + pxor \XTMP1, \AAD_HASH + GHASH_MUL \AAD_HASH, \HASH_KEY, \XTMP1, \XTMP2, \XTMP3, \XTMP4, \XTMP5 + + sub $16, \T2 + je _CALC_AAD_done_\@ + + add $16, \T1 + cmp $16, \T2 + jge _get_AAD_loop16_\@ + +_get_small_AAD_block_\@: + READ_SMALL_DATA_INPUT \XTMP1, \T1, \T2, \T3, \T4, \T5 + //byte-reflect the AAD data + pshufb SHUF_MASK(%rip), \XTMP1 + pxor \XTMP1, \AAD_HASH + GHASH_MUL \AAD_HASH, \HASH_KEY, \XTMP1, \XTMP2, \XTMP3, \XTMP4, \XTMP5 + +_CALC_AAD_done_\@: + +.endm // CALC_AAD_HASH + + + +//////////////////////////////////////////////////////////////////////////////// +// PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks +// between update calls. Requires the input data be at least 1 byte long. +// Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text +// (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN), the current data offset +// (DATA_OFFSET), and whether encoding or decoding (ENC_DEC). +// Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated +// GDATA_CTX. +// Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, +// xmm10, xmm11, xmm13 +//////////////////////////////////////////////////////////////////////////////// +.macro PARTIAL_BLOCK GDATA_KEY, GDATA_CTX, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ + PLAIN_CYPH_LEN, DATA_OFFSET, AAD_HASH, ENC_DEC + + // clang compat: no local support + // LOCAL _fewer_than_16_bytes, _data_read, _no_extra_mask_1 + // LOCAL _partial_incomplete_1, _dec_done, _no_extra_mask_2 + // LOCAL _partial_incomplete_2, _encode_done, _partial_fill + // LOCAL _count_set, _less_than_8_bytes_left, _partial_block_done + + mov PBlockLen(\GDATA_CTX), %r13 + cmp $0, %r13 + je _partial_block_done_\@ //Leave Macro if no partial blocks + + cmp $16, \PLAIN_CYPH_LEN //Read in input data without over reading + jl _fewer_than_16_bytes_\@ + XLDR (\PLAIN_CYPH_IN), %xmm1 //If more than 16 bytes of data, just fill the xmm register + jmp _data_read_\@ + +_fewer_than_16_bytes_\@: + lea (\PLAIN_CYPH_IN, \DATA_OFFSET), %r10 + READ_SMALL_DATA_INPUT %xmm1, %r10, \PLAIN_CYPH_LEN, %rax, %r12, %r15 + mov PBlockLen(\GDATA_CTX), %r13 + +_data_read_\@: //Finished reading in data + + + movdqu PBlockEncKey(\GDATA_CTX), %xmm9 //xmm9 = ctx_data.partial_block_enc_key + movdqu HashKey(\GDATA_KEY), %xmm13 + + lea SHIFT_MASK(%rip), %r12 + + add %r13, %r12 // adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + movdqu (%r12), %xmm2 // get the appropriate shuffle mask + pshufb %xmm2, %xmm9 // shift right r13 bytes + + .ifc \ENC_DEC, DEC + + movdqa %xmm1, %xmm3 + pxor %xmm1, %xmm9 // Cyphertext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r15 + add %r13, %r15 + sub $16, %r15 //Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge _no_extra_mask_1_\@ //Determine if if partial block is not being filled and shift mask accordingly + sub %r15, %r12 
+_no_extra_mask_1_\@: + + movdqu (ALL_F - SHIFT_MASK)(%r12), %xmm1 // get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand %xmm1, %xmm9 // mask out bottom r13 bytes of xmm9 + + pand %xmm1, %xmm3 + pshufb SHUF_MASK(%rip), %xmm3 + pshufb %xmm2, %xmm3 + pxor %xmm3, \AAD_HASH + + + cmp $0, %r15 + jl _partial_incomplete_1_\@ + + GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 //GHASH computation for the last <16 Byte block + xor %rax, %rax + mov %rax, PBlockLen(\GDATA_CTX) + jmp _dec_done_\@ +_partial_incomplete_1_\@: + add \PLAIN_CYPH_LEN, PBlockLen(\GDATA_CTX) +_dec_done_\@: + movdqu \AAD_HASH, AadHash(\GDATA_CTX) + + .else // .ifc \ENC_DEC, DEC + + pxor %xmm1, %xmm9 // Plaintext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r15 + add %r13, %r15 + sub $16, %r15 //Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge _no_extra_mask_2_\@ //Determine if if partial block is not being filled and shift mask accordingly + sub %r15, %r12 +_no_extra_mask_2_\@: + + movdqu (ALL_F - SHIFT_MASK)(%r12), %xmm1 // get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand %xmm1, %xmm9 // mask out bottom r13 bytes of xmm9 + + pshufb SHUF_MASK(%rip), %xmm9 + pshufb %xmm2, %xmm9 + pxor %xmm9, \AAD_HASH + + cmp $0, %r15 + jl _partial_incomplete_2_\@ + + GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 //GHASH computation for the last <16 Byte block + xor %rax, %rax + mov %rax, PBlockLen(\GDATA_CTX) + jmp _encode_done_\@ +_partial_incomplete_2_\@: + add \PLAIN_CYPH_LEN, PBlockLen(\GDATA_CTX) +_encode_done_\@: + movdqu \AAD_HASH, AadHash(\GDATA_CTX) + + pshufb SHUF_MASK(%rip), %xmm9 // shuffle xmm9 back to output as ciphertext + pshufb %xmm2, %xmm9 + + .endif // .ifc \ENC_DEC, DEC + + + ////////////////////////////////////////////////////////// + // output encrypted Bytes + cmp $0, %r15 + jl _partial_fill_\@ + mov %r13, %r12 + mov $16, %r13 + sub %r12, %r13 // Set r13 to be the number of bytes to write out + jmp _count_set_\@ +_partial_fill_\@: + mov \PLAIN_CYPH_LEN, %r13 +_count_set_\@: + movq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_\@ + mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET) + add $8, \DATA_OFFSET + psrldq $8, %xmm9 + movq %xmm9, %rax + sub $8, %r13 +_less_than_8_bytes_left_\@: + mov %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET) + add $1, \DATA_OFFSET + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_\@ + ////////////////////////////////////////////////////////// +_partial_block_done_\@: +.endm // PARTIAL_BLOCK + +//////////////////////////////////////////////////////////////////////////////// +// INITIAL_BLOCKS: If a = number of total plaintext bytes; b = floor(a/16); +// \num_initial_blocks = b mod 8; encrypt the initial \num_initial_blocks +// blocks and apply ghash on the ciphertext. +// \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, r14 are used as a +// pointer only, not modified. +// Updated AAD_HASH is returned in \T3. 
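+// For example (illustrative C only, hypothetical names): for a message of
+// total_len plaintext bytes a caller would expand this macro with
+//
+//	size_t blocks = total_len / 16;			/* b = floor(a/16) */
+//	unsigned num_initial_blocks = blocks % 8;	/* b mod 8, i.e. 0..7 */
+//
+// so that the remaining whole blocks can then be processed eight at a time
+// by GHASH_8_ENCRYPT_8_PARALLEL.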
+//////////////////////////////////////////////////////////////////////////////// +.macro INITIAL_BLOCKS GDATA_KEY, GDATA_CTX, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ + LENGTH, DATA_OFFSET, num_initial_blocks, T1, HASH_KEY, \ + T3, T4, T5, CTR, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, \ + XMM7, XMM8, T6, T_key, ENC_DEC + + // clang compat: no local support + // LOCAL _initial_blocks_done + +.altmacro +.set i, (8-\num_initial_blocks) + xmmreg i, %i + movdqu \XMM8, xmmi // move AAD_HASH to temp reg + + // start AES for \num_initial_blocks blocks + movdqu CurCount(\GDATA_CTX), \CTR // \CTR = Y0 + + +.set i, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, xmmi + pshufb SHUF_MASK(%rip), xmmi // perform a 16Byte swap +.set i, (i+1) +.endr + +movdqu 16*0(\GDATA_KEY), \T_key +.set i, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + pxor \T_key, xmmi +.set i, (i+1) +.endr + +.set j, 1 +.rept NROUNDS // encrypt N blocks with 13 key rounds (11 for GCM192) +movdqu 16*j(\GDATA_KEY), \T_key +.set i, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + aesenc \T_key, xmmi +.set i, (i+1) +.endr + +.set j, (j+1) +.endr + +movdqu 16*j(\GDATA_KEY), \T_key // encrypt with last (14th) key round (12 for GCM192) +.set i, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + aesenclast \T_key, xmmi +.set i, (i+1) +.endr + +.set i, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + XLDR (\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, xmmi + XSTR xmmi, (\CYPH_PLAIN_OUT, \DATA_OFFSET) // write back ciphertext for \num_initial_blocks blocks + add $16, \DATA_OFFSET + .ifc \ENC_DEC, DEC + movdqa \T1, xmmi + .endif + pshufb SHUF_MASK(%rip), xmmi // prepare ciphertext for GHASH computations +.set i, (i+1) +.endr + + +.set i, (8-\num_initial_blocks) +.set j, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + xmmreg j, %j + pxor xmmi, xmmj + GHASH_MUL xmmj, <\HASH_KEY>, <\T1>, <\T3>, <\T4>, <\T5>, <\T6> // apply GHASH on \num_initial_blocks blocks +.set i, (i+1) +.set j, (j+1) +.endr +.noaltmacro + + // \XMM8 has the current Hash Value + movdqa \XMM8, \T3 + + cmp $128, \LENGTH + jl _initial_blocks_done_\@ // no need for precomputed constants + +//////////////////////////////////////////////////////////////////////////////// +// Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM1 + pshufb SHUF_MASK(%rip), \XMM1 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM2 + pshufb SHUF_MASK(%rip), \XMM2 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM3 + pshufb SHUF_MASK(%rip), \XMM3 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM4 + pshufb SHUF_MASK(%rip), \XMM4 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM5 + pshufb SHUF_MASK(%rip), \XMM5 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM6 + pshufb SHUF_MASK(%rip), \XMM6 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM7 + pshufb SHUF_MASK(%rip), \XMM7 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM8 + pshufb SHUF_MASK(%rip), \XMM8 // perform a 16Byte swap + + movdqu 16*0(\GDATA_KEY), \T_key + pxor \T_key, \XMM1 + pxor \T_key, \XMM2 + pxor \T_key, \XMM3 + pxor \T_key, \XMM4 + pxor \T_key, \XMM5 + pxor \T_key, \XMM6 + pxor \T_key, \XMM7 + pxor 
\T_key, \XMM8 + +.set i, 1 +.rept NROUNDS // do early (13) rounds (11 for GCM192) + movdqu 16*i(\GDATA_KEY), \T_key + aesenc \T_key, \XMM1 + aesenc \T_key, \XMM2 + aesenc \T_key, \XMM3 + aesenc \T_key, \XMM4 + aesenc \T_key, \XMM5 + aesenc \T_key, \XMM6 + aesenc \T_key, \XMM7 + aesenc \T_key, \XMM8 +.set i, (i+1) +.endr + + movdqu 16*i(\GDATA_KEY), \T_key // do final key round + aesenclast \T_key, \XMM1 + aesenclast \T_key, \XMM2 + aesenclast \T_key, \XMM3 + aesenclast \T_key, \XMM4 + aesenclast \T_key, \XMM5 + aesenclast \T_key, \XMM6 + aesenclast \T_key, \XMM7 + aesenclast \T_key, \XMM8 + + XLDR 16*0(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM1 + XSTR \XMM1, 16*0(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM1 + .endif + + XLDR 16*1(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM2 + XSTR \XMM2, 16*1(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM2 + .endif + + XLDR 16*2(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM3 + XSTR \XMM3, 16*2(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM3 + .endif + + XLDR 16*3(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM4 + XSTR \XMM4, 16*3(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM4 + .endif + + XLDR 16*4(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM5 + XSTR \XMM5, 16*4(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM5 + .endif + + XLDR 16*5(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM6 + XSTR \XMM6, 16*5(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM6 + .endif + + XLDR 16*6(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM7 + XSTR \XMM7, 16*6(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM7 + .endif + + XLDR 16*7(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM8 + XSTR \XMM8, 16*7(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM8 + .endif + + add $128, \DATA_OFFSET + + pshufb SHUF_MASK(%rip), \XMM1 // perform a 16Byte swap + pxor \T3, \XMM1 // combine GHASHed value with the corresponding ciphertext + pshufb SHUF_MASK(%rip), \XMM2 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM3 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM4 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM5 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM6 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM7 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM8 // perform a 16Byte swap + +//////////////////////////////////////////////////////////////////////////////// + +_initial_blocks_done_\@: +.noaltmacro +.endm // INITIAL_BLOCKS + + +//////////////////////////////////////////////////////////////////////////////// +// GHASH_8_ENCRYPT_8_PARALLEL: Encrypt 8 blocks at a time and ghash the 8 +// previously encrypted ciphertext blocks. +// \GDATA (KEY), \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN are used as pointers only, +// not modified. 
+// \DATA_OFFSET is the data offset value +//////////////////////////////////////////////////////////////////////////////// +.macro GHASH_8_ENCRYPT_8_PARALLEL GDATA, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ + DATA_OFFSET, T1, T2, T3, T4, T5, T6, CTR, \ + XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, \ + XMM8, T7, loop_idx, ENC_DEC + + + movdqa \XMM1, \T7 + movdqu \XMM2, TMP2(%rsp) + movdqu \XMM3, TMP3(%rsp) + movdqu \XMM4, TMP4(%rsp) + movdqu \XMM5, TMP5(%rsp) + movdqu \XMM6, TMP6(%rsp) + movdqu \XMM7, TMP7(%rsp) + movdqu \XMM8, TMP8(%rsp) + + //////////////////////////////////////////////////////////////////////// + //// Karatsuba Method + + movdqa \T7, \T4 + pshufd $0b01001110, \T7, \T6 + pxor \T7, \T6 + .ifc \loop_idx, in_order + paddd ONE(%rip), \CTR // INCR CNT + .else + paddd ONEf(%rip), \CTR // INCR CNT + .endif + movdqu HashKey_8(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T4 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T7 // \T7 = a0*b0 + movdqu HashKey_8_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T6 // \T2 = (a1+a0)*(b1+b0) + movdqa \CTR, \XMM1 + + .ifc \loop_idx, in_order + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM2 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM3 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM4 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM5 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM6 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM7 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM8 + + pshufb SHUF_MASK(%rip), \XMM1 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM2 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM3 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM4 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM5 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM6 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM7 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM8 // perform a 16Byte swap + + .else // .ifc \loop_idx, in_order + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM2 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM3 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM4 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM5 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM6 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM7 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM8 + + .endif // .ifc \loop_idx, in_order + //////////////////////////////////////////////////////////////////////// + + movdqu 16*0(\GDATA), \T1 + pxor \T1, \XMM1 + pxor \T1, \XMM2 + pxor \T1, \XMM3 + pxor \T1, \XMM4 + pxor \T1, \XMM5 + pxor \T1, \XMM6 + pxor \T1, \XMM7 + pxor \T1, \XMM8 + + // \XMM6, \T5 hold the values for the two operands which are + // carry-less multiplied + //////////////////////////////////////////////////////////////////////// + // Karatsuba Method + movdqu TMP2(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_7(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_7_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*1(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*2(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc 
\T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + //////////////////////////////////////////////////////////////////////// + // Karatsuba Method + movdqu TMP3(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_6(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_6_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*3(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu TMP4(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_5(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_5_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*4(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*5(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu TMP5(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_4(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_4_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*6(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + + movdqu TMP6(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_3(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_3_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*7(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu TMP7(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_2(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_2_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*8(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + + // \XMM8, \T5 hold the values for the two operands which are + // carry-less multiplied. 
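+	// Taken together, the eight Karatsuba blocks of this macro accumulate
+	// (high:low halves in \T4:\T7, middle parts in \T6):
+	//
+	//	X1*H^8 ^ X2*H^7 ^ ... ^ X7*H^2 ^ X8*H
+	//
+	// where X1..X8 are the GHASH inputs from the previous pass (\XMM1,
+	// with the running hash already folded in, and TMP2..TMP8) and H^i
+	// are the precomputed HashKey_i values; the shift-based reduction
+	// mod x^128 + x^127 + x^126 + x^121 + 1 is then performed only once.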
+ //////////////////////////////////////////////////////////////////////// + // Karatsuba Method + movdqu TMP8(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T3, \T7 + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + + movdqu 16*9(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + +#ifdef GCM128_MODE + movdqu 16*10(\GDATA), \T5 +#endif +#ifdef GCM192_MODE + movdqu 16*10(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*11(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*12(\GDATA), \T5 // finish last key round +#endif +#ifdef GCM256_MODE + movdqu 16*10(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*11(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*12(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*13(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*14(\GDATA), \T5 // finish last key round +#endif + +.altmacro +.set i, 0 +.set j, 1 +.rept 8 + xmmreg j, %j + XLDR 16*i(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + + .ifc \ENC_DEC, DEC + movdqa \T1, \T3 + .endif + + pxor \T5, \T1 + aesenclast \T1, xmmj // XMM1:XMM8 + XSTR xmmj, 16*i(\CYPH_PLAIN_OUT, \DATA_OFFSET) // Write to the Output buffer + + .ifc \ENC_DEC, DEC + movdqa \T3, xmmj + .endif +.set i, (i+1) +.set j, (j+1) +.endr +.noaltmacro + + pxor \T6, \T2 + pxor \T4, \T2 + pxor \T7, \T2 + + + movdqa \T2, \T3 + pslldq $8, \T3 // shift-L \T3 2 DWs + psrldq $8, \T2 // shift-R \T2 2 DWs + pxor \T3, \T7 + pxor \T2, \T4 // accumulate the results in \T4:\T7 + + + + //first phase of the reduction + movdqa \T7, \T2 + movdqa \T7, \T3 + movdqa \T7, \T1 // move \T7 into \T2, \T3, \T1 in order to perform the three shifts independently + + pslld $31, \T2 // packed right shifting << 31 + pslld $30, \T3 // packed right shifting shift << 30 + pslld $25, \T1 // packed right shifting shift << 25 + pxor \T3, \T2 // xor the shifted versions + pxor \T1, \T2 + + movdqa \T2, \T5 + psrldq $4, \T5 // shift-R \T5 1 DW + + pslldq $12, \T2 // shift-L \T2 3 DWs + pxor \T2, \T7 // first phase of the reduction complete + + //////////////////////////////////////////////////////////////////////// + + pshufb SHUF_MASK(%rip), \XMM1 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM2 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM3 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM4 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM5 // perform a 16Byte 
swap + pshufb SHUF_MASK(%rip), \XMM6 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM7 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM8 // perform a 16Byte swap + + //second phase of the reduction + movdqa \T7, \T2 // make 3 copies of \T7 (in in \T2, \T3, \T1) for doing three shift operations + movdqa \T7, \T3 + movdqa \T7, \T1 + + psrld $1, \T2 // packed left shifting >> 1 + psrld $2, \T3 // packed left shifting >> 2 + psrld $7, \T1 // packed left shifting >> 7 + pxor \T3, \T2 // xor the shifted versions + pxor \T1, \T2 + + pxor \T5, \T2 + pxor \T2, \T7 + pxor \T4, \T7 // the result is in \T4 + + + pxor \T7, \XMM1 + +.endm // GHASH_8_ENCRYPT_8_PARALLEL + +//////////////////////////////////////////////////////////////////////////////// +// GHASH_LAST_8: GHASH the last 8 ciphertext blocks. +//////////////////////////////////////////////////////////////////////////////// +.macro GHASH_LAST_8 GDATA, T1, T2, T3, T4, T5, T6, T7, \ + XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8 + + + // Karatsuba Method + movdqa \XMM1, \T6 + pshufd $0b01001110, \XMM1, \T2 + pxor \XMM1, \T2 + movdqu HashKey_8(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T6 // \T6 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM1 // \XMM1 = a0*b0 + movdqu HashKey_8_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + movdqa \XMM1, \T7 + movdqa \T2, \XMM1 // result in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM2, \T1 + pshufd $0b01001110, \XMM2, \T2 + pxor \XMM2, \T2 + movdqu HashKey_7(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM2 // \XMM2 = a0*b0 + movdqu HashKey_7_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM2, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM3, \T1 + pshufd $0b01001110, \XMM3, \T2 + pxor \XMM3, \T2 + movdqu HashKey_6(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM3 // \XMM3 = a0*b0 + movdqu HashKey_6_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM3, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM4, \T1 + pshufd $0b01001110, \XMM4, \T2 + pxor \XMM4, \T2 + movdqu HashKey_5(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM4 // \XMM4 = a0*b0 + movdqu HashKey_5_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM4, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM5, \T1 + pshufd $0b01001110, \XMM5, \T2 + pxor \XMM5, \T2 + movdqu HashKey_4(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM5 // \XMM5 = a0*b0 + movdqu HashKey_4_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM5, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM6, \T1 + pshufd $0b01001110, \XMM6, \T2 + pxor \XMM6, \T2 + movdqu HashKey_3(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM6 // \XMM6 = a0*b0 + movdqu HashKey_3_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM6, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM7, \T1 + pshufd $0b01001110, \XMM7, \T2 + pxor \XMM7, \T2 + movdqu HashKey_2(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 
= a1*b1 + + pclmulqdq $0x00, \T5, \XMM7 // \XMM7 = a0*b0 + movdqu HashKey_2_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM7, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + + // Karatsuba Method + movdqa \XMM8, \T1 + pshufd $0b01001110, \XMM8, \T2 + pxor \XMM8, \T2 + movdqu HashKey(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM8 // \XMM8 = a0*b0 + movdqu HashKey_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM8, \T7 + pxor \XMM1, \T2 + pxor \T6, \T2 + pxor \T7, \T2 // middle section of the temp results combined as in Karatsuba algorithm + + + movdqa \T2, \T4 + pslldq $8, \T4 // shift-L \T4 2 DWs + psrldq $8, \T2 // shift-R \T2 2 DWs + pxor \T4, \T7 + pxor \T2, \T6 // <\T6:\T7> holds the result of the accumulated carry-less multiplications + + + //first phase of the reduction + movdqa \T7, \T2 + movdqa \T7, \T3 + movdqa \T7, \T4 // move \T7 into \T2, \T3, \T4 in order to perform the three shifts independently + + pslld $31, \T2 // packed right shifting << 31 + pslld $30, \T3 // packed right shifting shift << 30 + pslld $25, \T4 // packed right shifting shift << 25 + pxor \T3, \T2 // xor the shifted versions + pxor \T4, \T2 + + movdqa \T2, \T1 + psrldq $4, \T1 // shift-R \T1 1 DW + + pslldq $12, \T2 // shift-L \T2 3 DWs + pxor \T2, \T7 // first phase of the reduction complete + //////////////////////////////////////////////////////////////////////// + + //second phase of the reduction + movdqa \T7, \T2 // make 3 copies of \T7 (in in \T2, \T3, \T4) for doing three shift operations + movdqa \T7, \T3 + movdqa \T7, \T4 + + psrld $1, \T2 // packed left shifting >> 1 + psrld $2, \T3 // packed left shifting >> 2 + psrld $7, \T4 // packed left shifting >> 7 + pxor \T3, \T2 // xor the shifted versions + pxor \T4, \T2 + + pxor \T1, \T2 + pxor \T2, \T7 + pxor \T7, \T6 // the result is in \T6 + +.endm // GHASH_LAST_8 + +//////////////////////////////////////////////////////////////////////////////// +// ENCRYPT_SINGLE_BLOCK: Encrypt a single block. +//////////////////////////////////////////////////////////////////////////////// +.macro ENCRYPT_SINGLE_BLOCK GDATA, ST, T1 + + movdqu 16*0(\GDATA), \T1 + pxor \T1, \ST + +.set i, 1 +.rept NROUNDS + movdqu 16*i(\GDATA), \T1 + aesenc \T1, \ST + +.set i, (i+1) +.endr + movdqu 16*i(\GDATA), \T1 + aesenclast \T1, \ST +.endm // ENCRYPT_SINGLE_BLOCK + + +//////////////////////////////////////////////////////////////////////////////// +// FUNC_SAVE: Save clobbered regs on the stack. 
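+// It saves %rsp in %r14, reserves VARIABLE_OFFSET bytes of scratch space and
+// aligns %rsp down to a 64-byte boundary; on win64 builds it also spills
+// xmm6:xmm15 into LOCAL_STORAGE and reloads arg5 from the stack.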
+//////////////////////////////////////////////////////////////////////////////// +.macro FUNC_SAVE + //// Required for Update/GMC_ENC + //the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + push %rsi + mov %rsp, %r14 + + sub $(VARIABLE_OFFSET), %rsp + and $~63, %rsp + +#if __OUTPUT_FORMAT__ == win64 + // xmm6:xmm15 need to be maintained for Windows + movdqu %xmm6, (LOCAL_STORAGE + 0*16)(%rsp) + movdqu %xmm7, (LOCAL_STORAGE + 1*16)(%rsp) + movdqu %xmm8, (LOCAL_STORAGE + 2*16)(%rsp) + movdqu %xmm9, (LOCAL_STORAGE + 3*16)(%rsp) + movdqu %xmm10, (LOCAL_STORAGE + 4*16)(%rsp) + movdqu %xmm11, (LOCAL_STORAGE + 5*16)(%rsp) + movdqu %xmm12, (LOCAL_STORAGE + 6*16)(%rsp) + movdqu %xmm13, (LOCAL_STORAGE + 7*16)(%rsp) + movdqu %xmm14, (LOCAL_STORAGE + 8*16)(%rsp) + movdqu %xmm15, (LOCAL_STORAGE + 9*16)(%rsp) + + mov arg(5), arg5 // XXXX [r14 + STACK_OFFSET + 8*5] +#endif +.endm // FUNC_SAVE + +//////////////////////////////////////////////////////////////////////////////// +// FUNC_RESTORE: Restore clobbered regs from the stack. +//////////////////////////////////////////////////////////////////////////////// +.macro FUNC_RESTORE + +#if __OUTPUT_FORMAT__ == win64 + movdqu (LOCAL_STORAGE + 9*16)(%rsp), %xmm15 + movdqu (LOCAL_STORAGE + 8*16)(%rsp), %xmm14 + movdqu (LOCAL_STORAGE + 7*16)(%rsp), %xmm13 + movdqu (LOCAL_STORAGE + 6*16)(%rsp), %xmm12 + movdqu (LOCAL_STORAGE + 5*16)(%rsp), %xmm11 + movdqu (LOCAL_STORAGE + 4*16)(%rsp), %xmm10 + movdqu (LOCAL_STORAGE + 3*16)(%rsp), %xmm9 + movdqu (LOCAL_STORAGE + 2*16)(%rsp), %xmm8 + movdqu (LOCAL_STORAGE + 1*16)(%rsp), %xmm7 + movdqu (LOCAL_STORAGE + 0*16)(%rsp), %xmm6 +#endif + + // Required for Update/GMC_ENC + mov %r14, %rsp + pop %rsi + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm // FUNC_RESTORE + + +//////////////////////////////////////////////////////////////////////////////// +// GCM_INIT: Initializes a gcm_context_data struct to prepare for +// encoding/decoding. +// Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, +// Additional Authentication data (A_IN), Additional Data length (A_LEN). +// Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized +// other parts of GDATA. +// Clobbers rax, r10-r13 and xmm0-xmm6 +//////////////////////////////////////////////////////////////////////////////// +.macro GCM_INIT GDATA_KEY, GDATA_CTX, IV, A_IN, A_LEN + +#define AAD_HASH %xmm0 +#define SUBHASH %xmm1 + + movdqu HashKey(\GDATA_KEY), SUBHASH + + CALC_AAD_HASH \A_IN, \A_LEN, AAD_HASH, SUBHASH, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %r10, %r11, %r12, %r13, %rax + pxor %xmm3, %xmm2 + mov \A_LEN, %r10 + + movdqu AAD_HASH, AadHash(\GDATA_CTX) // ctx_data.aad hash = aad_hash + mov %r10, AadLen(\GDATA_CTX) // ctx_data.aad_length = aad_length + xor %r10, %r10 + mov %r10, InLen(\GDATA_CTX) // ctx_data.in_length = 0 + mov %r10, PBlockLen(\GDATA_CTX) // ctx_data.partial_block_length = 0 + movdqu %xmm2, PBlockEncKey(\GDATA_CTX) // ctx_data.partial_block_enc_key = 0 + mov \IV, %r10 + movdqa ONEf(%rip), %xmm2 // read 12 IV bytes and pad with 0x00000001 + pinsrq $0, (%r10), %xmm2 + pinsrd $2, 8(%r10), %xmm2 + movdqu %xmm2, OrigIV(\GDATA_CTX) // ctx_data.orig_IV = iv + + pshufb SHUF_MASK(%rip), %xmm2 + + movdqu %xmm2, CurCount(\GDATA_CTX) // ctx_data.current_counter = iv +.endm // GCM_INIT + + +//////////////////////////////////////////////////////////////////////////////// +// GCM_ENC_DEC Encodes/Decodes given data. 
Assumes that the passed +// gcm_context_data struct has been initialized by GCM_INIT. +// Requires the input data be at least 1 byte long because of +// READ_SMALL_INPUT_DATA. +// Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), +// input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN) and whether +// encoding or decoding (ENC_DEC). +// Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated +// GDATA_CTX +// Clobbers rax, r10-r15, and xmm0-xmm15 +//////////////////////////////////////////////////////////////////////////////// +.macro GCM_ENC_DEC GDATA_KEY, GDATA_CTX, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ + PLAIN_CYPH_LEN, ENC_DEC + +#define DATA_OFFSET %r11 + + // clang compat: no local support + // LOCAL _initial_num_blocks_is_7, _initial_num_blocks_is_6 + // LOCAL _initial_num_blocks_is_5, _initial_num_blocks_is_4 + // LOCAL _initial_num_blocks_is_3, _initial_num_blocks_is_2 + // LOCAL _initial_num_blocks_is_1, _initial_num_blocks_is_0 + // LOCAL _initial_blocks_encrypted, _encrypt_by_8_new, _encrypt_by_8 + // LOCAL _eight_cipher_left, _zero_cipher_left, _large_enough_update + // LOCAL _data_read, _less_than_8_bytes_left, _multiple_of_16_bytes + +// Macro flow: +// calculate the number of 16byte blocks in the message +// process (number of 16byte blocks) mod 8 '_initial_num_blocks_is_# .. _initial_blocks_encrypted' +// process 8 16 byte blocks at a time until all are done '_encrypt_by_8_new .. _eight_cipher_left' +// if there is a block of less tahn 16 bytes process it '_zero_cipher_left .. _multiple_of_16_bytes' + + cmp $0, \PLAIN_CYPH_LEN + je _multiple_of_16_bytes_\@ + + xor DATA_OFFSET, DATA_OFFSET + add \PLAIN_CYPH_LEN, InLen(\GDATA_CTX) //Update length of data processed + movdqu HashKey(\GDATA_KEY), %xmm13 // xmm13 = HashKey + movdqu AadHash(\GDATA_CTX), %xmm8 + + + PARTIAL_BLOCK \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, \PLAIN_CYPH_LEN, DATA_OFFSET, %xmm8, \ENC_DEC + + mov \PLAIN_CYPH_LEN, %r13 // save the number of bytes of plaintext/ciphertext + sub DATA_OFFSET, %r13 + mov %r13, %r10 //save the amount of data left to process in r10 + and $-16, %r13 // r13 = r13 - (r13 mod 16) + + mov %r13, %r12 + shr $4, %r12 + and $7, %r12 + jz _initial_num_blocks_is_0_\@ + + + cmp $7, %r12 + je _initial_num_blocks_is_7_\@ + cmp $6, %r12 + je _initial_num_blocks_is_6_\@ + cmp $5, %r12 + je _initial_num_blocks_is_5_\@ + cmp $4, %r12 + je _initial_num_blocks_is_4_\@ + cmp $3, %r12 + je _initial_num_blocks_is_3_\@ + cmp $2, %r12 + je _initial_num_blocks_is_2_\@ + + jmp _initial_num_blocks_is_1_\@ + +_initial_num_blocks_is_7_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*7), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_6_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*6), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_5_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*5), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_4_\@: + 
INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*4), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_3_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*3), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_2_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*2), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_1_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*1), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_0_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + +_initial_blocks_encrypted_\@: + cmp $0, %r13 + je _zero_cipher_left_\@ + + sub $128, %r13 + je _eight_cipher_left_\@ + + movd %xmm9, %r15d + and $255, %r15d + pshufb SHUF_MASK(%rip), %xmm9 + + +_encrypt_by_8_new_\@: + cmp $(255-8), %r15d + jg _encrypt_by_8_\@ + + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL \GDATA_KEY, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, DATA_OFFSET, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + add $128, DATA_OFFSET + sub $128, %r13 + jne _encrypt_by_8_new_\@ + + pshufb SHUF_MASK(%rip), %xmm9 + jmp _eight_cipher_left_\@ + +_encrypt_by_8_\@: + pshufb SHUF_MASK(%rip), %xmm9 + add $8, %r15b + + GHASH_8_ENCRYPT_8_PARALLEL \GDATA_KEY, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, DATA_OFFSET, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + pshufb SHUF_MASK(%rip), %xmm9 + add $128, DATA_OFFSET + sub $128, %r13 + jne _encrypt_by_8_new_\@ + + pshufb SHUF_MASK(%rip), %xmm9 + + + +_eight_cipher_left_\@: + GHASH_LAST_8 \GDATA_KEY, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + + +_zero_cipher_left_\@: + movdqu %xmm14, AadHash(\GDATA_CTX) + movdqu %xmm9, CurCount(\GDATA_CTX) + + mov %r10, %r13 + and $15, %r13 // r13 = (\PLAIN_CYPH_LEN mod 16) + + je _multiple_of_16_bytes_\@ + + mov %r13, PBlockLen(\GDATA_CTX) // my_ctx.data.partial_blck_length = r13 + // handle the last <16 Byte block seperately + + paddd ONE(%rip), %xmm9 // INCR CNT to get Yn + movdqu %xmm9, CurCount(\GDATA_CTX) // my_ctx.data.current_counter = xmm9 + pshufb SHUF_MASK(%rip), %xmm9 + ENCRYPT_SINGLE_BLOCK \GDATA_KEY, %xmm9, %xmm2 // E(K, Yn) + movdqu %xmm9, PBlockEncKey(\GDATA_CTX) // my_ctx_data.partial_block_enc_key = xmm9 + + cmp $16, \PLAIN_CYPH_LEN + jge _large_enough_update_\@ + + lea (\PLAIN_CYPH_IN, DATA_OFFSET), %r10 + READ_SMALL_DATA_INPUT %xmm1, %r10, %r13, %r12, %r15, %rax + lea (SHIFT_MASK + 16)(%rip), %r12 + sub %r13, %r12 + jmp 
_data_read_\@ + +_large_enough_update_\@: + sub $16, DATA_OFFSET + add %r13, DATA_OFFSET + + movdqu (\PLAIN_CYPH_IN, DATA_OFFSET), %xmm1 // receive the last <16 Byte block + + sub %r13, DATA_OFFSET + add $16, DATA_OFFSET + + lea (SHIFT_MASK + 16)(%rip), %r12 + sub %r13, %r12 // adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16) + movdqu (%r12), %xmm2 // get the appropriate shuffle mask + pshufb %xmm2, %xmm1 // shift right 16-r13 bytes +_data_read_\@: + .ifc \ENC_DEC, DEC + + movdqa %xmm1, %xmm2 + pxor %xmm1, %xmm9 // Plaintext XOR E(K, Yn) + movdqu (ALL_F - SHIFT_MASK)(%r12), %xmm1 // get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand %xmm1, %xmm9 // mask out top 16-r13 bytes of xmm9 + pand %xmm1, %xmm2 + pshufb SHUF_MASK(%rip), %xmm2 + pxor %xmm2, %xmm14 + movdqu %xmm14, AadHash(\GDATA_CTX) + + .else // .ifc \ENC_DEC, DEC + + pxor %xmm1, %xmm9 // Plaintext XOR E(K, Yn) + movdqu (ALL_F - SHIFT_MASK)(%r12), %xmm1 // get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand %xmm1, %xmm9 // mask out top 16-r13 bytes of xmm9 + pshufb SHUF_MASK(%rip), %xmm9 + pxor %xmm9, %xmm14 + movdqu %xmm14, AadHash(\GDATA_CTX) + + pshufb SHUF_MASK(%rip), %xmm9 // shuffle xmm9 back to output as ciphertext + + .endif // .ifc \ENC_DEC, DEC + + + ////////////////////////////////////////////////////////// + // output r13 Bytes + movq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_\@ + + mov %rax, (\CYPH_PLAIN_OUT, DATA_OFFSET) + add $8, DATA_OFFSET + psrldq $8, %xmm9 + movq %xmm9, %rax + sub $8, %r13 + +_less_than_8_bytes_left_\@: + movb %al, (\CYPH_PLAIN_OUT, DATA_OFFSET) + add $1, DATA_OFFSET + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_\@ + ////////////////////////////////////////////////////////// + +_multiple_of_16_bytes_\@: + +.endm // GCM_ENC_DEC + + +//////////////////////////////////////////////////////////////////////////////// +// GCM_COMPLETE: Finishes Encyrption/Decryption of last partial block after +// GCM_UPDATE finishes. +// Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and +// whether encoding or decoding (ENC_DEC). 
+// Output: Authorization Tag (AUTH_TAG) and Authorization Tag length +// (AUTH_TAG_LEN) +// Clobbers %rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 +//////////////////////////////////////////////////////////////////////////////// +.macro GCM_COMPLETE GDATA_KEY, GDATA_CTX, AUTH_TAG, AUTH_TAG_LEN, ENC_DEC + +#define PLAIN_CYPH_LEN %rax + + // clang compat: no local support + // LOCAL _partial_done, _return_T, _T_8, _T_12, _T_16, _return_T_done + + mov PBlockLen(\GDATA_CTX), %r12 // r12 = aadLen (number of bytes) + movdqu AadHash(\GDATA_CTX), %xmm14 + movdqu HashKey(\GDATA_KEY), %xmm13 + + cmp $0, %r12 + + je _partial_done_\@ + + GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 //GHASH computation for the last <16 Byte block + movdqu %xmm14, AadHash(\GDATA_CTX) + +_partial_done_\@: + + mov AadLen(\GDATA_CTX), %r12 // r12 = aadLen (number of bytes) + mov InLen(\GDATA_CTX), PLAIN_CYPH_LEN + + shl $3, %r12 // convert into number of bits + movd %r12d, %xmm15 // len(A) in xmm15 + + shl $3, PLAIN_CYPH_LEN // len(C) in bits (*128) + movq PLAIN_CYPH_LEN, %xmm1 + pslldq $8, %xmm15 // xmm15 = len(A)|| 0x0000000000000000 + pxor %xmm1, %xmm15 // xmm15 = len(A)||len(C) + + pxor %xmm15, %xmm14 + GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 // final GHASH computation + pshufb SHUF_MASK(%rip), %xmm14 // perform a 16Byte swap + movdqu OrigIV(\GDATA_CTX), %xmm9 // xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK \GDATA_KEY, %xmm9, %xmm2 // E(K, Y0) + + pxor %xmm14, %xmm9 + +_return_T_\@: + mov \AUTH_TAG, %r10 // r10 = authTag + mov \AUTH_TAG_LEN, %r11 // r11 = auth_tag_len + + cmp $16, %r11 + je _T_16_\@ + + cmp $12, %r11 + je _T_12_\@ + +_T_8_\@: + movq %xmm9, %rax + mov %rax, (%r10) + jmp _return_T_done_\@ + +_T_12_\@: + movq %xmm9, %rax + mov %rax, (%r10) + psrldq $8, %xmm9 + movd %xmm9, %eax + mov %eax, 8(%r10) + jmp _return_T_done_\@ + +_T_16_\@: + movdqu %xmm9, (%r10) + +_return_T_done_\@: +.endm //GCM_COMPLETE + + +#if 1 + + .balign 16 +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_precomp_{128,256}_sse +// (struct gcm_key_data *key_data); +//////////////////////////////////////////////////////////////////////////////// +#if FUNCT_EXTENSION != _nt +.global FN_NAME(precomp,_) +FN_NAME(precomp,_): + + endbranch + + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + sub $(VARIABLE_OFFSET), %rsp + and $(~63), %rsp // align rsp to 64 bytes + +#if __OUTPUT_FORMAT__ == win64 + // only xmm6 needs to be maintained + movdqu %xmm6, (LOCAL_STORAGE + 0*16)(%rsp) +#endif + + pxor %xmm6, %xmm6 + ENCRYPT_SINGLE_BLOCK arg1, %xmm6, %xmm2 // xmm6 = HashKey + + pshufb SHUF_MASK(%rip), %xmm6 + /////////////// PRECOMPUTATION of HashKey<<1 mod poly from the HashKey + movdqa %xmm6, %xmm2 + psllq $1, %xmm6 + psrlq $63, %xmm2 + movdqa %xmm2, %xmm1 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + por %xmm2, %xmm6 + + //reduction + pshufd $0b00100100, %xmm1, %xmm2 + pcmpeqd TWOONE(%rip), %xmm2 + pand POLY(%rip), %xmm2 + pxor %xmm2, %xmm6 // xmm6 holds the HashKey<<1 mod poly + /////////////////////////////////////////////////////////////////////// + movdqu %xmm6, HashKey(arg1) // store HashKey<<1 mod poly + + PRECOMPUTE arg1, %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + +#if __OUTPUT_FORMAT__ == win64 + movdqu (LOCAL_STORAGE + 0*16)(%rsp), %xmm6 +#endif + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + ret +#endif // _nt + + +//////////////////////////////////////////////////////////////////////////////// +//void 
aes_gcm_init_128_sse / aes_gcm_init_256_sse ( +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *iv, +// const u8 *aad, +// u64 aad_len); +//////////////////////////////////////////////////////////////////////////////// +#if FUNCT_EXTENSION != _nt +.global FN_NAME(init,_) +FN_NAME(init,_): + endbranch + + push %r12 + push %r13 +#if __OUTPUT_FORMAT__ == win64 + push arg5 + sub $(1*16), %rsp + movdqu %xmm6, (0*16)(%rsp) + mov (1*16 + 8*3 + 8*5)(%rsp), arg5 +#endif + + GCM_INIT arg1, arg2, arg3, arg4, arg5 + +#if __OUTPUT_FORMAT__ == win64 + movdqu (0*16)(%rsp), %xmm6 + add $(1*16), %rsp + pop arg5 +#endif + pop %r13 + pop %r12 + ret +#endif // _nt + + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_enc_128_update_sse / aes_gcm_enc_256_update_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *out, +// const u8 *in, +// u64 plaintext_len); +//////////////////////////////////////////////////////////////////////////////// +.global FN_NAME(enc,_update_) +FN_NAME(enc,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + FUNC_RESTORE + + ret + + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_dec_256_update_sse / aes_gcm_dec_256_update_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *out, +// const u8 *in, +// u64 plaintext_len); +//////////////////////////////////////////////////////////////////////////////// +.global FN_NAME(dec,_update_) +FN_NAME(dec,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + FUNC_RESTORE + + ret + + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_256_finalize_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *auth_tag, +// u64 auth_tag_len); +//////////////////////////////////////////////////////////////////////////////// +#if FUNCT_EXTENSION != _nt +.global FN_NAME(enc,_finalize_) +FN_NAME(enc,_finalize_): + + endbranch + + push %r12 + +#if __OUTPUT_FORMAT__ == win64 + // xmm6:xmm15 need to be maintained for Windows + sub $(5*16), %rsp + movdqu %xmm6, (0*16)(%rsp) + movdqu %xmm9, (1*16)(%rsp) + movdqu %xmm11, (2*16)(%rsp) + movdqu %xmm14, (3*16)(%rsp) + movdqu %xmm15, (4*16)(%rsp) +#endif + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC + +#if __OUTPUT_FORMAT__ == win64 + movdqu (4*16)(%rsp), %xmm15 + movdqu (3*16)(%rsp), %xmm14 + movdqu (2*16)(%rsp), %xmm11 + movdqu (1*16)(%rsp), %xmm9 + movdqu (0*16)(%rsp), %xmm6 + add $(5*16), %rsp +#endif + + pop %r12 + ret +#endif // _nt + + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_256_finalize_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *auth_tag, +// u64 auth_tag_len); +//////////////////////////////////////////////////////////////////////////////// +#if FUNCT_EXTENSION != _nt +.global FN_NAME(dec,_finalize_) +FN_NAME(dec,_finalize_): + + endbranch + + push %r12 + +#if __OUTPUT_FORMAT == win64 + // xmm6:xmm15 need to be maintained for Windows + sub $(5*16), %rsp + movdqu %xmm6, (0*16)(%rsp) + movdqu %xmm9, (1*16)(%rsp) + movdqu %xmm11, (2*16)(%rsp) + movdqu %xmm14, (3*16)(%rsp) + movdqu %xmm15, (4*16)(%rsp) +#endif + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC + 
+#if __OUTPUT_FORMAT__ == win64 + movdqu (4*16)(%rsp), %xmm15 + movdqu (3*16)(%rsp), %xmm14 + movdqu (2*16)(%rsp), %xmm11 + movdqu (1*16)(%rsp), %xmm9 + movdqu (0*16)(%rsp), %xmm6 + add $(5*16), %rsp +#endif + + pop %r12 + ret +#endif // _nt + + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_enc_128_sse / aes_gcm_enc_256_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *out, +// const u8 *in, +// u64 plaintext_len, +// u8 *iv, +// const u8 *aad, +// u64 aad_len, +// u8 *auth_tag, +// u64 auth_tag_len)// +//////////////////////////////////////////////////////////////////////////////// +.global FN_NAME(enc,_) +FN_NAME(enc,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC + FUNC_RESTORE + + ret + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_dec_128_sse / aes_gcm_dec_256_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *out, +// const u8 *in, +// u64 plaintext_len, +// u8 *iv, +// const u8 *aad, +// u64 aad_len, +// u8 *auth_tag, +// u64 auth_tag_len)// +//////////////////////////////////////////////////////////////////////////////// +.global FN_NAME(dec,_) +FN_NAME(dec,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC + FUNC_RESTORE + + ret + +.global FN_NAME(this_is_gas,_) +FN_NAME(this_is_gas,_): + endbranch + FUNC_SAVE + FUNC_RESTORE + ret + +#else + // GAS doesnt't provide the linenuber in the macro + //////////////////////// + // GHASH_MUL xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 + // PRECOMPUTE rax, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 + // READ_SMALL_DATA_INPUT xmm1, r10, 8, rax, r12, r15 + // ENCRYPT_SINGLE_BLOCK rax, xmm0, xmm1 + // INITIAL_BLOCKS rdi,rsi,rdx,rcx,r13,r11,7,xmm12,xmm13,xmm14,xmm15,xmm11,xmm9,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8,xmm10,xmm0,ENC + // CALC_AAD_HASH [r14+8*5+8*1],[r14+8*5+8*2],xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,r10,r11,r12,r13,rax + // READ_SMALL_DATA_INPUT xmm2, r10, r11, r12, r13, rax + // PARTIAL_BLOCK rdi,rsi,rdx,rcx,r8,r11,xmm8,ENC + // GHASH_8_ENCRYPT_8_PARALLEL rdi,rdx,rcx,r11,xmm0,xmm10,xmm11,xmm12,xmm13,xmm14,xmm9,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8,xmm15,out_order,ENC + //GHASH_LAST_8 rdi,xmm0,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8 +#endif diff --git a/contrib/icp/gcm-simd/isa-l_crypto-ported/reg_sizes.S b/contrib/icp/gcm-simd/isa-l_crypto-ported/reg_sizes.S new file mode 100644 index 000000000000..0b63dbd2a0ef --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto-ported/reg_sizes.S @@ -0,0 +1,224 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Copyright(c) 2011-2019 Intel Corporation All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES// LOSS OF USE, +// DATA, OR PROFITS// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// Port to GNU as and translation to GNU as att-syntax +// Copyright(c) 2023 Attila Fülöp + +#ifndef _REG_SIZES_ASM_ +#define _REG_SIZES_ASM_ + + +// define d, w and b variants for registers + +.macro dwordreg reg + .if \reg == %r8 || \reg == %r9 || \reg == %r10 || \reg == %r11 || \reg == %r12 || \reg == %r13 || \reg == %r14 || \reg == %r15 + .set dreg, \reg\()d + .elseif \reg == %rax + .set dreg, %eax + .elseif \reg == %rcx + .set dreg, %ecx + .elseif \reg == %rdx + .set dreg, %edx + .elseif \reg == %rbx + .set dreg, %ebx + .elseif \reg == %rsp + .set dreg, %esp + .elseif \reg == %rbp + .set dreg, %ebp + .elseif \reg == %rsi + .set dreg, %esi + .elseif \reg == %rdi + .set dreg, %edi + .else + .error "Invalid register '\reg\()' while expanding macro 'dwordreg\()'" + .endif +.endm + +.macro wordreg reg + .if \reg == %r8 || \reg == %r9 || \reg == %r10 || \reg == %r11 || \reg == %r12 || \reg == %r13 || \reg == %r14 || \reg == %r15 + .set wreg, \reg\()w + .elseif \reg == %rax + .set wreg, %ax + .elseif \reg == %rcx + .set wreg, %cx + .elseif \reg == %rdx + .set wreg, %dx + .elseif \reg == %rbx + .set wreg, %bx + .elseif \reg == %rsp + .set wreg, %sp + .elseif \reg == %rbp + .set wreg, %bp + .elseif \reg == %rsi + .set wreg, %si + .elseif \reg == %rdi + .set wreg, %di + .else + .error "Invalid register '\reg\()' while expanding macro 'wordreg\()'" + .endif +.endm + + +.macro bytereg reg + .if \reg == %r8 || \reg == %r9 || \reg == %r10 || \reg == %r11 || \reg == %r12 || \reg == %r13 || \reg == %r14 || \reg == %r15 + .set breg, \reg\()b + .elseif \reg == %rax + .set breg, %al + .elseif \reg == %rcx + .set breg, %cl + .elseif \reg == %rdx + .set breg, %dl + .elseif \reg == %rbx + .set breg, %bl + .elseif \reg == rsp + .set breg, %spl + .elseif \reg == %rbp + .set breg, %bpl + .elseif \reg == rsi + .set breg, %sil + .elseif \reg == rdi + .set breg, %dil + .else + .error "Invalid register '\reg\()' while expanding macro 'bytereg\()'" + .endif +.endm + +// clang compat: Below won't owrk with clang; do it a bit different +// #define ZERO_TO_THIRTYONE \ +// 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, \ +// 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + +// 
.macro xword reg +// .irep i, ZERO_TO_THIRTYONE +// .if \reg == %xmm\i || \reg == %ymm\i || \reg == %zmm\i +// .set xmmreg, %xmm\i +// .endif +// .endr +// .endm + +// .macro yword reg +// .irep i, ZERO_TO_THIRTYONE +// .if \reg == %xmm\i || \reg == %ymm\i || \reg == %zmm\i +// .set ymmreg, %ymm\i +// .endif +// .endr +// .endm + +// .macro zword reg +// .irep i, ZERO_TO_THIRTYONE +// .if \reg == %xmm\i || \reg == %ymm\i || \reg == %zmm\i +// .set zmmreg, %zmm\i +// .endif +// .endr +// .endm + +// Example usage: +// xword %zmm12 +// pxor xmmreg, xmmreg // => pxor %xmm12, %xmm12 +.macro xword reg + .set i, 0 + .rep 32 + .altmacro + do_xyzword <\reg>, xmm, %i + .noaltmacro + .set i, (i+1) + .endr +.endm + +.macro yword reg + .set i, 0 + .rep 32 + .altmacro + do_xyzword <\reg>, ymm, %i + .noaltmacro + .set i, (i+1) + .endr +.endm + +.macro zword reg + .set i, 0 + .rep 32 + .altmacro + do_xyzword <\reg>, zmm, %i + .noaltmacro + .set i, (i+1) + .endr +.endm + +.macro do_xyzword creg, prfx, idx + .if \creg == %xmm\idx || \creg == %ymm\idx || \creg == %zmm\idx + .set \prfx\()reg, %\prfx\idx + .endif +.endm + + +// FIXME: handle later +#define elf32 1 +#define elf64 2 +#define win64 3 +#define machos64 4 + +#ifndef __OUTPUT_FORMAT__ +#define __OUTPUT_FORMAT__ elf64 +#endif + +#if __OUTPUT_FORMAT__ == elf32 +.section .note.GNU-stack,"",%progbits +.section .text +#endif +#if __OUTPUT_FORMAT__ == elf64 +#ifndef __x86_64__ +#define __x86_64__ +#endif +.section .note.GNU-stack,"",%progbits +.section .text +#endif +#if __OUTPUT_FORMAT__ == win64 +#define __x86_64__ +#endif +#if __OUTPUT_FORMAT__ == macho64 +#define __x86_64__ +#endif + + +#ifdef __x86_64__ +#define endbranch .byte 0xf3, 0x0f, 0x1e, 0xfa +#else +#define endbranch .byte 0xf3, 0x0f, 0x1e, 0xfb +#endif + +#ifdef REL_TEXT +#define WRT_OPT +#elif __OUTPUT_FORMAT__ == elf64 +#define WRT_OPT wrt ..plt +#else +#define WRT_OPT +#endif + +#endif // ifndef _REG_SIZES_ASM_ diff --git a/contrib/icp/gcm-simd/isa-l_crypto/LICENSE b/contrib/icp/gcm-simd/isa-l_crypto/LICENSE new file mode 100644 index 000000000000..ecebef110b46 --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto/LICENSE @@ -0,0 +1,26 @@ + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/contrib/icp/gcm-simd/isa-l_crypto/README b/contrib/icp/gcm-simd/isa-l_crypto/README new file mode 100644 index 000000000000..55317bb4459b --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto/README @@ -0,0 +1,10 @@ +This directory contains some of the original "Intel(R) Intelligent Storage +Acceleration Library Crypto Version" [1] GCM x86-64 assembly files [2]. They +are included here for reference purposes only. + +These files were ported to the GNU assembler to be used within the ICP. The +ported version can be found in the isa-l_crypto-ported directory one level up. + + +[1] https://github.com/intel/isa-l_crypto +[2] https://github.com/intel/isa-l_crypto/tree/v2.24.0/aes \ No newline at end of file diff --git a/contrib/icp/gcm-simd/isa-l_crypto/gcm128_sse.asm b/contrib/icp/gcm-simd/isa-l_crypto/gcm128_sse.asm new file mode 100644 index 000000000000..1717a86628fd --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto/gcm128_sse.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%include "gcm_sse.asm" diff --git a/contrib/icp/gcm-simd/isa-l_crypto/gcm256_sse.asm b/contrib/icp/gcm-simd/isa-l_crypto/gcm256_sse.asm new file mode 100644 index 000000000000..c583d02b86ca --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto/gcm256_sse.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%include "gcm_sse.asm" diff --git a/contrib/icp/gcm-simd/isa-l_crypto/gcm_defines.asm b/contrib/icp/gcm-simd/isa-l_crypto/gcm_defines.asm new file mode 100644 index 000000000000..e823b79596df --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto/gcm_defines.asm @@ -0,0 +1,291 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef GCM_DEFINES_ASM_INCLUDED +%define GCM_DEFINES_ASM_INCLUDED + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford + + +;;;;;; + +section .data + +align 16 + +POLY dq 0x0000000000000001, 0xC200000000000000 + +align 64 +POLY2 dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 +align 16 +TWOONE dq 0x0000000000000001, 0x0000000100000000 + +; order of these constants should not change. +; more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F + +align 64 +SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + +SHIFT_MASK dq 0x0706050403020100, 0x0f0e0d0c0b0a0908 +ALL_F dq 0xffffffffffffffff, 0xffffffffffffffff +ZERO dq 0x0000000000000000, 0x0000000000000000 +ONE dq 0x0000000000000001, 0x0000000000000000 +TWO dq 0x0000000000000002, 0x0000000000000000 +ONEf dq 0x0000000000000000, 0x0100000000000000 +TWOf dq 0x0000000000000000, 0x0200000000000000 + +align 64 +ddq_add_1234: + dq 0x0000000000000001, 0x0000000000000000 + dq 0x0000000000000002, 0x0000000000000000 + dq 0x0000000000000003, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + +align 64 +ddq_add_5678: + dq 0x0000000000000005, 0x0000000000000000 + dq 0x0000000000000006, 0x0000000000000000 + dq 0x0000000000000007, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + +align 64 +ddq_add_4444: + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + +align 64 +ddq_add_8888: + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + +align 64 +ddq_addbe_1234: + dq 0x0000000000000000, 0x0100000000000000 + dq 0x0000000000000000, 0x0200000000000000 + dq 0x0000000000000000, 0x0300000000000000 + dq 0x0000000000000000, 0x0400000000000000 + +align 64 +ddq_addbe_5678: + dq 0x0000000000000000, 0x0500000000000000 + dq 0x0000000000000000, 0x0600000000000000 + dq 0x0000000000000000, 0x0700000000000000 + dq 0x0000000000000000, 0x0800000000000000 + +align 64 +ddq_addbe_4444: + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + +align 64 +ddq_addbe_8888: + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + +align 64 +byte_len_to_mask_table: + dw 0x0000, 0x0001, 0x0003, 0x0007, + dw 0x000f, 0x001f, 0x003f, 0x007f, + dw 0x00ff, 0x01ff, 0x03ff, 0x07ff, + dw 
0x0fff, 0x1fff, 0x3fff, 0x7fff, + dw 0xffff + +align 64 +byte64_len_to_mask_table: + dq 0x0000000000000000, 0x0000000000000001 + dq 0x0000000000000003, 0x0000000000000007 + dq 0x000000000000000f, 0x000000000000001f + dq 0x000000000000003f, 0x000000000000007f + dq 0x00000000000000ff, 0x00000000000001ff + dq 0x00000000000003ff, 0x00000000000007ff + dq 0x0000000000000fff, 0x0000000000001fff + dq 0x0000000000003fff, 0x0000000000007fff + dq 0x000000000000ffff, 0x000000000001ffff + dq 0x000000000003ffff, 0x000000000007ffff + dq 0x00000000000fffff, 0x00000000001fffff + dq 0x00000000003fffff, 0x00000000007fffff + dq 0x0000000000ffffff, 0x0000000001ffffff + dq 0x0000000003ffffff, 0x0000000007ffffff + dq 0x000000000fffffff, 0x000000001fffffff + dq 0x000000003fffffff, 0x000000007fffffff + dq 0x00000000ffffffff, 0x00000001ffffffff + dq 0x00000003ffffffff, 0x00000007ffffffff + dq 0x0000000fffffffff, 0x0000001fffffffff + dq 0x0000003fffffffff, 0x0000007fffffffff + dq 0x000000ffffffffff, 0x000001ffffffffff + dq 0x000003ffffffffff, 0x000007ffffffffff + dq 0x00000fffffffffff, 0x00001fffffffffff + dq 0x00003fffffffffff, 0x00007fffffffffff + dq 0x0000ffffffffffff, 0x0001ffffffffffff + dq 0x0003ffffffffffff, 0x0007ffffffffffff + dq 0x000fffffffffffff, 0x001fffffffffffff + dq 0x003fffffffffffff, 0x007fffffffffffff + dq 0x00ffffffffffffff, 0x01ffffffffffffff + dq 0x03ffffffffffffff, 0x07ffffffffffffff + dq 0x0fffffffffffffff, 0x1fffffffffffffff + dq 0x3fffffffffffffff, 0x7fffffffffffffff + dq 0xffffffffffffffff + +align 64 +mask_out_top_block: + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0x0000000000000000, 0x0000000000000000 + +section .text + + +;;define the fields of gcm_data struct +;typedef struct gcm_data +;{ +; u8 expanded_keys[16*15]; +; u8 shifted_hkey_1[16]; // store HashKey <<1 mod poly here +; u8 shifted_hkey_2[16]; // store HashKey^2 <<1 mod poly here +; u8 shifted_hkey_3[16]; // store HashKey^3 <<1 mod poly here +; u8 shifted_hkey_4[16]; // store HashKey^4 <<1 mod poly here +; u8 shifted_hkey_5[16]; // store HashKey^5 <<1 mod poly here +; u8 shifted_hkey_6[16]; // store HashKey^6 <<1 mod poly here +; u8 shifted_hkey_7[16]; // store HashKey^7 <<1 mod poly here +; u8 shifted_hkey_8[16]; // store HashKey^8 <<1 mod poly here +; u8 shifted_hkey_1_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_2_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_3_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_4_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_5_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_6_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_7_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +; u8 shifted_hkey_8_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +;} gcm_data; + +%ifndef GCM_KEYS_VAES_AVX512_INCLUDED +%define HashKey 16*15 ; store HashKey <<1 mod poly here +%define HashKey_1 16*15 ; store HashKey <<1 mod poly here +%define 
HashKey_2 16*16 ; store HashKey^2 <<1 mod poly here +%define HashKey_3 16*17 ; store HashKey^3 <<1 mod poly here +%define HashKey_4 16*18 ; store HashKey^4 <<1 mod poly here +%define HashKey_5 16*19 ; store HashKey^5 <<1 mod poly here +%define HashKey_6 16*20 ; store HashKey^6 <<1 mod poly here +%define HashKey_7 16*21 ; store HashKey^7 <<1 mod poly here +%define HashKey_8 16*22 ; store HashKey^8 <<1 mod poly here +%define HashKey_k 16*23 ; store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) +%define HashKey_2_k 16*24 ; store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +%define HashKey_3_k 16*25 ; store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +%define HashKey_4_k 16*26 ; store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +%define HashKey_5_k 16*27 ; store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +%define HashKey_6_k 16*28 ; store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +%define HashKey_7_k 16*29 ; store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +%define HashKey_8_k 16*30 ; store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +%endif + +%define AadHash 16*0 ; store current Hash of data which has been input +%define AadLen 16*1 ; store length of input data which will not be encrypted or decrypted +%define InLen (16*1)+8 ; store length of input data which will be encrypted or decrypted +%define PBlockEncKey 16*2 ; encryption key for the partial block at the end of the previous update +%define OrigIV 16*3 ; input IV +%define CurCount 16*4 ; Current counter for generation of encryption key +%define PBlockLen 16*5 ; length of partial block at the end of the previous update + +%define reg(q) xmm %+ q +%define arg(x) [r14 + STACK_OFFSET + 8*x] + + + + +%ifnidn __OUTPUT_FORMAT__, elf64 + %xdefine arg1 rcx + %xdefine arg2 rdx + %xdefine arg3 r8 + %xdefine arg4 r9 + %xdefine arg5 rsi ;[r14 + STACK_OFFSET + 8*5] - need push and load + %xdefine arg6 [r14 + STACK_OFFSET + 8*6] + %xdefine arg7 [r14 + STACK_OFFSET + 8*7] + %xdefine arg8 [r14 + STACK_OFFSET + 8*8] + %xdefine arg9 [r14 + STACK_OFFSET + 8*9] + %xdefine arg10 [r14 + STACK_OFFSET + 8*10] + +%else + %xdefine arg1 rdi + %xdefine arg2 rsi + %xdefine arg3 rdx + %xdefine arg4 rcx + %xdefine arg5 r8 + %xdefine arg6 r9 + %xdefine arg7 [r14 + STACK_OFFSET + 8*1] + %xdefine arg8 [r14 + STACK_OFFSET + 8*2] + %xdefine arg9 [r14 + STACK_OFFSET + 8*3] + %xdefine arg10 [r14 + STACK_OFFSET + 8*4] +%endif + +%ifdef NT_LDST + %define NT_LD + %define NT_ST +%endif + +;;; Use Non-temporal load/stor +%ifdef NT_LD + %define XLDR movntdqa + %define VXLDR vmovntdqa + %define VX512LDR vmovntdqa +%else + %define XLDR movdqu + %define VXLDR vmovdqu + %define VX512LDR vmovdqu8 +%endif + +;;; Use Non-temporal load/stor +%ifdef NT_ST + %define XSTR movntdq + %define VXSTR vmovntdq + %define VX512STR vmovntdq +%else + %define XSTR movdqu + %define VXSTR vmovdqu + %define VX512STR vmovdqu8 +%endif + +%endif ; GCM_DEFINES_ASM_INCLUDED diff --git a/contrib/icp/gcm-simd/isa-l_crypto/gcm_sse.asm b/contrib/icp/gcm-simd/isa-l_crypto/gcm_sse.asm new file mode 100644 index 000000000000..e35860496357 --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto/gcm_sse.asm @@ -0,0 +1,2171 @@ 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; +; For the shift-based reductions used in this code, we used the method described in paper: +; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010. 
+; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "reg_sizes.asm" +%include "gcm_defines.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_sse.asm!" 
+%endif +%endif +%endif + +%ifndef FUNCT_EXTENSION +%define FUNCT_EXTENSION +%endif + +%ifdef GCM128_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ sse %+ FUNCT_EXTENSION +%define NROUNDS 9 +%endif + +%ifdef GCM192_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ sse %+ FUNCT_EXTENSION +%define NROUNDS 11 +%endif + +%ifdef GCM256_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ sse %+ FUNCT_EXTENSION +%define NROUNDS 13 +%endif + + +default rel +; need to push 5 registers into stack to maintain +%define STACK_OFFSET 8*5 + +%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) +%define TMP3 16*1 ; Temporary storage for AES State 3 +%define TMP4 16*2 ; Temporary storage for AES State 4 +%define TMP5 16*3 ; Temporary storage for AES State 5 +%define TMP6 16*4 ; Temporary storage for AES State 6 +%define TMP7 16*5 ; Temporary storage for AES State 7 +%define TMP8 16*6 ; Temporary storage for AES State 8 + +%define LOCAL_STORAGE 16*7 + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +; Input: A and B (128-bits each, bit-reflected) +; Output: C = A*B*x mod poly, (i.e. >>1 ) +; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes +%define %%HK %2 ; 16 Bytes +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Karatsuba Method + movdqa %%T1, %%GH + pshufd %%T2, %%GH, 01001110b + pshufd %%T3, %%HK, 01001110b + pxor %%T2, %%GH ; %%T2 = (a1+a0) + pxor %%T3, %%HK ; %%T3 = (b1+b0) + + pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0 + pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T2, %%GH + pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0 + + movdqa %%T3, %%T2 + pslldq %%T3, 8 ; shift-L %%T3 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%GH, %%T3 + pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK + + + ;first phase of the reduction + movdqa %%T2, %%GH + movdqa %%T3, %%GH + movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T4, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T4 + + movdqa %%T5, %%T2 + psrldq %%T5, 4 ; shift-R %%T5 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + movdqa %%T2,%%GH ; make 3 copies of %%GH (in in %%T2, %%T3, %%T4) for doing three shift operations + movdqa %%T3,%%GH + movdqa %%T4,%%GH + + psrld %%T2,1 ; 
packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T4,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T4 + + pxor %%T2, %%T5 + pxor %%GH, %%T2 + pxor %%GH, %%T1 ; the result is in %%T1 + + +%endmacro + + +%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + movdqa %%T4, %%HK + pshufd %%T1, %%HK, 01001110b + pxor %%T1, %%HK + movdqu [%%GDATA + HashKey_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly + movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_2_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly + movdqu [%%GDATA + HashKey_3], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_3_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly + movdqu [%%GDATA + HashKey_4], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_4_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly + movdqu [%%GDATA + HashKey_5], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_5_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly + movdqu [%%GDATA + HashKey_6], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_6_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly + movdqu [%%GDATA + HashKey_7], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_7_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly + movdqu [%%GDATA + HashKey_8], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_8_k], %%T1 + + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. +; Returns 0 if data has length 0. +; Input: The input data (INPUT), that data's length (LENGTH). +; Output: The packed xmm register (OUTPUT). 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 6 +%define %%OUTPUT %1 ; %%OUTPUT is an xmm register +%define %%INPUT %2 +%define %%LENGTH %3 +%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers +%define %%COUNTER %5 +%define %%TMP1 %6 + + pxor %%OUTPUT, %%OUTPUT + mov %%COUNTER, %%LENGTH + mov %%END_READ_LOCATION, %%INPUT + add %%END_READ_LOCATION, %%LENGTH + xor %%TMP1, %%TMP1 + + + cmp %%COUNTER, 8 + jl %%_byte_loop_2 + pinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists + je %%_done + + sub %%COUNTER, 8 + +%%_byte_loop_1: ;Read in data 1 byte at a time while data is left + shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_1 + pinsrq %%OUTPUT, %%TMP1, 1 + jmp %%_done + +%%_byte_loop_2: ;Read in data 1 byte at a time while data is left + cmp %%COUNTER, 0 + je %%_done + shl %%TMP1, 8 ;This loop handles when no bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_2 + pinsrq %%OUTPUT, %%TMP1, 0 +%%_done: + +%endmacro ; READ_SMALL_DATA_INPUT + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 14 +%define %%A_IN %1 +%define %%A_LEN %2 +%define %%AAD_HASH %3 +%define %%HASH_KEY %4 +%define %%XTMP1 %5 ; xmm temp reg 5 +%define %%XTMP2 %6 +%define %%XTMP3 %7 +%define %%XTMP4 %8 +%define %%XTMP5 %9 ; xmm temp reg 5 +%define %%T1 %10 ; temp reg 1 +%define %%T2 %11 +%define %%T3 %12 +%define %%T4 %13 +%define %%T5 %14 ; temp reg 5 + + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + pxor %%AAD_HASH, %%AAD_HASH + + cmp %%T2, 16 + jl %%_get_small_AAD_block + +%%_get_AAD_loop16: + + movdqu %%XTMP1, [%%T1] + ;byte-reflect the AAD data + pshufb %%XTMP1, [SHUF_MASK] + pxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + + sub %%T2, 16 + je %%_CALC_AAD_done + + add %%T1, 16 + cmp %%T2, 16 + jge %%_get_AAD_loop16 + +%%_get_small_AAD_block: + READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 + ;byte-reflect the AAD data + pshufb %%XTMP1, [SHUF_MASK] + pxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + +%%_CALC_AAD_done: + +%endmacro ; CALC_AAD_HASH + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. +; Requires the input data be at least 1 byte long. +; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET), +; and whether encoding or decoding (ENC_DEC). 
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 8 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%DATA_OFFSET %6 +%define %%AAD_HASH %7 +%define %%ENC_DEC %8 + mov r13, [%%GDATA_CTX + PBlockLen] + cmp r13, 0 + je %%_partial_block_done ;Leave Macro if no partial blocks + + cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading + jl %%_fewer_than_16_bytes + XLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register + jmp %%_data_read + +%%_fewer_than_16_bytes: + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 + mov r13, [%%GDATA_CTX + PBlockLen] + +%%_data_read: ;Finished reading in data + + + movdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = ctx_data.partial_block_enc_key + movdqu xmm13, [%%GDATA_KEY + HashKey] + + lea r12, [SHIFT_MASK] + + add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + movdqu xmm2, [r12] ; get the appropriate shuffle mask + pshufb xmm9, xmm2 ;shift right r13 bytes + +%ifidn %%ENC_DEC, DEC + movdqa xmm3, xmm1 + pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_1: + + movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + pand xmm3, xmm1 + pshufb xmm3, [SHUF_MASK] + pshufb xmm3, xmm2 + pxor %%AAD_HASH, xmm3 + + + cmp r15,0 + jl %%_partial_incomplete_1 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_dec_done +%%_partial_incomplete_1: + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%%_dec_done: + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + +%else + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_2: + + movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + pshufb xmm9, [SHUF_MASK] + pshufb xmm9, xmm2 + pxor %%AAD_HASH, xmm9 + + cmp r15,0 + jl %%_partial_incomplete_2 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_encode_done +%%_partial_incomplete_2: + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%%_encode_done: + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + + pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + pshufb xmm9, xmm2 +%endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output encrypted Bytes + cmp r15,0 + jl %%_partial_fill + mov r12, r13 + mov 
r13, 16 + sub r13, r12 ; Set r13 to be the number of bytes to write out + jmp %%_count_set +%%_partial_fill: + mov r13, %%PLAIN_CYPH_LEN +%%_count_set: + movq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + psrldq xmm9, 8 + movq rax, xmm9 + sub r13, 8 +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +; if a = number of total plaintext bytes +; b = floor(a/16) +; %%num_initial_blocks = b mod 8; +; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext +; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified +; Updated AAD_HASH is returned in %%T3 + +%macro INITIAL_BLOCKS 24 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%LENGTH %5 +%define %%DATA_OFFSET %6 +%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%T1 %8 +%define %%HASH_KEY %9 +%define %%T3 %10 +%define %%T4 %11 +%define %%T5 %12 +%define %%CTR %13 +%define %%XMM1 %14 +%define %%XMM2 %15 +%define %%XMM3 %16 +%define %%XMM4 %17 +%define %%XMM5 %18 +%define %%XMM6 %19 +%define %%XMM7 %20 +%define %%XMM8 %21 +%define %%T6 %22 +%define %%T_key %23 +%define %%ENC_DEC %24 + +%assign i (8-%%num_initial_blocks) + movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg + + ; start AES for %%num_initial_blocks blocks + movdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + paddd %%CTR, [ONE] ; INCR Y0 + movdqa reg(i), %%CTR + pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + +movdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + pxor reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS ; encrypt N blocks with 13 key rounds (11 for GCM192) +movdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + aesenc reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep + + +movdqu %%T_key, [%%GDATA_KEY+16*j] ; encrypt with last (14th) key round (12 for GCM192) +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + aesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + pxor reg(i), %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + movdqa reg(i), %%T1 + %endif + pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations +%assign i (i+1) +%endrep + + +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) + +%rep %%num_initial_blocks + pxor reg(j), reg(i) + GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks +%assign i (i+1) +%assign j (j+1) +%endrep + ; %%XMM8 has the current Hash Value + movdqa %%T3, %%XMM8 + + cmp %%LENGTH, 128 + jl %%_initial_blocks_done ; no need for precomputed constants + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + paddd 
%%CTR, [ONE] ; INCR Y0 + movdqa %%XMM1, %%CTR + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM2, %%CTR + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM3, %%CTR + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM4, %%CTR + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM5, %%CTR + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM6, %%CTR + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM7, %%CTR + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM8, %%CTR + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + movdqu %%T_key, [%%GDATA_KEY+16*0] + pxor %%XMM1, %%T_key + pxor %%XMM2, %%T_key + pxor %%XMM3, %%T_key + pxor %%XMM4, %%T_key + pxor %%XMM5, %%T_key + pxor %%XMM6, %%T_key + pxor %%XMM7, %%T_key + pxor %%XMM8, %%T_key + + +%assign i 1 +%rep NROUNDS ; do early (13) rounds (11 for GCM192) + movdqu %%T_key, [%%GDATA_KEY+16*i] + aesenc %%XMM1, %%T_key + aesenc %%XMM2, %%T_key + aesenc %%XMM3, %%T_key + aesenc %%XMM4, %%T_key + aesenc %%XMM5, %%T_key + aesenc %%XMM6, %%T_key + aesenc %%XMM7, %%T_key + aesenc %%XMM8, %%T_key +%assign i (i+1) +%endrep + + + movdqu %%T_key, [%%GDATA_KEY+16*i] ; do final key round + aesenclast %%XMM1, %%T_key + aesenclast %%XMM2, %%T_key + aesenclast %%XMM3, %%T_key + aesenclast %%XMM4, %%T_key + aesenclast %%XMM5, %%T_key + aesenclast %%XMM6, %%T_key + aesenclast %%XMM7, %%T_key + aesenclast %%XMM8, %%T_key + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] + pxor %%XMM1, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM1, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] + pxor %%XMM2, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM2, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] + pxor %%XMM3, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM3, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] + pxor %%XMM4, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM4, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] + pxor %%XMM5, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM5, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] + pxor %%XMM6, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM6, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] + pxor %%XMM7, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM7, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] + pxor %%XMM8, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM8, %%T1 + %endif + + add %%DATA_OFFSET, 128 + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, 
[SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_initial_blocks_done: + + +%endmacro + + + +; encrypt 8 blocks at a time +; ghash the 8 previously encrypted ciphertext blocks +; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified +; %%DATA_OFFSET is the data offset value +%macro GHASH_8_ENCRYPT_8_PARALLEL 22 +%define %%GDATA %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%DATA_OFFSET %4 +%define %%T1 %5 +%define %%T2 %6 +%define %%T3 %7 +%define %%T4 %8 +%define %%T5 %9 +%define %%T6 %10 +%define %%CTR %11 +%define %%XMM1 %12 +%define %%XMM2 %13 +%define %%XMM3 %14 +%define %%XMM4 %15 +%define %%XMM5 %16 +%define %%XMM6 %17 +%define %%XMM7 %18 +%define %%XMM8 %19 +%define %%T7 %20 +%define %%loop_idx %21 +%define %%ENC_DEC %22 + + movdqa %%T7, %%XMM1 + movdqu [rsp + TMP2], %%XMM2 + movdqu [rsp + TMP3], %%XMM3 + movdqu [rsp + TMP4], %%XMM4 + movdqu [rsp + TMP5], %%XMM5 + movdqu [rsp + TMP6], %%XMM6 + movdqu [rsp + TMP7], %%XMM7 + movdqu [rsp + TMP8], %%XMM8 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + + movdqa %%T4, %%T7 + pshufd %%T6, %%T7, 01001110b + pxor %%T6, %%T7 + %ifidn %%loop_idx, in_order + paddd %%CTR, [ONE] ; INCR CNT + %else + paddd %%CTR, [ONEf] ; INCR CNT + %endif + movdqu %%T5, [%%GDATA + HashKey_8] + pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_8_k] + pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + movdqa %%XMM1, %%CTR + + %ifidn %%loop_idx, in_order + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM2, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM3, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM4, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM5, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM6, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM7, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM8, %%CTR + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + %else + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM2, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM3, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM4, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM5, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM6, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM7, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM8, %%CTR + %endif + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + movdqu %%T1, [%%GDATA + 16*0] + pxor %%XMM1, %%T1 + pxor %%XMM2, %%T1 + pxor %%XMM3, %%T1 + pxor %%XMM4, %%T1 + pxor %%XMM5, %%T1 + pxor %%XMM6, %%T1 + pxor %%XMM7, %%T1 + pxor %%XMM8, %%T1 + + ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied + 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + movdqu %%T1, [rsp + TMP2] + movdqa %%T3, %%T1 + + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_7] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_7_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*1] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + + movdqu %%T1, [%%GDATA + 16*2] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Karatsuba Method + movdqu %%T1, [rsp + TMP3] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_6] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_6_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*3] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP4] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_5] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_5_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*4] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*5] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP5] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_4] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_4_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + + movdqu %%T1, [%%GDATA + 16*6] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + movdqu %%T1, [rsp + TMP6] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_3] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_3_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor 
%%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*7] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP7] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_2] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_2_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*8] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + + ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + movdqu %%T1, [rsp + TMP8] + movdqa %%T3, %%T1 + + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T7, %%T3 + pxor %%T4, %%T1 + + movdqu %%T1, [%%GDATA + 16*9] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + +%ifdef GCM128_MODE + movdqu %%T5, [%%GDATA + 16*10] +%endif +%ifdef GCM192_MODE + movdqu %%T1, [%%GDATA + 16*10] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*11] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T5, [%%GDATA + 16*12] ; finish last key round +%endif +%ifdef GCM256_MODE + movdqu %%T1, [%%GDATA + 16*10] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*11] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*12] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*13] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T5, [%%GDATA + 16*14] ; finish last key round +%endif + +%assign i 0 +%assign j 1 +%rep 8 + XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + +%ifidn %%ENC_DEC, DEC + movdqa %%T3, %%T1 +%endif + + pxor %%T1, %%T5 + aesenclast reg(j), %%T1 ; XMM1:XMM8 + XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer + +%ifidn %%ENC_DEC, DEC + movdqa reg(j), 
%%T3 +%endif +%assign i (i+1) +%assign j (j+1) +%endrep + + + + + pxor %%T2, %%T6 + pxor %%T2, %%T4 + pxor %%T2, %%T7 + + + movdqa %%T3, %%T2 + pslldq %%T3, 8 ; shift-L %%T3 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%T7, %%T3 + pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7 + + + + ;first phase of the reduction + movdqa %%T2, %%T7 + movdqa %%T3, %%T7 + movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T1, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T1 + + movdqa %%T5, %%T2 + psrldq %%T5, 4 ; shift-R %%T5 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + ;second phase of the reduction + movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T1) for doing three shift operations + movdqa %%T3,%%T7 + movdqa %%T1,%%T7 + + psrld %%T2,1 ; packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T1,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T1 + + pxor %%T2, %%T5 + pxor %%T7, %%T2 + pxor %%T7, %%T4 ; the result is in %%T4 + + + pxor %%XMM1, %%T7 + +%endmacro + + +; GHASH the last 4 ciphertext blocks. 
+%macro GHASH_LAST_8 16 +%define %%GDATA %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%T3 %4 +%define %%T4 %5 +%define %%T5 %6 +%define %%T6 %7 +%define %%T7 %8 +%define %%XMM1 %9 +%define %%XMM2 %10 +%define %%XMM3 %11 +%define %%XMM4 %12 +%define %%XMM5 %13 +%define %%XMM6 %14 +%define %%XMM7 %15 +%define %%XMM8 %16 + + ; Karatsuba Method + movdqa %%T6, %%XMM1 + pshufd %%T2, %%XMM1, 01001110b + pxor %%T2, %%XMM1 + movdqu %%T5, [%%GDATA + HashKey_8] + pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1 + + pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_8_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + movdqa %%T7, %%XMM1 + movdqa %%XMM1, %%T2 ; result in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM2 + pshufd %%T2, %%XMM2, 01001110b + pxor %%T2, %%XMM2 + movdqu %%T5, [%%GDATA + HashKey_7] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_7_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM2 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM3 + pshufd %%T2, %%XMM3, 01001110b + pxor %%T2, %%XMM3 + movdqu %%T5, [%%GDATA + HashKey_6] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_6_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM3 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM4 + pshufd %%T2, %%XMM4, 01001110b + pxor %%T2, %%XMM4 + movdqu %%T5, [%%GDATA + HashKey_5] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM4, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_5_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM4 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM5 + pshufd %%T2, %%XMM5, 01001110b + pxor %%T2, %%XMM5 + movdqu %%T5, [%%GDATA + HashKey_4] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM5, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_4_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM5 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM6 + pshufd %%T2, %%XMM6, 01001110b + pxor %%T2, %%XMM6 + movdqu %%T5, [%%GDATA + HashKey_3] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM6, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_3_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM6 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM7 + pshufd %%T2, %%XMM7, 01001110b + pxor %%T2, %%XMM7 + movdqu %%T5, [%%GDATA + HashKey_2] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM7, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_2_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM7 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM8 + pshufd %%T2, %%XMM8, 01001110b + pxor %%T2, %%XMM8 + movdqu %%T5, [%%GDATA + HashKey] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM8, %%T5, 0x00 ; %%XMM4 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_k] + 
pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM8 + pxor %%T2, %%XMM1 + pxor %%T2, %%T6 + pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm + + + movdqa %%T4, %%T2 + pslldq %%T4, 8 ; shift-L %%T4 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%T7, %%T4 + pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications + + + ;first phase of the reduction + movdqa %%T2, %%T7 + movdqa %%T3, %%T7 + movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T4, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T4 + + movdqa %%T1, %%T2 + psrldq %%T1, 4 ; shift-R %%T1 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T4) for doing three shift operations + movdqa %%T3,%%T7 + movdqa %%T4,%%T7 + + psrld %%T2,1 ; packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T4,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T4 + + pxor %%T2, %%T1 + pxor %%T7, %%T2 + pxor %%T6, %%T7 ; the result is in %%T6 + +%endmacro + +; Encryption of a single block +%macro ENCRYPT_SINGLE_BLOCK 3 +%define %%GDATA %1 +%define %%ST %2 +%define %%T1 %3 + movdqu %%T1, [%%GDATA+16*0] + pxor %%ST, %%T1 +%assign i 1 +%rep NROUNDS + movdqu %%T1, [%%GDATA+16*i] + aesenc %%ST, %%T1 +%assign i (i+1) +%endrep + movdqu %%T1, [%%GDATA+16*i] + aesenclast %%ST, %%T1 +%endmacro + + +;; Start of Stack Setup + +%macro FUNC_SAVE 0 + ;; Required for Update/GMC_ENC + ;the number of pushes must equal STACK_OFFSET + push r12 + push r13 + push r14 + push r15 + push rsi + mov r14, rsp + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 + movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 + movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 + movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 + movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 + movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 + movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 + movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 + movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 + movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 + + mov arg5, arg(5) ;[r14 + STACK_OFFSET + 8*5] +%endif +%endmacro + + +%macro FUNC_RESTORE 0 + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16] + movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16] + movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16] + movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16] + movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16] + movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16] + movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16] + movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16] + movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16] + movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16] +%endif + +;; Required for Update/GMC_ENC + mov rsp, r14 + pop rsi + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_INIT initializes a gcm_context_data struct to prepare for 
encoding/decoding. +; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, +; Additional Authentication data (A_IN), Additional Data length (A_LEN). +; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA. +; Clobbers rax, r10-r13 and xmm0-xmm6 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_INIT 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%IV %3 +%define %%A_IN %4 +%define %%A_LEN %5 +%define %%AAD_HASH xmm0 +%define %%SUBHASH xmm1 + + + movdqu %%SUBHASH, [%%GDATA_KEY + HashKey] + + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax + pxor xmm2, xmm3 + mov r10, %%A_LEN + + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash + mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length + xor r10, r10 + mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 + mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 + movdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0 + mov r10, %%IV + movdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 + pinsrq xmm2, [r10], 0 + pinsrd xmm2, [r10+8], 2 + movdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv + + pshufb xmm2, [SHUF_MASK] + + movdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data +; struct has been initialized by GCM_INIT. +; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. +; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC) +; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and xmm0-xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC 6 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%ENC_DEC %6 +%define %%DATA_OFFSET r11 + +; Macro flow: +; calculate the number of 16byte blocks in the message +; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' +; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' +; if there is a block of less tahn 16 bytes process it '%%_zero_cipher_left .. 
%%_multiple_of_16_bytes' + + cmp %%PLAIN_CYPH_LEN, 0 + je %%_multiple_of_16_bytes + + xor %%DATA_OFFSET, %%DATA_OFFSET + add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed + movdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey + movdqu xmm8, [%%GDATA_CTX + AadHash] + + + PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC + + mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext + sub r13, %%DATA_OFFSET + mov r10, r13 ;save the amount of data left to process in r10 + and r13, -16 ; r13 = r13 - (r13 mod 16) + + mov r12, r13 + shr r12, 4 + and r12, 7 + jz %%_initial_num_blocks_is_0 + + cmp r12, 7 + je %%_initial_num_blocks_is_7 + cmp r12, 6 + je %%_initial_num_blocks_is_6 + cmp r12, 5 + je %%_initial_num_blocks_is_5 + cmp r12, 4 + je %%_initial_num_blocks_is_4 + cmp r12, 3 + je %%_initial_num_blocks_is_3 + cmp r12, 2 + je %%_initial_num_blocks_is_2 + + jmp %%_initial_num_blocks_is_1 + +%%_initial_num_blocks_is_7: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*7 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_6: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*6 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_5: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*5 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_4: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*4 + jmp %%_initial_blocks_encrypted + + +%%_initial_num_blocks_is_3: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*3 + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_2: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*2 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_1: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_0: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + + +%%_initial_blocks_encrypted: + cmp r13, 0 + je %%_zero_cipher_left + + sub r13, 128 + je %%_eight_cipher_left + + + + + movd r15d, xmm9 + and r15d, 255 + pshufb xmm9, [SHUF_MASK] + + +%%_encrypt_by_8_new: + cmp r15d, 255-8 + jg 
%%_encrypt_by_8 + + + + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + pshufb xmm9, [SHUF_MASK] + jmp %%_eight_cipher_left + +%%_encrypt_by_8: + pshufb xmm9, [SHUF_MASK] + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC + pshufb xmm9, [SHUF_MASK] + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + pshufb xmm9, [SHUF_MASK] + + + + +%%_eight_cipher_left: + GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 + + +%%_zero_cipher_left: + movdqu [%%GDATA_CTX + AadHash], xmm14 + movdqu [%%GDATA_CTX + CurCount], xmm9 + + mov r13, r10 + and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16) + + je %%_multiple_of_16_bytes + + mov [%%GDATA_CTX + PBlockLen], r13 ; my_ctx.data.partial_blck_length = r13 + ; handle the last <16 Byte block seperately + + paddd xmm9, [ONE] ; INCR CNT to get Yn + movdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx.data.current_counter = xmm9 + pshufb xmm9, [SHUF_MASK] + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Yn) + movdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9 + + cmp %%PLAIN_CYPH_LEN, 16 + jge %%_large_enough_update + + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax + lea r12, [SHIFT_MASK + 16] + sub r12, r13 + jmp %%_data_read + +%%_large_enough_update: + sub %%DATA_OFFSET, 16 + add %%DATA_OFFSET, r13 + + movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block + + sub %%DATA_OFFSET, r13 + add %%DATA_OFFSET, 16 + + lea r12, [SHIFT_MASK + 16] + sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16) + movdqu xmm2, [r12] ; get the appropriate shuffle mask + pshufb xmm1, xmm2 ; shift right 16-r13 bytes +%%_data_read: + %ifidn %%ENC_DEC, DEC + movdqa xmm2, xmm1 + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + pand xmm2, xmm1 + pshufb xmm2, [SHUF_MASK] + pxor xmm14, xmm2 + movdqu [%%GDATA_CTX + AadHash], xmm14 + + %else + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + pshufb xmm9, [SHUF_MASK] + pxor xmm14, xmm9 + movdqu [%%GDATA_CTX + AadHash], xmm14 + + pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + %endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output r13 Bytes + movq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + psrldq xmm9, 8 + movq rax, xmm9 + sub r13, 8 + +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_multiple_of_16_bytes: + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
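The flow comment at the top of GCM_ENC_DEC splits an update into floor(len/16) mod 8 initial blocks, then 8-block stretches, then a sub-16-byte tail. A minimal C sketch of that partitioning, assuming a fresh update with no partial block carried over from a previous call; the struct and function names are illustrative only and are not part of the isa-l_crypto API:

	#include <stdint.h>

	/*
	 * Sketch of how GCM_ENC_DEC partitions an update of `len` bytes:
	 * - initial : floor(len/16) mod 8 blocks handled by INITIAL_BLOCKS
	 * - by_eight: 8-block iterations of GHASH_8_ENCRYPT_8_PARALLEL
	 * - tail    : len mod 16 bytes left for the PBlockLen partial path
	 */
	struct gcm_split {
		uint64_t initial;
		uint64_t by_eight;
		uint64_t tail;
	};

	static struct gcm_split
	gcm_split_len(uint64_t len)
	{
		struct gcm_split s;
		uint64_t blocks = len / 16;

		s.initial = blocks % 8;
		s.by_eight = (blocks - s.initial) / 8;
		s.tail = len % 16;
		return (s);
	}

In the assembly the same arithmetic appears as "and r13, -16", "shr r12, 4" / "and r12, 7", and the final "and r13, 15" whose result is stored in PBlockLen.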
+; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. +; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and +; whether encoding or decoding (ENC_DEC). +; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) +; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_COMPLETE 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%AUTH_TAG %3 +%define %%AUTH_TAG_LEN %4 +%define %%ENC_DEC %5 +%define %%PLAIN_CYPH_LEN rax + + mov r12, [%%GDATA_CTX + PBlockLen] ; r12 = aadLen (number of bytes) + movdqu xmm14, [%%GDATA_CTX + AadHash] + movdqu xmm13, [%%GDATA_KEY + HashKey] + + cmp r12, 0 + + je %%_partial_done + + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + movdqu [%%GDATA_CTX + AadHash], xmm14 + +%%_partial_done: + + mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) + mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] + + shl r12, 3 ; convert into number of bits + movd xmm15, r12d ; len(A) in xmm15 + + shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) + movq xmm1, %%PLAIN_CYPH_LEN + pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 + pxor xmm15, xmm1 ; xmm15 = len(A)||len(C) + + pxor xmm14, xmm15 + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation + pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap + + movdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Y0) + + pxor xmm9, xmm14 + + + +%%_return_T: + mov r10, %%AUTH_TAG ; r10 = authTag + mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + cmp r11, 16 + je %%_T_16 + + cmp r11, 12 + je %%_T_12 + +%%_T_8: + movq rax, xmm9 + mov [r10], rax + jmp %%_return_T_done +%%_T_12: + movq rax, xmm9 + mov [r10], rax + psrldq xmm9, 8 + movd eax, xmm9 + mov [r10 + 8], eax + jmp %%_return_T_done + +%%_T_16: + movdqu [r10], xmm9 + +%%_return_T_done: +%endmacro ;GCM_COMPLETE + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse +; (struct gcm_key_data *key_data); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(precomp,_) +FN_NAME(precomp,_): + endbranch + + push r12 + push r13 + push r14 + push r15 + + mov r14, rsp + + + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 ; align rsp to 64 bytes + +%ifidn __OUTPUT_FORMAT__, win64 + ; only xmm6 needs to be maintained + movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 +%endif + + pxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey + + pshufb xmm6, [SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + movdqa xmm2, xmm6 + psllq xmm6, 1 + psrlq xmm2, 63 + movdqa xmm1, xmm2 + pslldq xmm2, 8 + psrldq xmm1, 8 + por xmm6, xmm2 + ;reduction + pshufd xmm2, xmm1, 00100100b + pcmpeqd xmm2, [TWOONE] + pand xmm2, [POLY] + pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] +%endif + mov rsp, r14 + + pop r15 + pop r14 + pop r13 + pop r12 +ret +%endif ; _nt + + 
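Everything from the HashKey<<1 derivation in the precompute routine above down to the per-block GHASH_MUL calls is multiplication in GF(2^128) with the GCM field polynomial, which this file's comments write in bit-reflected form as x^128 + x^127 + x^126 + x^121 + 1. For orientation only, a byte-wise reference multiply in the plain NIST SP 800-38D bit order might look like the sketch below; the assembly instead operates on bit-reflected operands and uses Karatsuba plus a two-phase shift reduction, so this is a conceptual aid, not the routine the file implements.

	#include <stdint.h>
	#include <string.h>

	/*
	 * Reference GF(2^128) multiply in GCM bit order: bit 0 of the field
	 * element is the most significant bit of byte 0.  The 0xe1 constant
	 * is the byte-string form of the reduction polynomial
	 * x^128 + x^7 + x^2 + x + 1.
	 */
	static void
	gf128_mul(const uint8_t x[16], const uint8_t y[16], uint8_t z[16])
	{
		uint8_t v[16];
		int i, j;

		memcpy(v, y, 16);
		memset(z, 0, 16);

		for (i = 0; i < 128; i++) {
			/* Accumulate v whenever bit i of x is set. */
			if (x[i / 8] & (0x80 >> (i % 8))) {
				for (j = 0; j < 16; j++)
					z[j] ^= v[j];
			}
			/* v *= x: shift right one bit, then reduce. */
			int lsb = v[15] & 1;
			for (j = 15; j > 0; j--)
				v[j] = (v[j] >> 1) | (v[j - 1] << 7);
			v[0] >>= 1;
			if (lsb)
				v[0] ^= 0xe1;
		}
	}

GHASH itself is just this multiply applied to each 16-byte block with XOR accumulation; the HashKey_2 through HashKey_8 powers stored by PRECOMPUTE exist only so the parallel macro can fold eight blocks per pass.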
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse ( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(init,_) +FN_NAME(init,_): + endbranch + + push r12 + push r13 +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + push arg5 + sub rsp, 1*16 + movdqu [rsp + 0*16],xmm6 + mov arg5, [rsp + 1*16 + 8*3 + 8*5] +%endif + + GCM_INIT arg1, arg2, arg3, arg4, arg5 + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm6 , [rsp + 0*16] + add rsp, 1*16 + pop arg5 +%endif + pop r13 + pop r12 + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_update_) +FN_NAME(enc,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_256_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_update_) +FN_NAME(dec,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(enc,_finalize_) +FN_NAME(enc,_finalize_): + endbranch + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + movdqu [rsp + 0*16],xmm6 + movdqu [rsp + 1*16],xmm9 + movdqu [rsp + 2*16],xmm11 + movdqu [rsp + 3*16],xmm14 + movdqu [rsp + 4*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + 4*16] + movdqu xmm14 , [rsp+ 3*16] + movdqu xmm11 , [rsp + 2*16] + movdqu xmm9 , [rsp + 1*16] + movdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn 
FUNCT_EXTENSION, _nt +global FN_NAME(dec,_finalize_) +FN_NAME(dec,_finalize_): + endbranch + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + movdqu [rsp + 0*16],xmm6 + movdqu [rsp + 1*16],xmm9 + movdqu [rsp + 2*16],xmm11 + movdqu [rsp + 3*16],xmm14 + movdqu [rsp + 4*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + 4*16] + movdqu xmm14 , [rsp+ 3*16] + movdqu xmm11 , [rsp + 2*16] + movdqu xmm9 , [rsp + 1*16] + movdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_) +FN_NAME(enc,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC + + FUNC_RESTORE + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_) +FN_NAME(dec,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC + + FUNC_RESTORE + + ret diff --git a/contrib/icp/gcm-simd/isa-l_crypto/reg_sizes.asm b/contrib/icp/gcm-simd/isa-l_crypto/reg_sizes.asm new file mode 100644 index 000000000000..991fe48b80a0 --- /dev/null +++ b/contrib/icp/gcm-simd/isa-l_crypto/reg_sizes.asm @@ -0,0 +1,459 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _REG_SIZES_ASM_ +%define _REG_SIZES_ASM_ + +%ifndef AS_FEATURE_LEVEL +%define AS_FEATURE_LEVEL 4 +%endif + +%define EFLAGS_HAS_CPUID (1<<21) +%define FLAG_CPUID1_ECX_CLMUL (1<<1) +%define FLAG_CPUID1_EDX_SSE2 (1<<26) +%define FLAG_CPUID1_ECX_SSE3 (1) +%define FLAG_CPUID1_ECX_SSE4_1 (1<<19) +%define FLAG_CPUID1_ECX_SSE4_2 (1<<20) +%define FLAG_CPUID1_ECX_POPCNT (1<<23) +%define FLAG_CPUID1_ECX_AESNI (1<<25) +%define FLAG_CPUID1_ECX_OSXSAVE (1<<27) +%define FLAG_CPUID1_ECX_AVX (1<<28) +%define FLAG_CPUID1_EBX_AVX2 (1<<5) + +%define FLAG_CPUID7_EBX_AVX2 (1<<5) +%define FLAG_CPUID7_EBX_AVX512F (1<<16) +%define FLAG_CPUID7_EBX_AVX512DQ (1<<17) +%define FLAG_CPUID7_EBX_AVX512IFMA (1<<21) +%define FLAG_CPUID7_EBX_AVX512PF (1<<26) +%define FLAG_CPUID7_EBX_AVX512ER (1<<27) +%define FLAG_CPUID7_EBX_AVX512CD (1<<28) +%define FLAG_CPUID7_EBX_SHA (1<<29) +%define FLAG_CPUID7_EBX_AVX512BW (1<<30) +%define FLAG_CPUID7_EBX_AVX512VL (1<<31) + +%define FLAG_CPUID7_ECX_AVX512VBMI (1<<1) +%define FLAG_CPUID7_ECX_AVX512VBMI2 (1 << 6) +%define FLAG_CPUID7_ECX_GFNI (1 << 8) +%define FLAG_CPUID7_ECX_VAES (1 << 9) +%define FLAG_CPUID7_ECX_VPCLMULQDQ (1 << 10) +%define FLAG_CPUID7_ECX_VNNI (1 << 11) +%define FLAG_CPUID7_ECX_BITALG (1 << 12) +%define FLAG_CPUID7_ECX_VPOPCNTDQ (1 << 14) + +%define FLAGS_CPUID7_EBX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ) +%define FLAGS_CPUID7_ECX_AVX512_G2 (FLAG_CPUID7_ECX_AVX512VBMI2 | FLAG_CPUID7_ECX_GFNI | FLAG_CPUID7_ECX_VAES | FLAG_CPUID7_ECX_VPCLMULQDQ | FLAG_CPUID7_ECX_VNNI | FLAG_CPUID7_ECX_BITALG | FLAG_CPUID7_ECX_VPOPCNTDQ) + +%define FLAG_XGETBV_EAX_XMM (1<<1) +%define FLAG_XGETBV_EAX_YMM (1<<2) +%define FLAG_XGETBV_EAX_XMM_YMM 0x6 +%define FLAG_XGETBV_EAX_ZMM_OPM 0xe0 + +%define FLAG_CPUID1_EAX_AVOTON 0x000406d0 +%define FLAG_CPUID1_EAX_STEP_MASK 0xfffffff0 + +; define d and w variants for registers + +%define raxd eax +%define raxw ax +%define raxb al + +%define rbxd ebx +%define rbxw bx +%define rbxb bl + +%define rcxd ecx +%define rcxw cx +%define rcxb cl + +%define rdxd edx +%define rdxw dx +%define rdxb dl + +%define rsid esi +%define rsiw si +%define rsib sil + +%define rdid edi +%define rdiw di +%define rdib dil + +%define rbpd ebp +%define rbpw bp +%define rbpb bpl + +%define zmm0x xmm0 +%define zmm1x xmm1 +%define zmm2x xmm2 +%define zmm3x xmm3 +%define zmm4x xmm4 +%define zmm5x xmm5 +%define zmm6x xmm6 +%define zmm7x xmm7 +%define zmm8x xmm8 +%define zmm9x xmm9 +%define zmm10x xmm10 +%define zmm11x xmm11 +%define zmm12x xmm12 +%define zmm13x xmm13 +%define zmm14x xmm14 +%define zmm15x xmm15 +%define zmm16x xmm16 +%define zmm17x xmm17 +%define zmm18x xmm18 +%define zmm19x xmm19 +%define zmm20x xmm20 +%define zmm21x xmm21 +%define zmm22x xmm22 +%define zmm23x xmm23 +%define zmm24x xmm24 +%define zmm25x xmm25 +%define zmm26x xmm26 +%define zmm27x xmm27 
+%define zmm28x xmm28 +%define zmm29x xmm29 +%define zmm30x xmm30 +%define zmm31x xmm31 + +%define ymm0x xmm0 +%define ymm1x xmm1 +%define ymm2x xmm2 +%define ymm3x xmm3 +%define ymm4x xmm4 +%define ymm5x xmm5 +%define ymm6x xmm6 +%define ymm7x xmm7 +%define ymm8x xmm8 +%define ymm9x xmm9 +%define ymm10x xmm10 +%define ymm11x xmm11 +%define ymm12x xmm12 +%define ymm13x xmm13 +%define ymm14x xmm14 +%define ymm15x xmm15 +%define ymm16x xmm16 +%define ymm17x xmm17 +%define ymm18x xmm18 +%define ymm19x xmm19 +%define ymm20x xmm20 +%define ymm21x xmm21 +%define ymm22x xmm22 +%define ymm23x xmm23 +%define ymm24x xmm24 +%define ymm25x xmm25 +%define ymm26x xmm26 +%define ymm27x xmm27 +%define ymm28x xmm28 +%define ymm29x xmm29 +%define ymm30x xmm30 +%define ymm31x xmm31 + +%define xmm0x xmm0 +%define xmm1x xmm1 +%define xmm2x xmm2 +%define xmm3x xmm3 +%define xmm4x xmm4 +%define xmm5x xmm5 +%define xmm6x xmm6 +%define xmm7x xmm7 +%define xmm8x xmm8 +%define xmm9x xmm9 +%define xmm10x xmm10 +%define xmm11x xmm11 +%define xmm12x xmm12 +%define xmm13x xmm13 +%define xmm14x xmm14 +%define xmm15x xmm15 +%define xmm16x xmm16 +%define xmm17x xmm17 +%define xmm18x xmm18 +%define xmm19x xmm19 +%define xmm20x xmm20 +%define xmm21x xmm21 +%define xmm22x xmm22 +%define xmm23x xmm23 +%define xmm24x xmm24 +%define xmm25x xmm25 +%define xmm26x xmm26 +%define xmm27x xmm27 +%define xmm28x xmm28 +%define xmm29x xmm29 +%define xmm30x xmm30 +%define xmm31x xmm31 + +%define zmm0y ymm0 +%define zmm1y ymm1 +%define zmm2y ymm2 +%define zmm3y ymm3 +%define zmm4y ymm4 +%define zmm5y ymm5 +%define zmm6y ymm6 +%define zmm7y ymm7 +%define zmm8y ymm8 +%define zmm9y ymm9 +%define zmm10y ymm10 +%define zmm11y ymm11 +%define zmm12y ymm12 +%define zmm13y ymm13 +%define zmm14y ymm14 +%define zmm15y ymm15 +%define zmm16y ymm16 +%define zmm17y ymm17 +%define zmm18y ymm18 +%define zmm19y ymm19 +%define zmm20y ymm20 +%define zmm21y ymm21 +%define zmm22y ymm22 +%define zmm23y ymm23 +%define zmm24y ymm24 +%define zmm25y ymm25 +%define zmm26y ymm26 +%define zmm27y ymm27 +%define zmm28y ymm28 +%define zmm29y ymm29 +%define zmm30y ymm30 +%define zmm31y ymm31 + +%define xmm0y ymm0 +%define xmm1y ymm1 +%define xmm2y ymm2 +%define xmm3y ymm3 +%define xmm4y ymm4 +%define xmm5y ymm5 +%define xmm6y ymm6 +%define xmm7y ymm7 +%define xmm8y ymm8 +%define xmm9y ymm9 +%define xmm10y ymm10 +%define xmm11y ymm11 +%define xmm12y ymm12 +%define xmm13y ymm13 +%define xmm14y ymm14 +%define xmm15y ymm15 +%define xmm16y ymm16 +%define xmm17y ymm17 +%define xmm18y ymm18 +%define xmm19y ymm19 +%define xmm20y ymm20 +%define xmm21y ymm21 +%define xmm22y ymm22 +%define xmm23y ymm23 +%define xmm24y ymm24 +%define xmm25y ymm25 +%define xmm26y ymm26 +%define xmm27y ymm27 +%define xmm28y ymm28 +%define xmm29y ymm29 +%define xmm30y ymm30 +%define xmm31y ymm31 + +%define xmm0z zmm0 +%define xmm1z zmm1 +%define xmm2z zmm2 +%define xmm3z zmm3 +%define xmm4z zmm4 +%define xmm5z zmm5 +%define xmm6z zmm6 +%define xmm7z zmm7 +%define xmm8z zmm8 +%define xmm9z zmm9 +%define xmm10z zmm10 +%define xmm11z zmm11 +%define xmm12z zmm12 +%define xmm13z zmm13 +%define xmm14z zmm14 +%define xmm15z zmm15 +%define xmm16z zmm16 +%define xmm17z zmm17 +%define xmm18z zmm18 +%define xmm19z zmm19 +%define xmm20z zmm20 +%define xmm21z zmm21 +%define xmm22z zmm22 +%define xmm23z zmm23 +%define xmm24z zmm24 +%define xmm25z zmm25 +%define xmm26z zmm26 +%define xmm27z zmm27 +%define xmm28z zmm28 +%define xmm29z zmm29 +%define xmm30z zmm30 +%define xmm31z zmm31 + +%define ymm0z zmm0 +%define ymm1z 
zmm1 +%define ymm2z zmm2 +%define ymm3z zmm3 +%define ymm4z zmm4 +%define ymm5z zmm5 +%define ymm6z zmm6 +%define ymm7z zmm7 +%define ymm8z zmm8 +%define ymm9z zmm9 +%define ymm10z zmm10 +%define ymm11z zmm11 +%define ymm12z zmm12 +%define ymm13z zmm13 +%define ymm14z zmm14 +%define ymm15z zmm15 +%define ymm16z zmm16 +%define ymm17z zmm17 +%define ymm18z zmm18 +%define ymm19z zmm19 +%define ymm20z zmm20 +%define ymm21z zmm21 +%define ymm22z zmm22 +%define ymm23z zmm23 +%define ymm24z zmm24 +%define ymm25z zmm25 +%define ymm26z zmm26 +%define ymm27z zmm27 +%define ymm28z zmm28 +%define ymm29z zmm29 +%define ymm30z zmm30 +%define ymm31z zmm31 + +%define DWORD(reg) reg %+ d +%define WORD(reg) reg %+ w +%define BYTE(reg) reg %+ b + +%define XWORD(reg) reg %+ x +%define YWORD(reg) reg %+ y +%define ZWORD(reg) reg %+ z + +%ifdef INTEL_CET_ENABLED + %ifdef __NASM_VER__ + %if AS_FEATURE_LEVEL >= 10 + %ifidn __OUTPUT_FORMAT__,elf32 +section .note.gnu.property note alloc noexec align=4 +DD 0x00000004,0x0000000c,0x00000005,0x00554e47 +DD 0xc0000002,0x00000004,0x00000003 + %endif + %ifidn __OUTPUT_FORMAT__,elf64 +section .note.gnu.property note alloc noexec align=8 +DD 0x00000004,0x00000010,0x00000005,0x00554e47 +DD 0xc0000002,0x00000004,0x00000003,0x00000000 + %endif + %endif + %endif +%endif + +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%endif +%ifidn __OUTPUT_FORMAT__,elf64 + %define __x86_64__ +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%endif +%ifidn __OUTPUT_FORMAT__,win64 + %define __x86_64__ +%endif +%ifidn __OUTPUT_FORMAT__,macho64 + %define __x86_64__ +%endif + +%ifdef __x86_64__ + %define endbranch db 0xf3, 0x0f, 0x1e, 0xfa +%else + %define endbranch db 0xf3, 0x0f, 0x1e, 0xfb +%endif + +%ifdef REL_TEXT + %define WRT_OPT +%elifidn __OUTPUT_FORMAT__, elf64 + %define WRT_OPT wrt ..plt +%else + %define WRT_OPT +%endif + +%macro mk_global 1-3 + %ifdef __NASM_VER__ + %ifidn __OUTPUT_FORMAT__, macho64 + global %1 + %elifidn __OUTPUT_FORMAT__, win64 + global %1 + %else + global %1:%2 %3 + %endif + %else + global %1:%2 %3 + %endif +%endmacro + + +; Fixes for nasm lack of MS proc helpers +%ifdef __NASM_VER__ + %ifidn __OUTPUT_FORMAT__, win64 + %macro alloc_stack 1 + sub rsp, %1 + %endmacro + + %macro proc_frame 1 + %1: + %endmacro + + %macro save_xmm128 2 + movdqa [rsp + %2], %1 + %endmacro + + %macro save_reg 2 + mov [rsp + %2], %1 + %endmacro + + %macro rex_push_reg 1 + push %1 + %endmacro + + %macro push_reg 1 + push %1 + %endmacro + + %define end_prolog + %endif + + %define endproc_frame +%endif + +%ifidn __OUTPUT_FORMAT__, macho64 + %define elf64 macho64 + mac_equ equ 1 +%endif + +%macro slversion 4 + section .text + global %1_slver_%2%3%4 + global %1_slver + %1_slver: + %1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro + +%endif ; ifndef _REG_SIZES_ASM_ diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.intel b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.intel new file mode 100644 index 000000000000..ecebef110b46 --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.intel @@ -0,0 +1,26 @@ + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.intel.descrip b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.intel.descrip new file mode 100644 index 000000000000..6184759c8b74 --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.intel.descrip @@ -0,0 +1 @@ +PORTIONS OF GCM and GHASH FUNCTIONALITY diff --git a/module/icp/asm-x86_64/modes/isalc_gcm128_sse.S b/module/icp/asm-x86_64/modes/isalc_gcm128_sse.S new file mode 100644 index 000000000000..f552d8630073 --- /dev/null +++ b/module/icp/asm-x86_64/modes/isalc_gcm128_sse.S @@ -0,0 +1,31 @@ +//####################################################################### +// Copyright(c) 2011-2016 Intel Corporation All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, +// DATA, OR PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//####################################################################### + +#define GCM128_MODE 1 +#include "isalc_gcm_sse_att.S" diff --git a/module/icp/asm-x86_64/modes/isalc_gcm256_sse.S b/module/icp/asm-x86_64/modes/isalc_gcm256_sse.S new file mode 100644 index 000000000000..c88cb0ed055f --- /dev/null +++ b/module/icp/asm-x86_64/modes/isalc_gcm256_sse.S @@ -0,0 +1,31 @@ +////////////////////////////////////////////////////////////////////////// +// Copyright(c) 2011-2016 Intel Corporation All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES// LOSS OF USE, +// DATA, OR PROFITS// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +////////////////////////////////////////////////////////////////////////// + +#define GCM256_MODE 1 +#include "isalc_gcm_sse_att.S" diff --git a/module/icp/asm-x86_64/modes/isalc_gcm_defines.S b/module/icp/asm-x86_64/modes/isalc_gcm_defines.S new file mode 100644 index 000000000000..00ec4c654d9f --- /dev/null +++ b/module/icp/asm-x86_64/modes/isalc_gcm_defines.S @@ -0,0 +1,293 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright(c) 2011-2016 Intel Corporation All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES// LOSS OF USE, +// DATA, OR PROFITS// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef GCM_DEFINES_ASM_INCLUDED +#define GCM_DEFINES_ASM_INCLUDED + +// +// Authors: +// Erdinc Ozturk +// Vinodh Gopal +// James Guilford + + +//////////// + +.section .rodata + +.balign 16 +POLY: .quad 0x0000000000000001, 0xC200000000000000 + +// unused for sse +.balign 64 +POLY2: .quad 0x00000001C2000000, 0xC200000000000000 + .quad 0x00000001C2000000, 0xC200000000000000 + .quad 0x00000001C2000000, 0xC200000000000000 + .quad 0x00000001C2000000, 0xC200000000000000 +.balign 16 +TWOONE: .quad 0x0000000000000001, 0x0000000100000000 + +// order of these constants should not change. +// more specifically, ALL_F should follow SHIFT_MASK, and ZERO should +// follow ALL_F + +.balign 64 +SHUF_MASK: .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 + .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 + .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 + .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 + +SHIFT_MASK: .quad 0x0706050403020100, 0x0f0e0d0c0b0a0908 +ALL_F: .quad 0xffffffffffffffff, 0xffffffffffffffff +ZERO: .quad 0x0000000000000000, 0x0000000000000000 // unused for sse +ONE: .quad 0x0000000000000001, 0x0000000000000000 +TWO: .quad 0x0000000000000002, 0x0000000000000000 // unused for sse +ONEf: .quad 0x0000000000000000, 0x0100000000000000 +TWOf: .quad 0x0000000000000000, 0x0200000000000000 // unused for sse + +// Below unused for sse +.balign 64 +ddq_add_1234: + .quad 0x0000000000000001, 0x0000000000000000 + .quad 0x0000000000000002, 0x0000000000000000 + .quad 0x0000000000000003, 0x0000000000000000 + .quad 0x0000000000000004, 0x0000000000000000 + +.balign 64 +ddq_add_5678: + .quad 0x0000000000000005, 0x0000000000000000 + .quad 0x0000000000000006, 0x0000000000000000 + .quad 0x0000000000000007, 0x0000000000000000 + .quad 0x0000000000000008, 0x0000000000000000 + +.balign 64 +ddq_add_4444: + .quad 0x0000000000000004, 0x0000000000000000 + .quad 0x0000000000000004, 0x0000000000000000 + .quad 0x0000000000000004, 0x0000000000000000 + .quad 0x0000000000000004, 0x0000000000000000 + +.balign 64 +ddq_add_8888: + .quad 0x0000000000000008, 0x0000000000000000 + .quad 0x0000000000000008, 0x0000000000000000 + .quad 0x0000000000000008, 0x0000000000000000 + .quad 0x0000000000000008, 0x0000000000000000 + +.balign 64 +ddq_addbe_1234: + .quad 0x0000000000000000, 0x0100000000000000 + .quad 0x0000000000000000, 0x0200000000000000 + .quad 0x0000000000000000, 0x0300000000000000 + .quad 0x0000000000000000, 0x0400000000000000 + +.balign 64 +ddq_addbe_5678: + .quad 0x0000000000000000, 0x0500000000000000 + .quad 0x0000000000000000, 0x0600000000000000 + .quad 0x0000000000000000, 0x0700000000000000 + .quad 0x0000000000000000, 0x0800000000000000 + +.balign 64 +ddq_addbe_4444: + .quad 0x0000000000000000, 0x0400000000000000 + .quad 0x0000000000000000, 0x0400000000000000 + .quad 0x0000000000000000, 0x0400000000000000 + .quad 0x0000000000000000, 0x0400000000000000 + +.balign 64 +ddq_addbe_8888: + .quad 0x0000000000000000, 
0x0800000000000000 + .quad 0x0000000000000000, 0x0800000000000000 + .quad 0x0000000000000000, 0x0800000000000000 + .quad 0x0000000000000000, 0x0800000000000000 + +.balign 64 +byte_len_to_mask_table: + .short 0x0000, 0x0001, 0x0003, 0x0007 + .short 0x000f, 0x001f, 0x003f, 0x007f + .short 0x00ff, 0x01ff, 0x03ff, 0x07ff + .short 0x0fff, 0x1fff, 0x3fff, 0x7fff + .short 0xffff + +.balign 64 +byte64_len_to_mask_table: + .quad 0x0000000000000000, 0x0000000000000001 + .quad 0x0000000000000003, 0x0000000000000007 + .quad 0x000000000000000f, 0x000000000000001f + .quad 0x000000000000003f, 0x000000000000007f + .quad 0x00000000000000ff, 0x00000000000001ff + .quad 0x00000000000003ff, 0x00000000000007ff + .quad 0x0000000000000fff, 0x0000000000001fff + .quad 0x0000000000003fff, 0x0000000000007fff + .quad 0x000000000000ffff, 0x000000000001ffff + .quad 0x000000000003ffff, 0x000000000007ffff + .quad 0x00000000000fffff, 0x00000000001fffff + .quad 0x00000000003fffff, 0x00000000007fffff + .quad 0x0000000000ffffff, 0x0000000001ffffff + .quad 0x0000000003ffffff, 0x0000000007ffffff + .quad 0x000000000fffffff, 0x000000001fffffff + .quad 0x000000003fffffff, 0x000000007fffffff + .quad 0x00000000ffffffff, 0x00000001ffffffff + .quad 0x00000003ffffffff, 0x00000007ffffffff + .quad 0x0000000fffffffff, 0x0000001fffffffff + .quad 0x0000003fffffffff, 0x0000007fffffffff + .quad 0x000000ffffffffff, 0x000001ffffffffff + .quad 0x000003ffffffffff, 0x000007ffffffffff + .quad 0x00000fffffffffff, 0x00001fffffffffff + .quad 0x00003fffffffffff, 0x00007fffffffffff + .quad 0x0000ffffffffffff, 0x0001ffffffffffff + .quad 0x0003ffffffffffff, 0x0007ffffffffffff + .quad 0x000fffffffffffff, 0x001fffffffffffff + .quad 0x003fffffffffffff, 0x007fffffffffffff + .quad 0x00ffffffffffffff, 0x01ffffffffffffff + .quad 0x03ffffffffffffff, 0x07ffffffffffffff + .quad 0x0fffffffffffffff, 0x1fffffffffffffff + .quad 0x3fffffffffffffff, 0x7fffffffffffffff + .quad 0xffffffffffffffff + +.balign 64 +mask_out_top_block: + .quad 0xffffffffffffffff, 0xffffffffffffffff + .quad 0xffffffffffffffff, 0xffffffffffffffff + .quad 0xffffffffffffffff, 0xffffffffffffffff + .quad 0x0000000000000000, 0x0000000000000000 + +.section .text + + +////define the fields of gcm_data struct +//typedef struct gcm_data +//{ +// u8 expanded_keys[16*15]// +// u8 shifted_hkey_1[16]// // store HashKey <<1 mod poly here +// u8 shifted_hkey_2[16]// // store HashKey^2 <<1 mod poly here +// u8 shifted_hkey_3[16]// // store HashKey^3 <<1 mod poly here +// u8 shifted_hkey_4[16]// // store HashKey^4 <<1 mod poly here +// u8 shifted_hkey_5[16]// // store HashKey^5 <<1 mod poly here +// u8 shifted_hkey_6[16]// // store HashKey^6 <<1 mod poly here +// u8 shifted_hkey_7[16]// // store HashKey^7 <<1 mod poly here +// u8 shifted_hkey_8[16]// // store HashKey^8 <<1 mod poly here +// u8 shifted_hkey_1_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_2_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_3_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_4_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_5_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_6_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^6 
<<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_7_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_8_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +//} gcm_data// + +#ifndef GCM_KEYS_VAES_AVX512_INCLUDED +#define HashKey 16*15 // store HashKey <<1 mod poly here +#define HashKey_1 16*15 // store HashKey <<1 mod poly here +#define HashKey_2 16*16 // store HashKey^2 <<1 mod poly here +#define HashKey_3 16*17 // store HashKey^3 <<1 mod poly here +#define HashKey_4 16*18 // store HashKey^4 <<1 mod poly here +#define HashKey_5 16*19 // store HashKey^5 <<1 mod poly here +#define HashKey_6 16*20 // store HashKey^6 <<1 mod poly here +#define HashKey_7 16*21 // store HashKey^7 <<1 mod poly here +#define HashKey_8 16*22 // store HashKey^8 <<1 mod poly here +#define HashKey_k 16*23 // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) +#define HashKey_2_k 16*24 // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_3_k 16*25 // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_4_k 16*26 // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_5_k 16*27 // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_6_k 16*28 // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_7_k 16*29 // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +#define HashKey_8_k 16*30 // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +#endif + +#define AadHash 16*0 // store current Hash of data which has been input +#define AadLen 16*1 // store length of input data which will not be encrypted or decrypted +#define InLen (16*1)+8 // store length of input data which will be encrypted or decrypted +#define PBlockEncKey 16*2 // encryption key for the partial block at the end of the previous update +#define OrigIV 16*3 // input IV +#define CurCount 16*4 // Current counter for generation of encryption key +#define PBlockLen 16*5 // length of partial block at the end of the previous update + +.macro xmmreg name, num + .set xmm\name, %xmm\num +.endm + +#define arg(x) (STACK_OFFSET + 8*(x))(%r14) + + +#if __OUTPUT_FORMAT__ != elf64 +#define arg1 %rcx +#define arg2 %rdx +#define arg3 %r8 +#define arg4 %r9 +#define arg5 %rsi +#define arg6 (STACK_OFFSET + 8*6)(%r14) +#define arg7 (STACK_OFFSET + 8*7)(%r14) +#define arg8 (STACK_OFFSET + 8*8)(%r14) +#define arg9 (STACK_OFFSET + 8*9)(%r14) +#define arg10 (STACK_OFFSET + 8*10)(%r14) +#else +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx +#define arg4 %rcx +#define arg5 %r8 +#define arg6 %r9 +#define arg7 ((STACK_OFFSET) + 8*1)(%r14) +#define arg8 ((STACK_OFFSET) + 8*2)(%r14) +#define arg9 ((STACK_OFFSET) + 8*3)(%r14) +#define arg10 ((STACK_OFFSET) + 8*4)(%r14) +#endif + +#ifdef NT_LDST +#define NT_LD +#define NT_ST +#endif + +////// Use Non-temporal load/stor +#ifdef NT_LD +#define XLDR movntdqa +#define VXLDR vmovntdqa +#define VX512LDR vmovntdqa +#else +#define XLDR movdqu +#define VXLDR vmovdqu +#define VX512LDR vmovdqu8 +#endif + +////// Use Non-temporal load/stor +#ifdef 
NT_ST +#define XSTR movntdq +#define VXSTR vmovntdq +#define VX512STR vmovntdq +#else +#define XSTR movdqu +#define VXSTR vmovdqu +#define VX512STR vmovdqu8 +#endif + +#endif // GCM_DEFINES_ASM_INCLUDED diff --git a/module/icp/asm-x86_64/modes/isalc_gcm_sse.S b/module/icp/asm-x86_64/modes/isalc_gcm_sse.S new file mode 100644 index 000000000000..5d5be5068904 --- /dev/null +++ b/module/icp/asm-x86_64/modes/isalc_gcm_sse.S @@ -0,0 +1,2150 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright(c) 2011-2017 Intel Corporation All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES// LOSS OF USE, +// DATA, OR PROFITS// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// +// Authors: +// Erdinc Ozturk +// Vinodh Gopal +// James Guilford +// +// +// References: +// This code was derived and highly optimized from the code described in +// paper: +// Vinodh Gopal et. al. Optimized Galois-Counter-Mode +// Implementation on Intel Architecture Processors. August, 2010 +// +// For the shift-based reductions used in this code, we used the method +// described in paper: +// Shay Gueron, Michael E. Kounavis. Intel Carry-Less +// Multiplication Instruction and its Usage for Computing the GCM +// Mode. January, 2010. 
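+//
+// Usage (illustrative sketch added in this port, not part of the upstream
+// isa-l_crypto sources): the functions assembled from this file are driven
+// as an init/update/finalize sequence once the expanded AES round keys and
+// precomputed hash keys have been stored in gcm_key_data. Prototypes and
+// argument order follow the function headers in the NASM original
+// (gcm_sse.asm); the C fragment below is only an assumption-level
+// illustration using those names.
+//
+//	struct gcm_key_data key;	// expanded AES keys + hash keys
+//	struct gcm_context_data ctx;	// per-message GCM state
+//
+//	aes_gcm_init_128_sse(&key, &ctx, iv, aad, aad_len);
+//	aes_gcm_enc_128_update_sse(&key, &ctx, out, in, plaintext_len);
+//	aes_gcm_enc_128_finalize_sse(&key, &ctx, auth_tag, auth_tag_len);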
+// +// +// Assumptions: +// +// +// +// iv: +// 0 1 2 3 +// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Salt (From the SA) | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | Initialization Vector | +// | (This is the sequence number from IPSec header) | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | 0x1 | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// +// +// +// AAD: +// AAD will be padded with 0 to the next 16byte multiple +// for example, assume AAD is a u32 vector +// +// if AAD is 8 bytes: +// AAD[3] = {A0, A1}; +// padded AAD in xmm register = {A1 A0 0 0} +// +// 0 1 2 3 +// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | SPI (A1) | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | 32-bit Sequence Number (A0) | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | 0x0 | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// +// AAD Format with 32-bit Sequence Number +// +// if AAD is 12 bytes: +// AAD[3] = {A0, A1, A2}; +// padded AAD in xmm register = {A2 A1 A0 0} +// +// 0 1 2 3 +// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | SPI (A2) | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | 64-bit Extended Sequence Number {A1,A0} | +// | | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | 0x0 | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// +// AAD Format with 64-bit Extended Sequence Number +// +// +// aadLen: +// Must be a multiple of 4 bytes and from the definition of the spec. +// The code additionally supports any aadLen length. +// +// TLen: +// from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +// +// poly = x^128 + x^127 + x^126 + x^121 + 1 +// throughout the code, one tab and two tab indentations are used. one tab is +// for GHASH part, two tabs is for AES part. +// + +// .altmacro +.att_syntax prefix + +#include "isalc_reg_sizes_att.S" +#include "isalc_gcm_defines_att.S" + +#if !defined(GCM128_MODE) && !defined(GCM256_MODE) +#error "No GCM mode selected for gcm_sse.S!" +#endif + +#if defined(FUNCT_EXTENSION) +#error "No support for non-temporal versions yet!" 
+#endif +#define _nt 1 + +#ifdef GCM128_MODE +#define FN_NAME(x,y) aes_gcm_ ## x ## _128 ## y ## sse +#define NROUNDS 9 +#endif + +#ifdef GCM256_MODE +#define FN_NAME(x,y) aes_gcm_ ## x ## _256 ## y ## sse +#define NROUNDS 13 +#endif + + +// need to push 5 registers into stack to maintain +#define STACK_OFFSET 8*5 + +#define TMP2 16*0 // Temporary storage for AES State 2 (State 1 is stored in an XMM register) +#define TMP3 16*1 // Temporary storage for AES State 3 +#define TMP4 16*2 // Temporary storage for AES State 4 +#define TMP5 16*3 // Temporary storage for AES State 5 +#define TMP6 16*4 // Temporary storage for AES State 6 +#define TMP7 16*5 // Temporary storage for AES State 7 +#define TMP8 16*6 // Temporary storage for AES State 8 + +#define LOCAL_STORAGE 16*7 + +#if __OUTPUT_FORMAT == win64 +#define XMM_STORAGE 16*10 +#else +#define XMM_STORAGE 0 +#endif + +#define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +//////////////////////////////////////////////////////////////// +// Utility Macros +//////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +// Input: A and B (128-bits each, bit-reflected) +// Output: C = A*B*x mod poly, (i.e. >>1 ) +// To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +// GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +//////////////////////////////////////////////////////////////////////////////// +.macro GHASH_MUL GH, HK, T1, T2, T3, T4, T5 + // \GH, \HK hold the values for the two operands which are carry-less + // multiplied. + //////////////////////////////////////////////////////////////////////// + // Karatsuba Method + movdqa \GH, \T1 + pshufd $0b01001110, \GH, \T2 + pshufd $0b01001110, \HK, \T3 + pxor \GH, \T2 // \T2 = (a1+a0) + pxor \HK, \T3 // \T3 = (b1+b0) + + pclmulqdq $0x11, \HK, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \HK, \GH // \GH = a0*b0 + pclmulqdq $0x00, \T3, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \GH, \T2 + pxor \T1, \T2 // \T2 = a0*b1+a1*b0 + + movdqa \T2, \T3 + pslldq $8, \T3 // shift-L \T3 2 DWs + psrldq $8, \T2 // shift-R \T2 2 DWs + pxor \T3, \GH + pxor \T2, \T1 // <\T1:\GH> holds the result of the carry-less multiplication of \GH by \HK + + + //first phase of the reduction + movdqa \GH, \T2 + movdqa \GH, \T3 + movdqa \GH, \T4 // move \GH into \T2, \T3, \T4 in order to perform the three shifts independently + + pslld $31, \T2 // packed right shifting << 31 + pslld $30, \T3 // packed right shifting shift << 30 + pslld $25, \T4 // packed right shifting shift << 25 + pxor \T3, \T2 // xor the shifted versions + pxor \T4, \T2 + + movdqa \T2, \T5 + psrldq $4, \T5 // shift-R \T5 1 DW + + pslldq $12, \T2 // shift-L \T2 3 DWs + pxor \T2, \GH // first phase of the reduction complete + //////////////////////////////////////////////////////////////////////// + + //second phase of the reduction + movdqa \GH, \T2 // make 3 copies of \GH (in in \T2, \T3, \T4) for doing three shift operations + movdqa \GH, \T3 + movdqa \GH, \T4 + + psrld $1, \T2 // packed left shifting >> 1 + psrld $2, \T3 // packed left shifting >> 2 + psrld $7, \T4 // packed left shifting >> 7 + pxor \T3, \T2 // xor the shifted versions + pxor \T4, \T2 + + pxor \T5, \T2 + pxor \T2, \GH + pxor \T1, \GH // the result is in \T1 + +.endm // GHASH_MUL + +//////////////////////////////////////////////////////////////////////////////// +// PRECOMPUTE: Precompute HashKey_{2..8} 
and HashKey{,_{2..8}}_k. +// HasKey_i_k holds XORed values of the low and high parts of the Haskey_i. +//////////////////////////////////////////////////////////////////////////////// +.macro PRECOMPUTE GDATA, HK, T1, T2, T3, T4, T5, T6 + + movdqa \HK, \T4 + pshufd $0b01001110, \HK, \T1 + pxor \HK, \T1 + movdqu \T1, HashKey_k(\GDATA) + + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^2<<1 mod poly + movdqu \T4, HashKey_2(\GDATA) // [HashKey_2] = HashKey^2<<1 mod poly + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_2_k(\GDATA) + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^3<<1 mod poly + movdqu \T4, HashKey_3(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_3_k(\GDATA) + + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^4<<1 mod poly + movdqu \T4, HashKey_4(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_4_k(\GDATA) + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^5<<1 mod poly + movdqu \T4, HashKey_5(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_5_k(\GDATA) + + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^6<<1 mod poly + movdqu \T4, HashKey_6(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_6_k(\GDATA) + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^7<<1 mod poly + movdqu \T4, HashKey_7(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_7_k(\GDATA) + + GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^8<<1 mod poly + movdqu \T4, HashKey_8(\GDATA) + pshufd $0b01001110, \T4, \T1 + pxor \T4, \T1 + movdqu \T1, HashKey_8_k(\GDATA) + +.endm // PRECOMPUTE + + +//////////////////////////////////////////////////////////////////////////////// +// READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less +// than 16 bytes. +// Returns 0 if data has length 0. +// Input: The input data (INPUT), that data's length (LENGTH). +// Output: The packed xmm register (OUTPUT). +//////////////////////////////////////////////////////////////////////////////// +.macro READ_SMALL_DATA_INPUT OUTPUT, INPUT, LENGTH, \ + END_READ_LOCATION, COUNTER, TMP1 + + // clang compat: no local support + // LOCAL _byte_loop_1, _byte_loop_2, _done + + pxor \OUTPUT, \OUTPUT + mov \LENGTH, \COUNTER + mov \INPUT, \END_READ_LOCATION + add \LENGTH, \END_READ_LOCATION + xor \TMP1, \TMP1 + + + cmp $8, \COUNTER + jl _byte_loop_2_\@ + pinsrq $0, (\INPUT), \OUTPUT //Read in 8 bytes if they exists + je _done_\@ + + sub $8, \COUNTER + +_byte_loop_1_\@: //Read in data 1 byte at a time while data is left + shl $8, \TMP1 //This loop handles when 8 bytes were already read in + dec \END_READ_LOCATION + + //// mov BYTE(\TMP1), BYTE [\END_READ_LOCATION] + bytereg \TMP1 + movb (\END_READ_LOCATION), breg + dec \COUNTER + jg _byte_loop_1_\@ + pinsrq $1, \TMP1, \OUTPUT + jmp _done_\@ + +_byte_loop_2_\@: //Read in data 1 byte at a time while data is left + cmp $0, \COUNTER + je _done_\@ + shl $8, \TMP1 //This loop handles when no bytes were already read in + dec \END_READ_LOCATION + //// mov BYTE(\TMP1), BYTE [\END_READ_LOCATION] + bytereg \TMP1 + movb (\END_READ_LOCATION), breg + dec \COUNTER + jg _byte_loop_2_\@ + pinsrq $0, \TMP1, \OUTPUT +_done_\@: + +.endm // READ_SMALL_DATA_INPUT + + +//////////////////////////////////////////////////////////////////////////////// +// CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. 
+// Input: The input data (A_IN), that data's length (A_LEN), and the hash key +// (HASH_KEY). +// Output: The hash of the data (AAD_HASH). +//////////////////////////////////////////////////////////////////////////////// +.macro CALC_AAD_HASH A_IN, A_LEN, AAD_HASH, HASH_KEY, XTMP1, XTMP2, XTMP3, \ + XTMP4, XTMP5, T1, T2, T3, T4, T5 + + // clang compat: no local support + // LOCAL _get_AAD_loop16, _get_small_AAD_block, _CALC_AAD_done + + mov \A_IN, \T1 // T1 = AAD + mov \A_LEN, \T2 // T2 = aadLen + pxor \AAD_HASH, \AAD_HASH + + cmp $16, \T2 + jl _get_small_AAD_block_\@ + +_get_AAD_loop16_\@: + + movdqu (\T1), \XTMP1 + //byte-reflect the AAD data + pshufb SHUF_MASK(%rip), \XTMP1 + pxor \XTMP1, \AAD_HASH + GHASH_MUL \AAD_HASH, \HASH_KEY, \XTMP1, \XTMP2, \XTMP3, \XTMP4, \XTMP5 + + sub $16, \T2 + je _CALC_AAD_done_\@ + + add $16, \T1 + cmp $16, \T2 + jge _get_AAD_loop16_\@ + +_get_small_AAD_block_\@: + READ_SMALL_DATA_INPUT \XTMP1, \T1, \T2, \T3, \T4, \T5 + //byte-reflect the AAD data + pshufb SHUF_MASK(%rip), \XTMP1 + pxor \XTMP1, \AAD_HASH + GHASH_MUL \AAD_HASH, \HASH_KEY, \XTMP1, \XTMP2, \XTMP3, \XTMP4, \XTMP5 + +_CALC_AAD_done_\@: + +.endm // CALC_AAD_HASH + + + +//////////////////////////////////////////////////////////////////////////////// +// PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks +// between update calls. Requires the input data be at least 1 byte long. +// Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text +// (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN), the current data offset +// (DATA_OFFSET), and whether encoding or decoding (ENC_DEC). +// Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated +// GDATA_CTX. +// Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, +// xmm10, xmm11, xmm13 +//////////////////////////////////////////////////////////////////////////////// +.macro PARTIAL_BLOCK GDATA_KEY, GDATA_CTX, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ + PLAIN_CYPH_LEN, DATA_OFFSET, AAD_HASH, ENC_DEC + + // clang compat: no local support + // LOCAL _fewer_than_16_bytes, _data_read, _no_extra_mask_1 + // LOCAL _partial_incomplete_1, _dec_done, _no_extra_mask_2 + // LOCAL _partial_incomplete_2, _encode_done, _partial_fill + // LOCAL _count_set, _less_than_8_bytes_left, _partial_block_done + + mov PBlockLen(\GDATA_CTX), %r13 + cmp $0, %r13 + je _partial_block_done_\@ //Leave Macro if no partial blocks + + cmp $16, \PLAIN_CYPH_LEN //Read in input data without over reading + jl _fewer_than_16_bytes_\@ + XLDR (\PLAIN_CYPH_IN), %xmm1 //If more than 16 bytes of data, just fill the xmm register + jmp _data_read_\@ + +_fewer_than_16_bytes_\@: + lea (\PLAIN_CYPH_IN, \DATA_OFFSET), %r10 + READ_SMALL_DATA_INPUT %xmm1, %r10, \PLAIN_CYPH_LEN, %rax, %r12, %r15 + mov PBlockLen(\GDATA_CTX), %r13 + +_data_read_\@: //Finished reading in data + + + movdqu PBlockEncKey(\GDATA_CTX), %xmm9 //xmm9 = ctx_data.partial_block_enc_key + movdqu HashKey(\GDATA_KEY), %xmm13 + + lea SHIFT_MASK(%rip), %r12 + + add %r13, %r12 // adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + movdqu (%r12), %xmm2 // get the appropriate shuffle mask + pshufb %xmm2, %xmm9 // shift right r13 bytes + + .ifc \ENC_DEC, DEC + + movdqa %xmm1, %xmm3 + pxor %xmm1, %xmm9 // Cyphertext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r15 + add %r13, %r15 + sub $16, %r15 //Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge _no_extra_mask_1_\@ //Determine if 
if partial block is not being filled and shift mask accordingly + sub %r15, %r12 +_no_extra_mask_1_\@: + + movdqu (ALL_F - SHIFT_MASK)(%r12), %xmm1 // get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand %xmm1, %xmm9 // mask out bottom r13 bytes of xmm9 + + pand %xmm1, %xmm3 + pshufb SHUF_MASK(%rip), %xmm3 + pshufb %xmm2, %xmm3 + pxor %xmm3, \AAD_HASH + + + cmp $0, %r15 + jl _partial_incomplete_1_\@ + + GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 //GHASH computation for the last <16 Byte block + xor %rax, %rax + mov %rax, PBlockLen(\GDATA_CTX) + jmp _dec_done_\@ +_partial_incomplete_1_\@: + add \PLAIN_CYPH_LEN, PBlockLen(\GDATA_CTX) +_dec_done_\@: + movdqu \AAD_HASH, AadHash(\GDATA_CTX) + + .else // .ifc \ENC_DEC, DEC + + pxor %xmm1, %xmm9 // Plaintext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r15 + add %r13, %r15 + sub $16, %r15 //Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge _no_extra_mask_2_\@ //Determine if if partial block is not being filled and shift mask accordingly + sub %r15, %r12 +_no_extra_mask_2_\@: + + movdqu (ALL_F - SHIFT_MASK)(%r12), %xmm1 // get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand %xmm1, %xmm9 // mask out bottom r13 bytes of xmm9 + + pshufb SHUF_MASK(%rip), %xmm9 + pshufb %xmm2, %xmm9 + pxor %xmm9, \AAD_HASH + + cmp $0, %r15 + jl _partial_incomplete_2_\@ + + GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 //GHASH computation for the last <16 Byte block + xor %rax, %rax + mov %rax, PBlockLen(\GDATA_CTX) + jmp _encode_done_\@ +_partial_incomplete_2_\@: + add \PLAIN_CYPH_LEN, PBlockLen(\GDATA_CTX) +_encode_done_\@: + movdqu \AAD_HASH, AadHash(\GDATA_CTX) + + pshufb SHUF_MASK(%rip), %xmm9 // shuffle xmm9 back to output as ciphertext + pshufb %xmm2, %xmm9 + + .endif // .ifc \ENC_DEC, DEC + + + ////////////////////////////////////////////////////////// + // output encrypted Bytes + cmp $0, %r15 + jl _partial_fill_\@ + mov %r13, %r12 + mov $16, %r13 + sub %r12, %r13 // Set r13 to be the number of bytes to write out + jmp _count_set_\@ +_partial_fill_\@: + mov \PLAIN_CYPH_LEN, %r13 +_count_set_\@: + movq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_\@ + mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET) + add $8, \DATA_OFFSET + psrldq $8, %xmm9 + movq %xmm9, %rax + sub $8, %r13 +_less_than_8_bytes_left_\@: + mov %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET) + add $1, \DATA_OFFSET + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_\@ + ////////////////////////////////////////////////////////// +_partial_block_done_\@: +.endm // PARTIAL_BLOCK + +//////////////////////////////////////////////////////////////////////////////// +// INITIAL_BLOCKS: If a = number of total plaintext bytes; b = floor(a/16); +// \num_initial_blocks = b mod 8; encrypt the initial \num_initial_blocks +// blocks and apply ghash on the ciphertext. +// \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, r14 are used as a +// pointer only, not modified. +// Updated AAD_HASH is returned in \T3. 
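+// In outline (descriptive comment added in this port): each of the
+// \num_initial_blocks counter blocks is incremented, byte-swapped and run
+// through all AES round keys, then XORed with the input and written out;
+// the byte-reflected ciphertext blocks are folded into the running GHASH
+// one multiplication at a time. If at least 128 bytes of input remain,
+// eight further blocks are encrypted or decrypted ahead and the
+// byte-swapped ciphertext is staged in XMM1..XMM8 (XMM1 already combined
+// with the current hash), so that GHASH_8_ENCRYPT_8_PARALLEL can hash them
+// while it processes the next eight blocks.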
+//////////////////////////////////////////////////////////////////////////////// +.macro INITIAL_BLOCKS GDATA_KEY, GDATA_CTX, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ + LENGTH, DATA_OFFSET, num_initial_blocks, T1, HASH_KEY, \ + T3, T4, T5, CTR, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, \ + XMM7, XMM8, T6, T_key, ENC_DEC + + // clang compat: no local support + // LOCAL _initial_blocks_done + +.altmacro +.set i, (8-\num_initial_blocks) + xmmreg i, %i + movdqu \XMM8, xmmi // move AAD_HASH to temp reg + + // start AES for \num_initial_blocks blocks + movdqu CurCount(\GDATA_CTX), \CTR // \CTR = Y0 + + +.set i, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, xmmi + pshufb SHUF_MASK(%rip), xmmi // perform a 16Byte swap +.set i, (i+1) +.endr + +movdqu 16*0(\GDATA_KEY), \T_key +.set i, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + pxor \T_key, xmmi +.set i, (i+1) +.endr + +.set j, 1 +.rept NROUNDS // encrypt N blocks with 13 key rounds (11 for GCM192) +movdqu 16*j(\GDATA_KEY), \T_key +.set i, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + aesenc \T_key, xmmi +.set i, (i+1) +.endr + +.set j, (j+1) +.endr + +movdqu 16*j(\GDATA_KEY), \T_key // encrypt with last (14th) key round (12 for GCM192) +.set i, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + aesenclast \T_key, xmmi +.set i, (i+1) +.endr + +.set i, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + XLDR (\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, xmmi + XSTR xmmi, (\CYPH_PLAIN_OUT, \DATA_OFFSET) // write back ciphertext for \num_initial_blocks blocks + add $16, \DATA_OFFSET + .ifc \ENC_DEC, DEC + movdqa \T1, xmmi + .endif + pshufb SHUF_MASK(%rip), xmmi // prepare ciphertext for GHASH computations +.set i, (i+1) +.endr + + +.set i, (8-\num_initial_blocks) +.set j, (9-\num_initial_blocks) +.rept \num_initial_blocks + xmmreg i, %i + xmmreg j, %j + pxor xmmi, xmmj + GHASH_MUL xmmj, <\HASH_KEY>, <\T1>, <\T3>, <\T4>, <\T5>, <\T6> // apply GHASH on \num_initial_blocks blocks +.set i, (i+1) +.set j, (j+1) +.endr +.noaltmacro + + // \XMM8 has the current Hash Value + movdqa \XMM8, \T3 + + cmp $128, \LENGTH + jl _initial_blocks_done_\@ // no need for precomputed constants + +//////////////////////////////////////////////////////////////////////////////// +// Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM1 + pshufb SHUF_MASK(%rip), \XMM1 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM2 + pshufb SHUF_MASK(%rip), \XMM2 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM3 + pshufb SHUF_MASK(%rip), \XMM3 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM4 + pshufb SHUF_MASK(%rip), \XMM4 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM5 + pshufb SHUF_MASK(%rip), \XMM5 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM6 + pshufb SHUF_MASK(%rip), \XMM6 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM7 + pshufb SHUF_MASK(%rip), \XMM7 // perform a 16Byte swap + + paddd ONE(%rip), \CTR // INCR Y0 + movdqa \CTR, \XMM8 + pshufb SHUF_MASK(%rip), \XMM8 // perform a 16Byte swap + + movdqu 16*0(\GDATA_KEY), \T_key + pxor \T_key, \XMM1 + pxor \T_key, \XMM2 + pxor \T_key, \XMM3 + pxor \T_key, \XMM4 + pxor \T_key, \XMM5 + pxor \T_key, \XMM6 + pxor \T_key, \XMM7 + pxor 
\T_key, \XMM8 + +.set i, 1 +.rept NROUNDS // do early (13) rounds (11 for GCM192) + movdqu 16*i(\GDATA_KEY), \T_key + aesenc \T_key, \XMM1 + aesenc \T_key, \XMM2 + aesenc \T_key, \XMM3 + aesenc \T_key, \XMM4 + aesenc \T_key, \XMM5 + aesenc \T_key, \XMM6 + aesenc \T_key, \XMM7 + aesenc \T_key, \XMM8 +.set i, (i+1) +.endr + + movdqu 16*i(\GDATA_KEY), \T_key // do final key round + aesenclast \T_key, \XMM1 + aesenclast \T_key, \XMM2 + aesenclast \T_key, \XMM3 + aesenclast \T_key, \XMM4 + aesenclast \T_key, \XMM5 + aesenclast \T_key, \XMM6 + aesenclast \T_key, \XMM7 + aesenclast \T_key, \XMM8 + + XLDR 16*0(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM1 + XSTR \XMM1, 16*0(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM1 + .endif + + XLDR 16*1(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM2 + XSTR \XMM2, 16*1(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM2 + .endif + + XLDR 16*2(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM3 + XSTR \XMM3, 16*2(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM3 + .endif + + XLDR 16*3(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM4 + XSTR \XMM4, 16*3(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM4 + .endif + + XLDR 16*4(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM5 + XSTR \XMM5, 16*4(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM5 + .endif + + XLDR 16*5(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM6 + XSTR \XMM6, 16*5(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM6 + .endif + + XLDR 16*6(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM7 + XSTR \XMM7, 16*6(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM7 + .endif + + XLDR 16*7(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + pxor \T1, \XMM8 + XSTR \XMM8, 16*7(\CYPH_PLAIN_OUT, \DATA_OFFSET) + .ifc \ENC_DEC, DEC + movdqa \T1, \XMM8 + .endif + + add $128, \DATA_OFFSET + + pshufb SHUF_MASK(%rip), \XMM1 // perform a 16Byte swap + pxor \T3, \XMM1 // combine GHASHed value with the corresponding ciphertext + pshufb SHUF_MASK(%rip), \XMM2 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM3 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM4 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM5 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM6 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM7 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM8 // perform a 16Byte swap + +//////////////////////////////////////////////////////////////////////////////// + +_initial_blocks_done_\@: +.noaltmacro +.endm // INITIAL_BLOCKS + + +//////////////////////////////////////////////////////////////////////////////// +// GHASH_8_ENCRYPT_8_PARALLEL: Encrypt 8 blocks at a time and ghash the 8 +// previously encrypted ciphertext blocks. +// \GDATA (KEY), \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN are used as pointers only, +// not modified. 
+// \DATA_OFFSET is the data offset value +//////////////////////////////////////////////////////////////////////////////// +.macro GHASH_8_ENCRYPT_8_PARALLEL GDATA, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ + DATA_OFFSET, T1, T2, T3, T4, T5, T6, CTR, \ + XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, \ + XMM8, T7, loop_idx, ENC_DEC + + + movdqa \XMM1, \T7 + movdqu \XMM2, TMP2(%rsp) + movdqu \XMM3, TMP3(%rsp) + movdqu \XMM4, TMP4(%rsp) + movdqu \XMM5, TMP5(%rsp) + movdqu \XMM6, TMP6(%rsp) + movdqu \XMM7, TMP7(%rsp) + movdqu \XMM8, TMP8(%rsp) + + //////////////////////////////////////////////////////////////////////// + //// Karatsuba Method + + movdqa \T7, \T4 + pshufd $0b01001110, \T7, \T6 + pxor \T7, \T6 + .ifc \loop_idx, in_order + paddd ONE(%rip), \CTR // INCR CNT + .else + paddd ONEf(%rip), \CTR // INCR CNT + .endif + movdqu HashKey_8(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T4 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T7 // \T7 = a0*b0 + movdqu HashKey_8_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T6 // \T2 = (a1+a0)*(b1+b0) + movdqa \CTR, \XMM1 + + .ifc \loop_idx, in_order + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM2 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM3 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM4 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM5 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM6 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM7 + + paddd ONE(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM8 + + pshufb SHUF_MASK(%rip), \XMM1 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM2 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM3 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM4 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM5 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM6 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM7 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM8 // perform a 16Byte swap + + .else // .ifc \loop_idx, in_order + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM2 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM3 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM4 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM5 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM6 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM7 + + paddd ONEf(%rip), \CTR // INCR CNT + movdqa \CTR, \XMM8 + + .endif // .ifc \loop_idx, in_order + //////////////////////////////////////////////////////////////////////// + + movdqu 16*0(\GDATA), \T1 + pxor \T1, \XMM1 + pxor \T1, \XMM2 + pxor \T1, \XMM3 + pxor \T1, \XMM4 + pxor \T1, \XMM5 + pxor \T1, \XMM6 + pxor \T1, \XMM7 + pxor \T1, \XMM8 + + // \XMM6, \T5 hold the values for the two operands which are + // carry-less multiplied + //////////////////////////////////////////////////////////////////////// + // Karatsuba Method + movdqu TMP2(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_7(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_7_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*1(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*2(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc 
\T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + //////////////////////////////////////////////////////////////////////// + // Karatsuba Method + movdqu TMP3(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_6(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_6_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*3(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu TMP4(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_5(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_5_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*4(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*5(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu TMP5(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_4(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_4_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*6(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + + movdqu TMP6(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_3(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_3_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*7(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu TMP7(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey_2(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_2_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + pxor \T3, \T7 + pxor \T2, \T6 + + movdqu 16*8(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + + // \XMM8, \T5 hold the values for the two operands which are + // carry-less multiplied. 
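Each of these per-block GHASH steps uses the same Karatsuba trick: two PCLMULQDQs form a1*b1 and a0*b0, and a third, fed from the precomputed HashKey_N_k entry (which appears to hold the XOR of the two 64-bit halves of HashKey^N), forms (a1+a0)*(b1+b0); the middle term a1*b0 + a0*b1 then falls out by XORing the three products together. Below is a minimal plain-C model of that decomposition; clmul64() is a hypothetical bit-serial stand-in for PCLMULQDQ and the names are illustrative, not taken from the port.

	#include <stdint.h>

	typedef struct { uint64_t lo, hi; } u128;

	/* Carry-less 64x64 -> 128 bit multiply (what one PCLMULQDQ does). */
	static u128 clmul64(uint64_t a, uint64_t b)
	{
		u128 r = { 0, 0 };
		for (int i = 0; i < 64; i++) {
			if ((b >> i) & 1) {
				r.lo ^= a << i;
				if (i != 0)
					r.hi ^= a >> (64 - i);
			}
		}
		return (r);
	}

	/* 128x128 -> 256 bit carry-less multiply from three 64x64 products. */
	static void gf128_clmul_karatsuba(u128 a, u128 b, u128 *lo, u128 *hi)
	{
		u128 hh = clmul64(a.hi, b.hi);			/* a1*b1, like \T4 */
		u128 ll = clmul64(a.lo, b.lo);			/* a0*b0, like \T7 */
		u128 mm = clmul64(a.hi ^ a.lo, b.hi ^ b.lo);	/* (a1+a0)*(b1+b0) */

		mm.lo ^= hh.lo ^ ll.lo;		/* middle part, like \T6/\T2 */
		mm.hi ^= hh.hi ^ ll.hi;

		lo->lo = ll.lo;			/* fold the middle part in,  */
		lo->hi = ll.hi ^ mm.lo;		/* shifted left by 64 bits   */
		hi->lo = hh.lo ^ mm.hi;
		hi->hi = hh.hi;
	}

Trading the fourth multiply for a few XORs is presumably why every HashKey power in the precomputed table carries a companion _k entry.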
+ //////////////////////////////////////////////////////////////////////// + // Karatsuba Method + movdqu TMP8(%rsp), \T1 + movdqa \T1, \T3 + + pshufd $0b01001110, \T3, \T2 + pxor \T3, \T2 + movdqu HashKey(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 + movdqu HashKey_k(\GDATA), \T5 + pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) + pxor \T3, \T7 + pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part + + movdqu 16*9(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + +#ifdef GCM128_MODE + movdqu 16*10(\GDATA), \T5 +#endif +#ifdef GCM192_MODE + movdqu 16*10(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*11(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*12(\GDATA), \T5 // finish last key round +#endif +#ifdef GCM256_MODE + movdqu 16*10(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*11(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*12(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*13(\GDATA), \T1 + aesenc \T1, \XMM1 + aesenc \T1, \XMM2 + aesenc \T1, \XMM3 + aesenc \T1, \XMM4 + aesenc \T1, \XMM5 + aesenc \T1, \XMM6 + aesenc \T1, \XMM7 + aesenc \T1, \XMM8 + + movdqu 16*14(\GDATA), \T5 // finish last key round +#endif + +.altmacro +.set i, 0 +.set j, 1 +.rept 8 + xmmreg j, %j + XLDR 16*i(\PLAIN_CYPH_IN, \DATA_OFFSET), \T1 + + .ifc \ENC_DEC, DEC + movdqa \T1, \T3 + .endif + + pxor \T5, \T1 + aesenclast \T1, xmmj // XMM1:XMM8 + XSTR xmmj, 16*i(\CYPH_PLAIN_OUT, \DATA_OFFSET) // Write to the Output buffer + + .ifc \ENC_DEC, DEC + movdqa \T3, xmmj + .endif +.set i, (i+1) +.set j, (j+1) +.endr +.noaltmacro + + pxor \T6, \T2 + pxor \T4, \T2 + pxor \T7, \T2 + + + movdqa \T2, \T3 + pslldq $8, \T3 // shift-L \T3 2 DWs + psrldq $8, \T2 // shift-R \T2 2 DWs + pxor \T3, \T7 + pxor \T2, \T4 // accumulate the results in \T4:\T7 + + + + //first phase of the reduction + movdqa \T7, \T2 + movdqa \T7, \T3 + movdqa \T7, \T1 // move \T7 into \T2, \T3, \T1 in order to perform the three shifts independently + + pslld $31, \T2 // packed right shifting << 31 + pslld $30, \T3 // packed right shifting shift << 30 + pslld $25, \T1 // packed right shifting shift << 25 + pxor \T3, \T2 // xor the shifted versions + pxor \T1, \T2 + + movdqa \T2, \T5 + psrldq $4, \T5 // shift-R \T5 1 DW + + pslldq $12, \T2 // shift-L \T2 3 DWs + pxor \T2, \T7 // first phase of the reduction complete + + //////////////////////////////////////////////////////////////////////// + + pshufb SHUF_MASK(%rip), \XMM1 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM2 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM3 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM4 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM5 // perform a 16Byte 
swap + pshufb SHUF_MASK(%rip), \XMM6 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM7 // perform a 16Byte swap + pshufb SHUF_MASK(%rip), \XMM8 // perform a 16Byte swap + + //second phase of the reduction + movdqa \T7, \T2 // make 3 copies of \T7 (in in \T2, \T3, \T1) for doing three shift operations + movdqa \T7, \T3 + movdqa \T7, \T1 + + psrld $1, \T2 // packed left shifting >> 1 + psrld $2, \T3 // packed left shifting >> 2 + psrld $7, \T1 // packed left shifting >> 7 + pxor \T3, \T2 // xor the shifted versions + pxor \T1, \T2 + + pxor \T5, \T2 + pxor \T2, \T7 + pxor \T4, \T7 // the result is in \T4 + + + pxor \T7, \XMM1 + +.endm // GHASH_8_ENCRYPT_8_PARALLEL + +//////////////////////////////////////////////////////////////////////////////// +// GHASH_LAST_8: GHASH the last 8 ciphertext blocks. +//////////////////////////////////////////////////////////////////////////////// +.macro GHASH_LAST_8 GDATA, T1, T2, T3, T4, T5, T6, T7, \ + XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8 + + + // Karatsuba Method + movdqa \XMM1, \T6 + pshufd $0b01001110, \XMM1, \T2 + pxor \XMM1, \T2 + movdqu HashKey_8(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T6 // \T6 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM1 // \XMM1 = a0*b0 + movdqu HashKey_8_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + movdqa \XMM1, \T7 + movdqa \T2, \XMM1 // result in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM2, \T1 + pshufd $0b01001110, \XMM2, \T2 + pxor \XMM2, \T2 + movdqu HashKey_7(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM2 // \XMM2 = a0*b0 + movdqu HashKey_7_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM2, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM3, \T1 + pshufd $0b01001110, \XMM3, \T2 + pxor \XMM3, \T2 + movdqu HashKey_6(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM3 // \XMM3 = a0*b0 + movdqu HashKey_6_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM3, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM4, \T1 + pshufd $0b01001110, \XMM4, \T2 + pxor \XMM4, \T2 + movdqu HashKey_5(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM4 // \XMM4 = a0*b0 + movdqu HashKey_5_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM4, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM5, \T1 + pshufd $0b01001110, \XMM5, \T2 + pxor \XMM5, \T2 + movdqu HashKey_4(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM5 // \XMM5 = a0*b0 + movdqu HashKey_4_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM5, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM6, \T1 + pshufd $0b01001110, \XMM6, \T2 + pxor \XMM6, \T2 + movdqu HashKey_3(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM6 // \XMM6 = a0*b0 + movdqu HashKey_3_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM6, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + // Karatsuba Method + movdqa \XMM7, \T1 + pshufd $0b01001110, \XMM7, \T2 + pxor \XMM7, \T2 + movdqu HashKey_2(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 
= a1*b1 + + pclmulqdq $0x00, \T5, \XMM7 // \XMM7 = a0*b0 + movdqu HashKey_2_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM7, \T7 + pxor \T2, \XMM1 // results accumulated in \T6, \T7, \XMM1 + + + // Karatsuba Method + movdqa \XMM8, \T1 + pshufd $0b01001110, \XMM8, \T2 + pxor \XMM8, \T2 + movdqu HashKey(\GDATA), \T5 + pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 + + pclmulqdq $0x00, \T5, \XMM8 // \XMM8 = a0*b0 + movdqu HashKey_k(\GDATA), \T4 + pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) + + pxor \T1, \T6 + pxor \XMM8, \T7 + pxor \XMM1, \T2 + pxor \T6, \T2 + pxor \T7, \T2 // middle section of the temp results combined as in Karatsuba algorithm + + + movdqa \T2, \T4 + pslldq $8, \T4 // shift-L \T4 2 DWs + psrldq $8, \T2 // shift-R \T2 2 DWs + pxor \T4, \T7 + pxor \T2, \T6 // <\T6:\T7> holds the result of the accumulated carry-less multiplications + + + //first phase of the reduction + movdqa \T7, \T2 + movdqa \T7, \T3 + movdqa \T7, \T4 // move \T7 into \T2, \T3, \T4 in order to perform the three shifts independently + + pslld $31, \T2 // packed right shifting << 31 + pslld $30, \T3 // packed right shifting shift << 30 + pslld $25, \T4 // packed right shifting shift << 25 + pxor \T3, \T2 // xor the shifted versions + pxor \T4, \T2 + + movdqa \T2, \T1 + psrldq $4, \T1 // shift-R \T1 1 DW + + pslldq $12, \T2 // shift-L \T2 3 DWs + pxor \T2, \T7 // first phase of the reduction complete + //////////////////////////////////////////////////////////////////////// + + //second phase of the reduction + movdqa \T7, \T2 // make 3 copies of \T7 (in in \T2, \T3, \T4) for doing three shift operations + movdqa \T7, \T3 + movdqa \T7, \T4 + + psrld $1, \T2 // packed left shifting >> 1 + psrld $2, \T3 // packed left shifting >> 2 + psrld $7, \T4 // packed left shifting >> 7 + pxor \T3, \T2 // xor the shifted versions + pxor \T4, \T2 + + pxor \T1, \T2 + pxor \T2, \T7 + pxor \T7, \T6 // the result is in \T6 + +.endm // GHASH_LAST_8 + +//////////////////////////////////////////////////////////////////////////////// +// ENCRYPT_SINGLE_BLOCK: Encrypt a single block. +//////////////////////////////////////////////////////////////////////////////// +.macro ENCRYPT_SINGLE_BLOCK GDATA, ST, T1 + + movdqu 16*0(\GDATA), \T1 + pxor \T1, \ST + +.set i, 1 +.rept NROUNDS + movdqu 16*i(\GDATA), \T1 + aesenc \T1, \ST + +.set i, (i+1) +.endr + movdqu 16*i(\GDATA), \T1 + aesenclast \T1, \ST +.endm // ENCRYPT_SINGLE_BLOCK + + +//////////////////////////////////////////////////////////////////////////////// +// FUNC_SAVE: Save clobbered regs on the stack. 
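Before the register save/restore plumbing below, one note on verification: both GHASH_8_ENCRYPT_8_PARALLEL and GHASH_LAST_8 finish by reducing the accumulated 256-bit product modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1 using the two shift/XOR phases above. Changes to that code are easiest to check against the bit-serial reference multiply from NIST SP 800-38D. The sketch below is that simple reference model in plain C (our naming), not the SIMD algorithm itself; it works on 16-byte big-endian blocks and ignores the SHUF_MASK byte swaps.

	#include <stdint.h>
	#include <string.h>

	/* Reference GHASH multiply: out = X * Y in GF(2^128), GCM bit order. */
	void ghash_mul_ref(const uint8_t X[16], const uint8_t Y[16],
	    uint8_t out[16])
	{
		uint8_t Z[16] = { 0 };
		uint8_t V[16];

		memcpy(V, Y, 16);
		for (int i = 0; i < 128; i++) {
			/* bit i of X, leftmost bit first */
			if ((X[i / 8] >> (7 - (i % 8))) & 1) {
				for (int j = 0; j < 16; j++)
					Z[j] ^= V[j];
			}
			int lsb = V[15] & 1;
			for (int j = 15; j > 0; j--)	/* V >>= 1 */
				V[j] = (uint8_t)((V[j] >> 1) | (V[j - 1] << 7));
			V[0] >>= 1;
			if (lsb)
				V[0] ^= 0xe1;	/* x^128 = x^7 + x^2 + x + 1 */
		}
		memcpy(out, Z, 16);
	}
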
+//////////////////////////////////////////////////////////////////////////////// +.macro FUNC_SAVE + //// Required for Update/GMC_ENC + //the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + push %rsi + mov %rsp, %r14 + + sub $(VARIABLE_OFFSET), %rsp + and $~63, %rsp + +#if __OUTPUT_FORMAT__ == win64 + // xmm6:xmm15 need to be maintained for Windows + movdqu %xmm6, (LOCAL_STORAGE + 0*16)(%rsp) + movdqu %xmm7, (LOCAL_STORAGE + 1*16)(%rsp) + movdqu %xmm8, (LOCAL_STORAGE + 2*16)(%rsp) + movdqu %xmm9, (LOCAL_STORAGE + 3*16)(%rsp) + movdqu %xmm10, (LOCAL_STORAGE + 4*16)(%rsp) + movdqu %xmm11, (LOCAL_STORAGE + 5*16)(%rsp) + movdqu %xmm12, (LOCAL_STORAGE + 6*16)(%rsp) + movdqu %xmm13, (LOCAL_STORAGE + 7*16)(%rsp) + movdqu %xmm14, (LOCAL_STORAGE + 8*16)(%rsp) + movdqu %xmm15, (LOCAL_STORAGE + 9*16)(%rsp) + + mov arg(5), arg5 // XXXX [r14 + STACK_OFFSET + 8*5] +#endif +.endm // FUNC_SAVE + +//////////////////////////////////////////////////////////////////////////////// +// FUNC_RESTORE: Restore clobbered regs from the stack. +//////////////////////////////////////////////////////////////////////////////// +.macro FUNC_RESTORE + +#if __OUTPUT_FORMAT__ == win64 + movdqu (LOCAL_STORAGE + 9*16)(%rsp), %xmm15 + movdqu (LOCAL_STORAGE + 8*16)(%rsp), %xmm14 + movdqu (LOCAL_STORAGE + 7*16)(%rsp), %xmm13 + movdqu (LOCAL_STORAGE + 6*16)(%rsp), %xmm12 + movdqu (LOCAL_STORAGE + 5*16)(%rsp), %xmm11 + movdqu (LOCAL_STORAGE + 4*16)(%rsp), %xmm10 + movdqu (LOCAL_STORAGE + 3*16)(%rsp), %xmm9 + movdqu (LOCAL_STORAGE + 2*16)(%rsp), %xmm8 + movdqu (LOCAL_STORAGE + 1*16)(%rsp), %xmm7 + movdqu (LOCAL_STORAGE + 0*16)(%rsp), %xmm6 +#endif + + // Required for Update/GMC_ENC + mov %r14, %rsp + pop %rsi + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm // FUNC_RESTORE + + +//////////////////////////////////////////////////////////////////////////////// +// GCM_INIT: Initializes a gcm_context_data struct to prepare for +// encoding/decoding. +// Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, +// Additional Authentication data (A_IN), Additional Data length (A_LEN). +// Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized +// other parts of GDATA. +// Clobbers rax, r10-r13 and xmm0-xmm6 +//////////////////////////////////////////////////////////////////////////////// +.macro GCM_INIT GDATA_KEY, GDATA_CTX, IV, A_IN, A_LEN + +#define AAD_HASH %xmm0 +#define SUBHASH %xmm1 + + movdqu HashKey(\GDATA_KEY), SUBHASH + + CALC_AAD_HASH \A_IN, \A_LEN, AAD_HASH, SUBHASH, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %r10, %r11, %r12, %r13, %rax + pxor %xmm3, %xmm2 + mov \A_LEN, %r10 + + movdqu AAD_HASH, AadHash(\GDATA_CTX) // ctx_data.aad hash = aad_hash + mov %r10, AadLen(\GDATA_CTX) // ctx_data.aad_length = aad_length + xor %r10, %r10 + mov %r10, InLen(\GDATA_CTX) // ctx_data.in_length = 0 + mov %r10, PBlockLen(\GDATA_CTX) // ctx_data.partial_block_length = 0 + movdqu %xmm2, PBlockEncKey(\GDATA_CTX) // ctx_data.partial_block_enc_key = 0 + mov \IV, %r10 + movdqa ONEf(%rip), %xmm2 // read 12 IV bytes and pad with 0x00000001 + pinsrq $0, (%r10), %xmm2 + pinsrd $2, 8(%r10), %xmm2 + movdqu %xmm2, OrigIV(\GDATA_CTX) // ctx_data.orig_IV = iv + + pshufb SHUF_MASK(%rip), %xmm2 + + movdqu %xmm2, CurCount(\GDATA_CTX) // ctx_data.current_counter = iv +.endm // GCM_INIT + + +//////////////////////////////////////////////////////////////////////////////// +// GCM_ENC_DEC Encodes/Decodes given data. 
Assumes that the passed +// gcm_context_data struct has been initialized by GCM_INIT. +// Requires the input data be at least 1 byte long because of +// READ_SMALL_INPUT_DATA. +// Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), +// input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN) and whether +// encoding or decoding (ENC_DEC). +// Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated +// GDATA_CTX +// Clobbers rax, r10-r15, and xmm0-xmm15 +//////////////////////////////////////////////////////////////////////////////// +.macro GCM_ENC_DEC GDATA_KEY, GDATA_CTX, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ + PLAIN_CYPH_LEN, ENC_DEC + +#define DATA_OFFSET %r11 + + // clang compat: no local support + // LOCAL _initial_num_blocks_is_7, _initial_num_blocks_is_6 + // LOCAL _initial_num_blocks_is_5, _initial_num_blocks_is_4 + // LOCAL _initial_num_blocks_is_3, _initial_num_blocks_is_2 + // LOCAL _initial_num_blocks_is_1, _initial_num_blocks_is_0 + // LOCAL _initial_blocks_encrypted, _encrypt_by_8_new, _encrypt_by_8 + // LOCAL _eight_cipher_left, _zero_cipher_left, _large_enough_update + // LOCAL _data_read, _less_than_8_bytes_left, _multiple_of_16_bytes + +// Macro flow: +// calculate the number of 16byte blocks in the message +// process (number of 16byte blocks) mod 8 '_initial_num_blocks_is_# .. _initial_blocks_encrypted' +// process 8 16 byte blocks at a time until all are done '_encrypt_by_8_new .. _eight_cipher_left' +// if there is a block of less tahn 16 bytes process it '_zero_cipher_left .. _multiple_of_16_bytes' + + cmp $0, \PLAIN_CYPH_LEN + je _multiple_of_16_bytes_\@ + + xor DATA_OFFSET, DATA_OFFSET + add \PLAIN_CYPH_LEN, InLen(\GDATA_CTX) //Update length of data processed + movdqu HashKey(\GDATA_KEY), %xmm13 // xmm13 = HashKey + movdqu AadHash(\GDATA_CTX), %xmm8 + + + PARTIAL_BLOCK \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, \PLAIN_CYPH_LEN, DATA_OFFSET, %xmm8, \ENC_DEC + + mov \PLAIN_CYPH_LEN, %r13 // save the number of bytes of plaintext/ciphertext + sub DATA_OFFSET, %r13 + mov %r13, %r10 //save the amount of data left to process in r10 + and $-16, %r13 // r13 = r13 - (r13 mod 16) + + mov %r13, %r12 + shr $4, %r12 + and $7, %r12 + jz _initial_num_blocks_is_0_\@ + + + cmp $7, %r12 + je _initial_num_blocks_is_7_\@ + cmp $6, %r12 + je _initial_num_blocks_is_6_\@ + cmp $5, %r12 + je _initial_num_blocks_is_5_\@ + cmp $4, %r12 + je _initial_num_blocks_is_4_\@ + cmp $3, %r12 + je _initial_num_blocks_is_3_\@ + cmp $2, %r12 + je _initial_num_blocks_is_2_\@ + + jmp _initial_num_blocks_is_1_\@ + +_initial_num_blocks_is_7_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*7), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_6_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*6), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_5_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*5), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_4_\@: + 
INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*4), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_3_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*3), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_2_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*2), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_1_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $(16*1), %r13 + jmp _initial_blocks_encrypted_\@ + +_initial_num_blocks_is_0_\@: + INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + +_initial_blocks_encrypted_\@: + cmp $0, %r13 + je _zero_cipher_left_\@ + + sub $128, %r13 + je _eight_cipher_left_\@ + + movd %xmm9, %r15d + and $255, %r15d + pshufb SHUF_MASK(%rip), %xmm9 + + +_encrypt_by_8_new_\@: + cmp $(255-8), %r15d + jg _encrypt_by_8_\@ + + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL \GDATA_KEY, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, DATA_OFFSET, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + add $128, DATA_OFFSET + sub $128, %r13 + jne _encrypt_by_8_new_\@ + + pshufb SHUF_MASK(%rip), %xmm9 + jmp _eight_cipher_left_\@ + +_encrypt_by_8_\@: + pshufb SHUF_MASK(%rip), %xmm9 + add $8, %r15b + + GHASH_8_ENCRYPT_8_PARALLEL \GDATA_KEY, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, DATA_OFFSET, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + pshufb SHUF_MASK(%rip), %xmm9 + add $128, DATA_OFFSET + sub $128, %r13 + jne _encrypt_by_8_new_\@ + + pshufb SHUF_MASK(%rip), %xmm9 + + + +_eight_cipher_left_\@: + GHASH_LAST_8 \GDATA_KEY, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + + +_zero_cipher_left_\@: + movdqu %xmm14, AadHash(\GDATA_CTX) + movdqu %xmm9, CurCount(\GDATA_CTX) + + mov %r10, %r13 + and $15, %r13 // r13 = (\PLAIN_CYPH_LEN mod 16) + + je _multiple_of_16_bytes_\@ + + mov %r13, PBlockLen(\GDATA_CTX) // my_ctx.data.partial_blck_length = r13 + // handle the last <16 Byte block seperately + + paddd ONE(%rip), %xmm9 // INCR CNT to get Yn + movdqu %xmm9, CurCount(\GDATA_CTX) // my_ctx.data.current_counter = xmm9 + pshufb SHUF_MASK(%rip), %xmm9 + ENCRYPT_SINGLE_BLOCK \GDATA_KEY, %xmm9, %xmm2 // E(K, Yn) + movdqu %xmm9, PBlockEncKey(\GDATA_CTX) // my_ctx_data.partial_block_enc_key = xmm9 + + cmp $16, \PLAIN_CYPH_LEN + jge _large_enough_update_\@ + + lea (\PLAIN_CYPH_IN, DATA_OFFSET), %r10 + READ_SMALL_DATA_INPUT %xmm1, %r10, %r13, %r12, %r15, %rax + lea (SHIFT_MASK + 16)(%rip), %r12 + sub %r13, %r12 + jmp 
_data_read_\@ + +_large_enough_update_\@: + sub $16, DATA_OFFSET + add %r13, DATA_OFFSET + + movdqu (\PLAIN_CYPH_IN, DATA_OFFSET), %xmm1 // receive the last <16 Byte block + + sub %r13, DATA_OFFSET + add $16, DATA_OFFSET + + lea (SHIFT_MASK + 16)(%rip), %r12 + sub %r13, %r12 // adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16) + movdqu (%r12), %xmm2 // get the appropriate shuffle mask + pshufb %xmm2, %xmm1 // shift right 16-r13 bytes +_data_read_\@: + .ifc \ENC_DEC, DEC + + movdqa %xmm1, %xmm2 + pxor %xmm1, %xmm9 // Plaintext XOR E(K, Yn) + movdqu (ALL_F - SHIFT_MASK)(%r12), %xmm1 // get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand %xmm1, %xmm9 // mask out top 16-r13 bytes of xmm9 + pand %xmm1, %xmm2 + pshufb SHUF_MASK(%rip), %xmm2 + pxor %xmm2, %xmm14 + movdqu %xmm14, AadHash(\GDATA_CTX) + + .else // .ifc \ENC_DEC, DEC + + pxor %xmm1, %xmm9 // Plaintext XOR E(K, Yn) + movdqu (ALL_F - SHIFT_MASK)(%r12), %xmm1 // get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand %xmm1, %xmm9 // mask out top 16-r13 bytes of xmm9 + pshufb SHUF_MASK(%rip), %xmm9 + pxor %xmm9, %xmm14 + movdqu %xmm14, AadHash(\GDATA_CTX) + + pshufb SHUF_MASK(%rip), %xmm9 // shuffle xmm9 back to output as ciphertext + + .endif // .ifc \ENC_DEC, DEC + + + ////////////////////////////////////////////////////////// + // output r13 Bytes + movq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_\@ + + mov %rax, (\CYPH_PLAIN_OUT, DATA_OFFSET) + add $8, DATA_OFFSET + psrldq $8, %xmm9 + movq %xmm9, %rax + sub $8, %r13 + +_less_than_8_bytes_left_\@: + movb %al, (\CYPH_PLAIN_OUT, DATA_OFFSET) + add $1, DATA_OFFSET + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_\@ + ////////////////////////////////////////////////////////// + +_multiple_of_16_bytes_\@: + +.endm // GCM_ENC_DEC + + +//////////////////////////////////////////////////////////////////////////////// +// GCM_COMPLETE: Finishes Encyrption/Decryption of last partial block after +// GCM_UPDATE finishes. +// Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and +// whether encoding or decoding (ENC_DEC). 
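In terms of the GCM spec, GCM_COMPLETE folds the bit lengths of the AAD and of the processed text into the hash, runs one more GHASH multiply, XORs in E(K, Y0) and truncates the result to 8, 12 or 16 bytes. A hedged C sketch of just that arithmetic follows; it reuses ghash_mul_ref() from the reference sketch above, takes the already encrypted Y0 block as an input rather than pulling in AES, and ignores the SHUF_MASK byte reflections the assembly performs around the multiply.

	#include <stdint.h>
	#include <stddef.h>

	void ghash_mul_ref(const uint8_t X[16], const uint8_t Y[16],
	    uint8_t out[16]);

	static void gcm_complete_sketch(uint8_t ghash[16],
	    const uint8_t hash_key[16], uint64_t aad_len_bytes,
	    uint64_t text_len_bytes, const uint8_t enc_y0[16],
	    uint8_t *tag, size_t tag_len)
	{
		uint8_t lens[16];
		uint64_t abits = aad_len_bytes * 8;
		uint64_t cbits = text_len_bytes * 8;

		/* len(A) || len(C), 64-bit big-endian each, in bits. */
		for (int i = 0; i < 8; i++) {
			lens[i] = (uint8_t)(abits >> (56 - 8 * i));
			lens[8 + i] = (uint8_t)(cbits >> (56 - 8 * i));
		}

		for (int i = 0; i < 16; i++)		/* ghash ^= len block */
			ghash[i] ^= lens[i];
		ghash_mul_ref(ghash, hash_key, ghash);	/* final GHASH multiply */

		/* T = MSB_tag_len(GHASH ^ E(K, Y0)); tag_len is 8, 12 or 16. */
		for (size_t i = 0; i < tag_len && i < 16; i++)
			tag[i] = ghash[i] ^ enc_y0[i];
	}
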
+// Output: Authorization Tag (AUTH_TAG) and Authorization Tag length +// (AUTH_TAG_LEN) +// Clobbers %rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 +//////////////////////////////////////////////////////////////////////////////// +.macro GCM_COMPLETE GDATA_KEY, GDATA_CTX, AUTH_TAG, AUTH_TAG_LEN, ENC_DEC + +#define PLAIN_CYPH_LEN %rax + + // clang compat: no local support + // LOCAL _partial_done, _return_T, _T_8, _T_12, _T_16, _return_T_done + + mov PBlockLen(\GDATA_CTX), %r12 // r12 = aadLen (number of bytes) + movdqu AadHash(\GDATA_CTX), %xmm14 + movdqu HashKey(\GDATA_KEY), %xmm13 + + cmp $0, %r12 + + je _partial_done_\@ + + GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 //GHASH computation for the last <16 Byte block + movdqu %xmm14, AadHash(\GDATA_CTX) + +_partial_done_\@: + + mov AadLen(\GDATA_CTX), %r12 // r12 = aadLen (number of bytes) + mov InLen(\GDATA_CTX), PLAIN_CYPH_LEN + + shl $3, %r12 // convert into number of bits + movd %r12d, %xmm15 // len(A) in xmm15 + + shl $3, PLAIN_CYPH_LEN // len(C) in bits (*128) + movq PLAIN_CYPH_LEN, %xmm1 + pslldq $8, %xmm15 // xmm15 = len(A)|| 0x0000000000000000 + pxor %xmm1, %xmm15 // xmm15 = len(A)||len(C) + + pxor %xmm15, %xmm14 + GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 // final GHASH computation + pshufb SHUF_MASK(%rip), %xmm14 // perform a 16Byte swap + movdqu OrigIV(\GDATA_CTX), %xmm9 // xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK \GDATA_KEY, %xmm9, %xmm2 // E(K, Y0) + + pxor %xmm14, %xmm9 + +_return_T_\@: + mov \AUTH_TAG, %r10 // r10 = authTag + mov \AUTH_TAG_LEN, %r11 // r11 = auth_tag_len + + cmp $16, %r11 + je _T_16_\@ + + cmp $12, %r11 + je _T_12_\@ + +_T_8_\@: + movq %xmm9, %rax + mov %rax, (%r10) + jmp _return_T_done_\@ + +_T_12_\@: + movq %xmm9, %rax + mov %rax, (%r10) + psrldq $8, %xmm9 + movd %xmm9, %eax + mov %eax, 8(%r10) + jmp _return_T_done_\@ + +_T_16_\@: + movdqu %xmm9, (%r10) + +_return_T_done_\@: +.endm //GCM_COMPLETE + + +#if 1 + + .balign 16 +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_precomp_{128,256}_sse +// (struct gcm_key_data *key_data); +//////////////////////////////////////////////////////////////////////////////// +#if FUNCT_EXTENSION != _nt +.global FN_NAME(precomp,_) +FN_NAME(precomp,_): + + endbranch + + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + sub $(VARIABLE_OFFSET), %rsp + and $(~63), %rsp // align rsp to 64 bytes + +#if __OUTPUT_FORMAT__ == win64 + // only xmm6 needs to be maintained + movdqu %xmm6, (LOCAL_STORAGE + 0*16)(%rsp) +#endif + + pxor %xmm6, %xmm6 + ENCRYPT_SINGLE_BLOCK arg1, %xmm6, %xmm2 // xmm6 = HashKey + + pshufb SHUF_MASK(%rip), %xmm6 + /////////////// PRECOMPUTATION of HashKey<<1 mod poly from the HashKey + movdqa %xmm6, %xmm2 + psllq $1, %xmm6 + psrlq $63, %xmm2 + movdqa %xmm2, %xmm1 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + por %xmm2, %xmm6 + + //reduction + pshufd $0b00100100, %xmm1, %xmm2 + pcmpeqd TWOONE(%rip), %xmm2 + pand POLY(%rip), %xmm2 + pxor %xmm2, %xmm6 // xmm6 holds the HashKey<<1 mod poly + /////////////////////////////////////////////////////////////////////// + movdqu %xmm6, HashKey(arg1) // store HashKey<<1 mod poly + + PRECOMPUTE arg1, %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + +#if __OUTPUT_FORMAT__ == win64 + movdqu (LOCAL_STORAGE + 0*16)(%rsp), %xmm6 +#endif + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + ret +#endif // _nt + + +//////////////////////////////////////////////////////////////////////////////// +//void 
aes_gcm_init_128_sse / aes_gcm_init_256_sse ( +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *iv, +// const u8 *aad, +// u64 aad_len); +//////////////////////////////////////////////////////////////////////////////// +#if FUNCT_EXTENSION != _nt +.global FN_NAME(init,_) +FN_NAME(init,_): + endbranch + + push %r12 + push %r13 +#if __OUTPUT_FORMAT__ == win64 + push arg5 + sub $(1*16), %rsp + movdqu %xmm6, (0*16)(%rsp) + mov (1*16 + 8*3 + 8*5)(%rsp), arg5 +#endif + + GCM_INIT arg1, arg2, arg3, arg4, arg5 + +#if __OUTPUT_FORMAT__ == win64 + movdqu (0*16)(%rsp), %xmm6 + add $(1*16), %rsp + pop arg5 +#endif + pop %r13 + pop %r12 + ret +#endif // _nt + + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_enc_128_update_sse / aes_gcm_enc_256_update_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *out, +// const u8 *in, +// u64 plaintext_len); +//////////////////////////////////////////////////////////////////////////////// +.global FN_NAME(enc,_update_) +FN_NAME(enc,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + FUNC_RESTORE + + ret + + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_dec_256_update_sse / aes_gcm_dec_256_update_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *out, +// const u8 *in, +// u64 plaintext_len); +//////////////////////////////////////////////////////////////////////////////// +.global FN_NAME(dec,_update_) +FN_NAME(dec,_update_): + endbranch + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + FUNC_RESTORE + + ret + + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_256_finalize_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *auth_tag, +// u64 auth_tag_len); +//////////////////////////////////////////////////////////////////////////////// +#if FUNCT_EXTENSION != _nt +.global FN_NAME(enc,_finalize_) +FN_NAME(enc,_finalize_): + + endbranch + + push %r12 + +#if __OUTPUT_FORMAT__ == win64 + // xmm6:xmm15 need to be maintained for Windows + sub $(5*16), %rsp + movdqu %xmm6, (0*16)(%rsp) + movdqu %xmm9, (1*16)(%rsp) + movdqu %xmm11, (2*16)(%rsp) + movdqu %xmm14, (3*16)(%rsp) + movdqu %xmm15, (4*16)(%rsp) +#endif + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC + +#if __OUTPUT_FORMAT__ == win64 + movdqu (4*16)(%rsp), %xmm15 + movdqu (3*16)(%rsp), %xmm14 + movdqu (2*16)(%rsp), %xmm11 + movdqu (1*16)(%rsp), %xmm9 + movdqu (0*16)(%rsp), %xmm6 + add $(5*16), %rsp +#endif + + pop %r12 + ret +#endif // _nt + + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_256_finalize_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *auth_tag, +// u64 auth_tag_len); +//////////////////////////////////////////////////////////////////////////////// +#if FUNCT_EXTENSION != _nt +.global FN_NAME(dec,_finalize_) +FN_NAME(dec,_finalize_): + + endbranch + + push %r12 + +#if __OUTPUT_FORMAT == win64 + // xmm6:xmm15 need to be maintained for Windows + sub $(5*16), %rsp + movdqu %xmm6, (0*16)(%rsp) + movdqu %xmm9, (1*16)(%rsp) + movdqu %xmm11, (2*16)(%rsp) + movdqu %xmm14, (3*16)(%rsp) + movdqu %xmm15, (4*16)(%rsp) +#endif + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC + 
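Stepping back from the individual wrappers: callers chain the entry points in this file in the usual streaming pattern. Below is a hypothetical caller-side sketch only, mirroring the prototypes spelled out in the comments above for the 128-bit, non-_nt variants; it assumes the AES key schedule inside key_data was expanded beforehand and that aes_gcm_precomp_128_sse() has filled in the hash keys.

	#include <stdint.h>

	typedef uint8_t u8;
	typedef uint64_t u64;

	struct gcm_key_data;
	struct gcm_context_data;

	void aes_gcm_init_128_sse(const struct gcm_key_data *key_data,
	    struct gcm_context_data *context_data, u8 *iv,
	    const u8 *aad, u64 aad_len);
	void aes_gcm_enc_128_update_sse(const struct gcm_key_data *key_data,
	    struct gcm_context_data *context_data, u8 *out,
	    const u8 *in, u64 plaintext_len);
	void aes_gcm_enc_128_finalize_sse(const struct gcm_key_data *key_data,
	    struct gcm_context_data *context_data, u8 *auth_tag, u64 auth_tag_len);

	/* Encrypt a message delivered in two non-empty chunks, 16-byte tag. */
	static void
	encrypt_in_two_chunks(const struct gcm_key_data *key,
	    struct gcm_context_data *ctx, u8 iv[12],
	    const u8 *aad, u64 aad_len,
	    const u8 *pt, u64 len, u8 *ct, u8 tag[16])
	{
		u64 half = len / 2;

		aes_gcm_init_128_sse(key, ctx, iv, aad, aad_len);
		aes_gcm_enc_128_update_sse(key, ctx, ct, pt, half);
		aes_gcm_enc_128_update_sse(key, ctx, ct + half, pt + half,
		    len - half);
		aes_gcm_enc_128_finalize_sse(key, ctx, tag, 16);
	}

The decryption path is chained the same way with the dec_ variants; finalize then produces the tag the caller compares against the received one.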
+#if __OUTPUT_FORMAT__ == win64 + movdqu (4*16)(%rsp), %xmm15 + movdqu (3*16)(%rsp), %xmm14 + movdqu (2*16)(%rsp), %xmm11 + movdqu (1*16)(%rsp), %xmm9 + movdqu (0*16)(%rsp), %xmm6 + add $(5*16), %rsp +#endif + + pop %r12 + ret +#endif // _nt + + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_enc_128_sse / aes_gcm_enc_256_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *out, +// const u8 *in, +// u64 plaintext_len, +// u8 *iv, +// const u8 *aad, +// u64 aad_len, +// u8 *auth_tag, +// u64 auth_tag_len)// +//////////////////////////////////////////////////////////////////////////////// +.global FN_NAME(enc,_) +FN_NAME(enc,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC + FUNC_RESTORE + + ret + +//////////////////////////////////////////////////////////////////////////////// +//void aes_gcm_dec_128_sse / aes_gcm_dec_256_sse +// const struct gcm_key_data *key_data, +// struct gcm_context_data *context_data, +// u8 *out, +// const u8 *in, +// u64 plaintext_len, +// u8 *iv, +// const u8 *aad, +// u64 aad_len, +// u8 *auth_tag, +// u64 auth_tag_len)// +//////////////////////////////////////////////////////////////////////////////// +.global FN_NAME(dec,_) +FN_NAME(dec,_): + endbranch + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC + FUNC_RESTORE + + ret + +.global FN_NAME(this_is_gas,_) +FN_NAME(this_is_gas,_): + endbranch + FUNC_SAVE + FUNC_RESTORE + ret + +#else + // GAS doesnt't provide the linenuber in the macro + //////////////////////// + // GHASH_MUL xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 + // PRECOMPUTE rax, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 + // READ_SMALL_DATA_INPUT xmm1, r10, 8, rax, r12, r15 + // ENCRYPT_SINGLE_BLOCK rax, xmm0, xmm1 + // INITIAL_BLOCKS rdi,rsi,rdx,rcx,r13,r11,7,xmm12,xmm13,xmm14,xmm15,xmm11,xmm9,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8,xmm10,xmm0,ENC + // CALC_AAD_HASH [r14+8*5+8*1],[r14+8*5+8*2],xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,r10,r11,r12,r13,rax + // READ_SMALL_DATA_INPUT xmm2, r10, r11, r12, r13, rax + // PARTIAL_BLOCK rdi,rsi,rdx,rcx,r8,r11,xmm8,ENC + // GHASH_8_ENCRYPT_8_PARALLEL rdi,rdx,rcx,r11,xmm0,xmm10,xmm11,xmm12,xmm13,xmm14,xmm9,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8,xmm15,out_order,ENC + //GHASH_LAST_8 rdi,xmm0,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8 +#endif diff --git a/module/icp/asm-x86_64/modes/isalc_reg_sizes.S b/module/icp/asm-x86_64/modes/isalc_reg_sizes.S new file mode 100644 index 000000000000..d77291ce58a1 --- /dev/null +++ b/module/icp/asm-x86_64/modes/isalc_reg_sizes.S @@ -0,0 +1,221 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Copyright(c) 2011-2019 Intel Corporation All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES// LOSS OF USE, +// DATA, OR PROFITS// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifndef _REG_SIZES_ASM_ +#define _REG_SIZES_ASM_ + + +// define d, w and b variants for registers + +.macro dwordreg reg + .if \reg == %r8 || \reg == %r9 || \reg == %r10 || \reg == %r11 || \reg == %r12 || \reg == %r13 || \reg == %r14 || \reg == %r15 + .set dreg, \reg\()d + .elseif \reg == %rax + .set dreg, %eax + .elseif \reg == %rcx + .set dreg, %ecx + .elseif \reg == %rdx + .set dreg, %edx + .elseif \reg == %rbx + .set dreg, %ebx + .elseif \reg == %rsp + .set dreg, %esp + .elseif \reg == %rbp + .set dreg, %ebp + .elseif \reg == %rsi + .set dreg, %esi + .elseif \reg == %rdi + .set dreg, %edi + .else + .error "Invalid register '\reg\()' while expanding macro 'dwordreg\()'" + .endif +.endm + +.macro wordreg reg + .if \reg == %r8 || \reg == %r9 || \reg == %r10 || \reg == %r11 || \reg == %r12 || \reg == %r13 || \reg == %r14 || \reg == %r15 + .set wreg, \reg\()w + .elseif \reg == %rax + .set wreg, %ax + .elseif \reg == %rcx + .set wreg, %cx + .elseif \reg == %rdx + .set wreg, %dx + .elseif \reg == %rbx + .set wreg, %bx + .elseif \reg == %rsp + .set wreg, %sp + .elseif \reg == %rbp + .set wreg, %bp + .elseif \reg == %rsi + .set wreg, %si + .elseif \reg == %rdi + .set wreg, %di + .else + .error "Invalid register '\reg\()' while expanding macro 'wordreg\()'" + .endif +.endm + + +.macro bytereg reg + .if \reg == %r8 || \reg == %r9 || \reg == %r10 || \reg == %r11 || \reg == %r12 || \reg == %r13 || \reg == %r14 || \reg == %r15 + .set breg, \reg\()b + .elseif \reg == %rax + .set breg, %al + .elseif \reg == %rcx + .set breg, %cl + .elseif \reg == %rdx + .set breg, %dl + .elseif \reg == %rbx + .set breg, %bl + .elseif \reg == rsp + .set breg, %spl + .elseif \reg == %rbp + .set breg, %bpl + .elseif \reg == rsi + .set breg, %sil + .elseif \reg == rdi + .set breg, %dil + .else + .error "Invalid register '\reg\()' while expanding macro 'bytereg\()'" + .endif +.endm + +// clang compat: Below won't owrk with clang; do it a bit different +// #define ZERO_TO_THIRTYONE \ +// 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, \ +// 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + +// .macro xword reg +// .irep i, ZERO_TO_THIRTYONE +// .if \reg == %xmm\i || \reg == %ymm\i || 
\reg == %zmm\i +// .set xmmreg, %xmm\i +// .endif +// .endr +// .endm + +// .macro yword reg +// .irep i, ZERO_TO_THIRTYONE +// .if \reg == %xmm\i || \reg == %ymm\i || \reg == %zmm\i +// .set ymmreg, %ymm\i +// .endif +// .endr +// .endm + +// .macro zword reg +// .irep i, ZERO_TO_THIRTYONE +// .if \reg == %xmm\i || \reg == %ymm\i || \reg == %zmm\i +// .set zmmreg, %zmm\i +// .endif +// .endr +// .endm + +// Example usage: +// xword %zmm12 +// pxor xmmreg, xmmreg // => pxor %xmm12, %xmm12 +.macro xword reg + .set i, 0 + .rep 32 + .altmacro + do_xyzword <\reg>, xmm, %i + .noaltmacro + .set i, (i+1) + .endr +.endm + +.macro yword reg + .set i, 0 + .rep 32 + .altmacro + do_xyzword <\reg>, ymm, %i + .noaltmacro + .set i, (i+1) + .endr +.endm + +.macro zword reg + .set i, 0 + .rep 32 + .altmacro + do_xyzword <\reg>, zmm, %i + .noaltmacro + .set i, (i+1) + .endr +.endm + +.macro do_xyzword creg, prfx, idx + .if \creg == %xmm\idx || \creg == %ymm\idx || \creg == %zmm\idx + .set \prfx\()reg, %\prfx\idx + .endif +.endm + + +// FIXME: handle later +#define elf32 1 +#define elf64 2 +#define win64 3 +#define machos64 4 + +#ifndef __OUTPUT_FORMAT__ +#define __OUTPUT_FORMAT__ elf64 +#endif + +#if __OUTPUT_FORMAT__ == elf32 +.section .note.GNU-stack,"",%progbits +.section .text +#endif +#if __OUTPUT_FORMAT__ == elf64 +#ifndef __x86_64__ +#define __x86_64__ +#endif +.section .note.GNU-stack,"",%progbits +.section .text +#endif +#if __OUTPUT_FORMAT__ == win64 +#define __x86_64__ +#endif +#if __OUTPUT_FORMAT__ == macho64 +#define __x86_64__ +#endif + + +#ifdef __x86_64__ +#define endbranch .byte 0xf3, 0x0f, 0x1e, 0xfa +#else +#define endbranch .byte 0xf3, 0x0f, 0x1e, 0xfb +#endif + +#ifdef REL_TEXT +#define WRT_OPT +#elif __OUTPUT_FORMAT__ == elf64 +#define WRT_OPT wrt ..plt +#else +#define WRT_OPT +#endif + +#endif // ifndef _REG_SIZES_ASM_ From de13d7cd75869df6375da318b53fad36aa644bdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?= Date: Fri, 10 Feb 2023 00:09:09 +0100 Subject: [PATCH 2/2] ICP: AES_GCM: Add sse4 asm routines, first stab MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add asm_linkage.h and .cfi use macros for push and pop add gcm192 support add gcm192 support adapt to gcm_ctx_t offsets adapt to gcm_ctx_t keysched and htab adapt to gcm_ctx_t keysched and htab adapt to gcm_ctx_t keysched and htab integrate to build sys builds, next is the fun part: debugging passes cursory SSE/AVX cross testing various cleanup cstyle cleanup avoid triggering meaningless asserts cmn_err implies newline oc there are bugs in the debugging code as well fix merge error update moved gcm_clear_ctx() minor comment cleanup Signed-off-by: Attila Fülöp --- Makefile.am | 2 + lib/libicp/Makefile.am | 3 + module/Kbuild.in | 5 +- module/icp/algs/modes/gcm.c | 1004 +++++++++++++++-- module/icp/algs/modes/modes.c | 18 +- .../icp/asm-x86_64/modes/isalc_gcm128_sse.S | 7 +- .../icp/asm-x86_64/modes/isalc_gcm192_sse.S | 36 + .../icp/asm-x86_64/modes/isalc_gcm256_sse.S | 7 +- .../icp/asm-x86_64/modes/isalc_gcm_defines.S | 193 +++- module/icp/asm-x86_64/modes/isalc_gcm_sse.S | 759 ++++++------- module/icp/asm-x86_64/modes/isalc_reg_sizes.S | 13 +- module/icp/include/modes/modes.h | 57 +- module/icp/io/aes.c | 1 - 14 files changed, 1529 insertions(+), 577 deletions(-) create mode 100644 module/icp/asm-x86_64/modes/isalc_gcm192_sse.S diff --git a/Makefile.am b/Makefile.am index 11e45dae8255..1fb636972566 100644 --- a/Makefile.am +++ b/Makefile.am @@ -51,6 +51,8 @@ 
dist_noinst_DATA += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl dist_noinst_DATA += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip dist_noinst_DATA += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams dist_noinst_DATA += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip +dist_noinst_DATA += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.intel +dist_noinst_DATA += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.intel.descrip dist_noinst_DATA += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl dist_noinst_DATA += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip dist_noinst_DATA += module/os/linux/spl/THIRDPARTYLICENSE.gplv2 diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index 4ba55b2158bc..0c9994c3a7b2 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -74,6 +74,9 @@ nodist_libicp_la_SOURCES += \ module/icp/asm-x86_64/modes/gcm_pclmulqdq.S \ module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S \ module/icp/asm-x86_64/modes/ghash-x86_64.S \ + module/icp/asm-x86_64/modes/isalc_gcm128_sse.S \ + module/icp/asm-x86_64/modes/isalc_gcm192_sse.S \ + module/icp/asm-x86_64/modes/isalc_gcm256_sse.S \ module/icp/asm-x86_64/sha2/sha256-x86_64.S \ module/icp/asm-x86_64/sha2/sha512-x86_64.S \ module/icp/asm-x86_64/blake3/blake3_avx2.S \ diff --git a/module/Kbuild.in b/module/Kbuild.in index 8d29f56c2fb8..cbfce110d322 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -125,7 +125,10 @@ ICP_OBJS_X86_64 := \ asm-x86_64/sha2/sha512-x86_64.o \ asm-x86_64/modes/aesni-gcm-x86_64.o \ asm-x86_64/modes/gcm_pclmulqdq.o \ - asm-x86_64/modes/ghash-x86_64.o + asm-x86_64/modes/ghash-x86_64.o \ + asm-x86_64/modes/isalc_gcm128_sse.o \ + asm-x86_64/modes/isalc_gcm192_sse.o \ + asm-x86_64/modes/isalc_gcm256_sse.o \ ICP_OBJS_X86 := \ algs/aes/aes_impl_aesni.o \ diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c index dd8db6f97460..f6ceb49fb393 100644 --- a/module/icp/algs/modes/gcm.c +++ b/module/icp/algs/modes/gcm.c @@ -35,7 +35,13 @@ #include #endif -#define GHASH(c, d, t, o) \ +#ifdef DEBUG_GCM_ASM +/* Can't attach to inline funcs with bpftrace */ +#undef inline +#define inline __attribute__((__noinline__)) +#endif + +#define GHASH(c, d, t, o) \ xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \ (uint64_t *)(void *)(t)); @@ -43,9 +49,14 @@ /* Select GCM implementation */ #define IMPL_FASTEST (UINT32_MAX) #define IMPL_CYCLE (UINT32_MAX-1) -#ifdef CAN_USE_GCM_ASM +#ifdef CAN_USE_GCM_ASM_AVX #define IMPL_AVX (UINT32_MAX-2) #endif +#ifdef CAN_USE_GCM_ASM_SSE +#define IMPL_SSE4_1 (UINT32_MAX-3) +#endif +/* TODO: add AVX2, VAES */ + #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) static uint32_t icp_gcm_impl = IMPL_FASTEST; static uint32_t user_sel_impl = IMPL_FASTEST; @@ -55,30 +66,269 @@ static inline int gcm_init_ctx_impl(boolean_t, gcm_ctx_t *, char *, size_t, void (*)(uint8_t *, uint8_t *), void (*)(uint8_t *, uint8_t *)); +/* TODO: move below to seperate header (gcm_simd.h) ? */ #ifdef CAN_USE_GCM_ASM +#ifdef CAN_USE_GCM_ASM_AVX /* Does the architecture we run on support the MOVBE instruction? */ boolean_t gcm_avx_can_use_movbe = B_FALSE; +extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *); +#endif /* - * Whether to use the optimized openssl gcm and ghash implementations. - * Set to true if module parameter icp_gcm_impl == "avx". + * Which optimized gcm SIMD assembly implementations to use. 
+ * Set to the SIMD implementation contained in icp_gcm_impl unless it's + * IMPL_CYCLE or IMPL_FASTEST. For IMPL_CYCLE we cycle through all available + * SIMD implementations on each call to gcm_init_ctx. For IMPL_FASTEST we set + * it to the fastest supported SIMD implementation. gcm_init__ctx() uses + * this to decide which SIMD implementation to use. */ -static boolean_t gcm_use_avx = B_FALSE; -#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) +static gcm_simd_impl_t gcm_simd_impl = GSI_NONE; +#define GCM_SIMD_IMPL_READ (*(volatile gcm_simd_impl_t *)&gcm_simd_impl) + +static inline void gcm_set_simd_impl(gcm_simd_impl_t); +static inline gcm_simd_impl_t gcm_cycle_simd_impl(void); +static inline size_t gcm_simd_get_htab_size(gcm_simd_impl_t); +static inline int get_isalc_gcm_keylen_index(const gcm_ctx_t *ctx); +static inline int get_isalc_gcm_impl_index(const gcm_ctx_t *ctx); + +/* TODO: move later */ + +extern void ASMABI icp_isalc_gcm_precomp_128_sse(gcm_ctx_t *ctx); +extern void ASMABI icp_isalc_gcm_precomp_192_sse(gcm_ctx_t *ctx); +extern void ASMABI icp_isalc_gcm_precomp_256_sse(gcm_ctx_t *ctx); +typedef void ASMABI (*isalc_gcm_precomp_fp)(gcm_ctx_t *); + +extern void ASMABI icp_isalc_gcm_init_128_sse(gcm_ctx_t *ctx, const uint8_t *iv, + const uint8_t *aad, uint64_t aad_len, uint64_t tag_len); +extern void ASMABI icp_isalc_gcm_init_192_sse(gcm_ctx_t *ctx, const uint8_t *iv, + const uint8_t *aad, uint64_t aad_len, uint64_t tag_len); +extern void ASMABI icp_isalc_gcm_init_256_sse(gcm_ctx_t *ctx, const uint8_t *iv, + const uint8_t *aad, uint64_t aad_len, uint64_t tag_len); +typedef void ASMABI (*isalc_gcm_init_fp)(gcm_ctx_t *, const uint8_t *, + const uint8_t *, uint64_t, uint64_t); + +extern void ASMABI icp_isalc_gcm_enc_128_update_sse(gcm_ctx_t *ctx, + uint8_t *out, const uint8_t *in, uint64_t plaintext_len); +extern void ASMABI icp_isalc_gcm_enc_192_update_sse(gcm_ctx_t *ctx, + uint8_t *out, const uint8_t *in, uint64_t plaintext_len); +extern void ASMABI icp_isalc_gcm_enc_256_update_sse(gcm_ctx_t *ctx, + uint8_t *out, const uint8_t *in, uint64_t plaintext_len); +typedef void ASMABI (*isalc_gcm_enc_update_fp)(gcm_ctx_t *, uint8_t *, + const uint8_t *, uint64_t); + +extern void ASMABI icp_isalc_gcm_dec_128_update_sse(gcm_ctx_t *ctx, + uint8_t *out, const uint8_t *in, uint64_t plaintext_len); +extern void ASMABI icp_isalc_gcm_dec_192_update_sse(gcm_ctx_t *ctx, + uint8_t *out, const uint8_t *in, uint64_t plaintext_len); +extern void ASMABI icp_isalc_gcm_dec_256_update_sse(gcm_ctx_t *ctx, + uint8_t *out, const uint8_t *in, uint64_t plaintext_len); +typedef void ASMABI (*isalc_gcm_dec_update_fp)(gcm_ctx_t *, uint8_t *, + const uint8_t *, uint64_t); + +extern void ASMABI icp_isalc_gcm_enc_128_finalize_sse(gcm_ctx_t *ctx); +extern void ASMABI icp_isalc_gcm_enc_192_finalize_sse(gcm_ctx_t *ctx); +extern void ASMABI icp_isalc_gcm_enc_256_finalize_sse(gcm_ctx_t *ctx); +typedef void ASMABI (*isalc_gcm_enc_finalize_fp)(gcm_ctx_t *); + +extern void ASMABI icp_isalc_gcm_dec_128_finalize_sse(gcm_ctx_t *ctx); +extern void ASMABI icp_isalc_gcm_dec_192_finalize_sse(gcm_ctx_t *ctx); +extern void ASMABI icp_isalc_gcm_dec_256_finalize_sse(gcm_ctx_t *ctx); +typedef void ASMABI (*isalc_gcm_dec_finalize_fp)(gcm_ctx_t *); + +extern void ASMABI icp_isalc_gcm_enc_128_sse(gcm_ctx_t *ctx, uint8_t *out, + const uint8_t *in, uint64_t plaintext_len, const uint8_t *iv, + const uint8_t *aad, uint64_t aad_len, uint64_t tag_len); +extern void ASMABI icp_isalc_gcm_enc_192_sse(gcm_ctx_t *ctx, uint8_t *out, + 
const uint8_t *in, uint64_t plaintext_len, const uint8_t *iv, + const uint8_t *aad, uint64_t aad_len, uint64_t tag_len); +extern void ASMABI icp_isalc_gcm_enc_256_sse(gcm_ctx_t *ctx, uint8_t *out, + const uint8_t *in, uint64_t plaintext_len, const uint8_t *iv, + const uint8_t *aad, uint64_t aad_len, uint64_t tag_len); +typedef void ASMABI (*isalc_gcm_enc_fp)(gcm_ctx_t *, uint8_t *, const uint8_t *, + uint64_t, const uint8_t *, const uint8_t *, uint64_t, uint64_t); + +extern void ASMABI icp_isalc_gcm_dec_128_sse(gcm_ctx_t *ctx, uint8_t *out, + const uint8_t *in, uint64_t plaintext_len, const uint8_t *iv, + const uint8_t *aad, uint64_t aad_len, uint64_t tag_len); +extern void ASMABI icp_isalc_gcm_dec_192_sse(gcm_ctx_t *ctx, uint8_t *out, + const uint8_t *in, uint64_t plaintext_len, const uint8_t *iv, + const uint8_t *aad, uint64_t aad_len, uint64_t tag_len); +extern void ASMABI icp_isalc_gcm_dec_256_sse(gcm_ctx_t *ctx, uint8_t *out, + const uint8_t *in, uint64_t plaintext_len, const uint8_t *iv, + const uint8_t *aad, uint64_t aad_len, uint64_t tag_len); +typedef void ASMABI (*isalc_gcm_dec_fp)(gcm_ctx_t *, uint8_t *, const uint8_t *, + uint64_t, const uint8_t *, const uint8_t *, uint64_t, uint64_t); + +/* struct isalc_ops holds arrays for all isalc asm functions ... */ +typedef struct isalc_gcm_ops { + isalc_gcm_precomp_fp igo_precomp[GSI_ISALC_NUM_IMPL][3]; + isalc_gcm_init_fp igo_init[GSI_ISALC_NUM_IMPL][3]; + isalc_gcm_enc_update_fp igo_enc_update[GSI_ISALC_NUM_IMPL][3]; + isalc_gcm_dec_update_fp igo_dec_update[GSI_ISALC_NUM_IMPL][3]; + isalc_gcm_enc_finalize_fp igo_enc_finalize[GSI_ISALC_NUM_IMPL][3]; + isalc_gcm_dec_finalize_fp igo_dec_finalize[GSI_ISALC_NUM_IMPL][3]; + isalc_gcm_enc_fp igo_enc[GSI_ISALC_NUM_IMPL][3]; + isalc_gcm_dec_fp igo_dec[GSI_ISALC_NUM_IMPL][3]; +} isalc_gcm_ops_t; + +static isalc_gcm_ops_t isalc_ops = { + .igo_precomp = { + [0][0] = icp_isalc_gcm_precomp_128_sse, + [0][1] = icp_isalc_gcm_precomp_192_sse, + [0][2] = icp_isalc_gcm_precomp_256_sse, + /* TODO: add [1][0..2] for AVX2 ... */ + }, + .igo_init = { + [0][0] = icp_isalc_gcm_init_128_sse, + [0][1] = icp_isalc_gcm_init_192_sse, + [0][2] = icp_isalc_gcm_init_256_sse, + /* TODO: add [1][0..2] for AVX2 ... */ + }, + .igo_enc_update = { + [0][0] = icp_isalc_gcm_enc_128_update_sse, + [0][1] = icp_isalc_gcm_enc_192_update_sse, + [0][2] = icp_isalc_gcm_enc_256_update_sse, + /* TODO: add [1][0..2] for AVX2 ... */ + }, + .igo_dec_update = { + [0][0] = icp_isalc_gcm_dec_128_update_sse, + [0][1] = icp_isalc_gcm_dec_192_update_sse, + [0][2] = icp_isalc_gcm_dec_256_update_sse, + /* TODO: add [1][0..2] for AVX2 ... */ + }, + .igo_enc_finalize = { + [0][0] = icp_isalc_gcm_enc_128_finalize_sse, + [0][1] = icp_isalc_gcm_enc_192_finalize_sse, + [0][2] = icp_isalc_gcm_enc_256_finalize_sse, + /* TODO: add [1][0..2] for AVX2 ... */ + }, + .igo_dec_finalize = { + [0][0] = icp_isalc_gcm_dec_128_finalize_sse, + [0][1] = icp_isalc_gcm_dec_192_finalize_sse, + [0][2] = icp_isalc_gcm_dec_256_finalize_sse, + /* TODO: add [1][0..2] for AVX2 ... */ + }, + .igo_enc = { + [0][0] = icp_isalc_gcm_enc_128_sse, + [0][1] = icp_isalc_gcm_enc_192_sse, + [0][2] = icp_isalc_gcm_enc_256_sse, + /* TODO: add [1][0..2] for AVX2 ... */ + }, + .igo_dec = { + [0][0] = icp_isalc_gcm_dec_128_sse, + [0][1] = icp_isalc_gcm_dec_192_sse, + [0][2] = icp_isalc_gcm_dec_256_sse, + /* TODO: add [1][0..2] for AVX2 ... */ + } +}; -extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *); +/* + * Return B_TRUE if impl is a isalc implementation. 
+ */ +static inline boolean_t +is_isalc_impl(gcm_simd_impl_t impl) +{ + int i = (int)impl; -static inline boolean_t gcm_avx_will_work(void); -static inline void gcm_set_avx(boolean_t); -static inline boolean_t gcm_toggle_avx(void); -static inline size_t gcm_simd_get_htab_size(boolean_t); + if (i >= GSI_ISALC_FIRST_IMPL && i <= GSI_ISALC_LAST_IMPL) { + return (B_TRUE); + } else { + return (B_FALSE); + } +} + +/* + * Get the index into the isalc function pointer array for the different + * SIMD (SSE, AVX2, VAES) isalc implementations. + */ +static inline int +get_isalc_gcm_impl_index(const gcm_ctx_t *ctx) +{ + gcm_simd_impl_t impl = ctx->gcm_simd_impl; + int index = (int)impl - GSI_ISALC_FIRST_IMPL; + + ASSERT3S(index, >=, 0); + ASSERT3S(index, <, GSI_ISALC_NUM_IMPL); + + return (index); +} + +/* + * Get the index (0..2) into the isalc function pointer array for the GCM + * key length (128,192,256) the given ctx uses. + */ +static inline int +get_isalc_gcm_keylen_index(const gcm_ctx_t *ctx) +{ + const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; + int aes_rounds = ((aes_key_t *)keysched)->nr; + /* AES uses 10,12,14 rounds for AES-{128,192,256}. */ + int index = (aes_rounds - 10) >> 1; + + ASSERT3S(index, >=, 0); + ASSERT3S(index, <=, 2); + + return (index); +} + +static inline boolean_t gcm_sse_will_work(void); + +#ifdef DEBUG_GCM_ASM +/* + * Call this in gcm_init_ctx before doing anything else. The shadowed ctx + * is stored in ctx->gcm_shadow_ctx. + */ +static __attribute__((__noinline__)) gcm_ctx_t * +gcm_duplicate_ctx(gcm_ctx_t *ctx) +{ + ASSERT3P(ctx->gcm_pt_buf, ==, NULL); + ASSERT3P(ctx->gcm_shadow_ctx, ==, NULL); /* No nested ctxs allowed. */ + + gcm_ctx_t *new_ctx; + size_t sz = sizeof (gcm_ctx_t); + + if ((new_ctx = kmem_zalloc(sz, KM_SLEEP)) == NULL) + return (NULL); + + (void) memcpy(new_ctx, ctx, sz); + new_ctx->gcm_simd_impl = DEBUG_GCM_ASM; + size_t htab_len = gcm_simd_get_htab_size(new_ctx->gcm_simd_impl); + if (htab_len == 0) { + kmem_free(new_ctx, sz); + return (NULL); + } + new_ctx->gcm_htab_len = htab_len; + new_ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP); + if (new_ctx->gcm_Htable == NULL) { + kmem_free(new_ctx, sz); + return (NULL); + } + new_ctx->gcm_is_shadow = B_TRUE; -static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, - crypto_data_t *, size_t); + ctx->gcm_shadow_ctx = new_ctx; + return (new_ctx); +} +#endif /* ifndef DEBUG_GCM_ASM */ + +static inline void gcm_init_isalc(gcm_ctx_t *, const uint8_t *, size_t, + const uint8_t *, size_t); + +static inline int gcm_mode_encrypt_contiguous_blocks_isalc(gcm_ctx_t *, + const uint8_t *, size_t, crypto_data_t *); + +static inline int gcm_encrypt_final_isalc(gcm_ctx_t *, crypto_data_t *); +static inline int gcm_decrypt_final_isalc(gcm_ctx_t *, crypto_data_t *); + +#ifdef CAN_USE_GCM_ASM_AVX +static inline boolean_t gcm_avx_will_work(void); +static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, const uint8_t *, + size_t, crypto_data_t *, size_t); static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); -static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *, +static void gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *, size_t, size_t); + +#endif /* ifdef CAN_USE_GCM_ASM_AVX */ #endif /* ifdef CAN_USE_GCM_ASM */ /* @@ -93,11 +343,19 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, void (*xor_block)(uint8_t *, 
uint8_t *)) { #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) + if (is_isalc_impl(ctx->gcm_simd_impl) == B_TRUE) + return (gcm_mode_encrypt_contiguous_blocks_isalc( + ctx, (const uint8_t *)data, length, out)); + +#ifdef CAN_USE_GCM_ASM_AVX + if (ctx->gcm_simd_impl == GSI_OSSL_AVX) return (gcm_mode_encrypt_contiguous_blocks_avx( - ctx, data, length, out, block_size)); + ctx, (const uint8_t *)data, length, out, block_size)); #endif + ASSERT3S(ctx->gcm_simd_impl, ==, GSI_NONE); +#endif /* ifdef CAN_USE_GCM_ASM */ + const gcm_impl_ops_t *gops; size_t remainder = length; size_t need = 0; @@ -211,11 +469,19 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, void (*xor_block)(uint8_t *, uint8_t *)) { (void) copy_block; + #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) + if (is_isalc_impl(ctx->gcm_simd_impl) == B_TRUE) + return (gcm_encrypt_final_isalc(ctx, out)); + +#ifdef CAN_USE_GCM_ASM_AVX + if (ctx->gcm_simd_impl == GSI_OSSL_AVX) return (gcm_encrypt_final_avx(ctx, out, block_size)); #endif + ASSERT3S(ctx->gcm_simd_impl, ==, GSI_NONE); +#endif /* ifdef CAN_USE_GCM_ASM */ + const gcm_impl_ops_t *gops; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); uint8_t *ghash, *macp = NULL; @@ -367,8 +633,8 @@ gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, length); ctx->gcm_processed_data_len += length; } - ctx->gcm_remainder_len = 0; + return (CRYPTO_SUCCESS); } @@ -378,10 +644,17 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) + if (is_isalc_impl(ctx->gcm_simd_impl) == B_TRUE) + return (gcm_decrypt_final_isalc(ctx, out)); + +#ifdef CAN_USE_GCM_ASM_AVX + if (ctx->gcm_simd_impl == GSI_OSSL_AVX) return (gcm_decrypt_final_avx(ctx, out, block_size)); #endif + ASSERT3S(ctx->gcm_simd_impl, ==, GSI_NONE); +#endif /* ifdef CAN_USE_GCM_ASM */ + const gcm_impl_ops_t *gops; size_t pt_len; size_t remainder; @@ -622,6 +895,7 @@ gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, * Init the GCM context struct. Handle the cycle and avx implementations here. * Initialization of a GMAC context differs slightly from a GCM context. */ +/* XXXX: inline __attribute__((__always_inline__) ??? */ static inline int gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, @@ -629,6 +903,7 @@ gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param, void (*xor_block)(uint8_t *, uint8_t *)) { CK_AES_GCM_PARAMS *gcm_param; + boolean_t can_use_isalc = B_TRUE; int rv = CRYPTO_SUCCESS; size_t tag_len, iv_len; @@ -640,23 +915,32 @@ gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param, if ((rv = gcm_validate_args(gcm_param)) != 0) { return (rv); } + /* XXXX: redundant? already done in gcm_alloc_ctx */ gcm_ctx->gcm_flags |= GCM_MODE; - + /* + * The isalc implementations do not support a IV lenght + * other than 12 bytes and only 8, 12 and 16 bytes tag + * length. + */ size_t tbits = gcm_param->ulTagBits; + if (gcm_param->ulIvLen != 12 || + (tbits != 64 && tbits != 96 && tbits != 128)) { + can_use_isalc = B_FALSE; + } tag_len = CRYPTO_BITS2BYTES(tbits); iv_len = gcm_param->ulIvLen; } else { /* GMAC mode. */ + ASSERT3U(AES_GMAC_TAG_BITS, ==, 128); + ASSERT3U(AES_GMAC_IV_LEN, ==, 12); + + /* XXXX: redundant? 
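
The parameter restriction enforced in gcm_init_ctx_impl above can be summarized in one predicate; a minimal sketch, assuming a hypothetical helper name that is not part of the patch:

/*
 * Illustrative only: the ported isa-l_crypto routines accept a 96-bit IV
 * and 64-, 96- or 128-bit tags; anything else falls back to GSI_NONE.
 */
static boolean_t
isalc_params_supported(size_t iv_len, size_t tag_bits)
{
        return (iv_len == 12 &&
            (tag_bits == 64 || tag_bits == 96 || tag_bits == 128));
}
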
already done in gmac_alloc_ctx */ gcm_ctx->gcm_flags |= GMAC_MODE; tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS); iv_len = AES_GMAC_IV_LEN; } - gcm_ctx->gcm_tag_len = tag_len; gcm_ctx->gcm_processed_data_len = 0; - - /* these values are in bits */ - gcm_ctx->gcm_len_a_len_c[0] - = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen)); + gcm_ctx->gcm_tag_len = tag_len; } else { return (CRYPTO_MECHANISM_PARAM_INVALID); } @@ -670,40 +954,46 @@ gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param, ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap; if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { - gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; + gcm_ctx->gcm_simd_impl = GCM_SIMD_IMPL_READ; } else { /* - * Handle the "cycle" implementation by creating avx and - * non-avx contexts alternately. + * Handle the "cycle" implementation by cycling through all + * supported SIMD implementation. This can only be done once + * per context since they differ in requirements. */ - gcm_ctx->gcm_use_avx = gcm_toggle_avx(); - - /* The avx impl. doesn't handle byte swapped key schedules. */ - if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) { - gcm_ctx->gcm_use_avx = B_FALSE; + gcm_ctx->gcm_simd_impl = gcm_cycle_simd_impl(); + /* + * We don't handle byte swapped key schedules in the SIMD + * code paths. + */ + aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; + if (ks->ops->needs_byteswap == B_TRUE) { + gcm_ctx->gcm_simd_impl = GSI_NONE; } +#ifdef CAN_USE_GCM_ASM_AVX /* * If this is a GCM context, use the MOVBE and the BSWAP * variants alternately. GMAC contexts code paths do not * use the MOVBE instruction. */ - if (gcm_ctx->gcm_use_avx == B_TRUE && gmac_mode == B_FALSE && - zfs_movbe_available() == B_TRUE) { + if (gcm_ctx->gcm_simd_impl == GSI_OSSL_AVX && + gmac_mode == B_FALSE && zfs_movbe_available() == B_TRUE) { (void) atomic_toggle_boolean_nv( (volatile boolean_t *)&gcm_avx_can_use_movbe); } +#endif } /* - * We don't handle byte swapped key schedules in the avx code path, + * We don't handle byte swapped key schedules in the SIMD code paths, * still they could be created by the aes generic implementation. * Make sure not to use them since we'll corrupt data if we do. */ - if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) { - gcm_ctx->gcm_use_avx = B_FALSE; + if (gcm_ctx->gcm_simd_impl != GSI_NONE && needs_bswap == B_TRUE) { + gcm_ctx->gcm_simd_impl = GSI_NONE; cmn_err_once(CE_WARN, "ICP: Can't use the aes generic or cycle implementations " - "in combination with the gcm avx implementation!"); + "in combination with the gcm SIMD implementations!"); cmn_err_once(CE_WARN, "ICP: Falling back to a compatible implementation, " "aes-gcm performance will likely be degraded."); @@ -711,10 +1001,17 @@ gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param, "ICP: Choose at least the x86_64 aes implementation to " "restore performance."); } - + /* + * Only use isalc if the given IV and tag lengths match what we support. + * This will almost always be the case. + */ + if (can_use_isalc == B_FALSE && is_isalc_impl(gcm_ctx->gcm_simd_impl)) { + gcm_ctx->gcm_simd_impl = GSI_NONE; + } /* Allocate Htab memory as needed. 
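
For readers skimming the hunk above, the per-context selection boils down to a short precedence chain. A condensed sketch, where the function name and parameters are illustrative and the called helpers are the ones introduced by this patch:

static gcm_simd_impl_t
select_simd_impl(gcm_simd_impl_t configured, boolean_t cycling,
    boolean_t key_is_byteswapped, boolean_t isalc_params_ok)
{
        gcm_simd_impl_t impl = cycling ? gcm_cycle_simd_impl() : configured;

        if (key_is_byteswapped)
                impl = GSI_NONE;        /* SIMD paths need native byte order. */
        if (is_isalc_impl(impl) && !isalc_params_ok)
                impl = GSI_NONE;        /* isalc: 12-byte IV, 8/12/16-byte tag. */
        return (impl);
}

Htable memory is then allocated only for a result other than GSI_NONE, as the hunk continues below.
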
*/ - if (gcm_ctx->gcm_use_avx == B_TRUE) { - size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); + if (gcm_ctx->gcm_simd_impl != GSI_NONE) { + size_t htab_len = + gcm_simd_get_htab_size(gcm_ctx->gcm_simd_impl); if (htab_len == 0) { return (CRYPTO_MECHANISM_PARAM_INVALID); @@ -727,20 +1024,31 @@ gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param, return (CRYPTO_HOST_MEMORY); } } - /* Avx and non avx context initialization differs from here on. */ - if (gcm_ctx->gcm_use_avx == B_FALSE) { + /* Avx and non avx context initialization differ from here on. */ + if (gcm_ctx->gcm_simd_impl == GSI_NONE) { #endif /* ifdef CAN_USE_GCM_ASM */ + /* these values are in bits */ + gcm_ctx->gcm_len_a_len_c[0] = + htonll(CRYPTO_BYTES2BITS(aad_len)); + if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size, encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } #ifdef CAN_USE_GCM_ASM - } else { - if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len, - block_size) != CRYPTO_SUCCESS) { - rv = CRYPTO_MECHANISM_PARAM_INVALID; - } } + if (is_isalc_impl(gcm_ctx->gcm_simd_impl) == B_TRUE) { + gcm_init_isalc(gcm_ctx, iv, iv_len, aad, aad_len); + } +#ifdef CAN_USE_GCM_ASM_AVX + if (gcm_ctx->gcm_simd_impl == GSI_OSSL_AVX) { + /* these values are in bits */ + gcm_ctx->gcm_len_a_len_c[0] = + htonll(CRYPTO_BYTES2BITS(aad_len)); + + gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len, block_size); + } +#endif /* ifdef CAN_USE_GCM_ASM_AVX */ #endif /* ifdef CAN_USE_GCM_ASM */ return (rv); @@ -876,21 +1184,34 @@ gcm_impl_init(void) strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX); #ifdef CAN_USE_GCM_ASM + /* Statically select the fastest SIMD implementation: (AVX > SSE). */ + /* TODO: Use a benchmark like other SIMD implementations do. */ + gcm_simd_impl_t fastest_simd = GSI_NONE; + + if (gcm_sse_will_work()) { + fastest_simd = GSI_ISALC_SSE; + } + +#ifdef CAN_USE_GCM_ASM_AVX /* * Use the avx implementation if it's available and the implementation * hasn't changed from its default value of fastest on module load. */ if (gcm_avx_will_work()) { + fastest_simd = GSI_OSSL_AVX; #ifdef HAVE_MOVBE if (zfs_movbe_available() == B_TRUE) { atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE); } -#endif - if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { - gcm_set_avx(B_TRUE); - } +#endif /* ifdef HAVE_MOVBE */ } -#endif +#endif /* CAN_USE_GCM_ASM_AVX */ + + if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { + gcm_set_simd_impl(fastest_simd); + } +#endif /* ifdef CAN_USE_GCM_ASM */ + /* Finish initialization */ atomic_swap_32(&icp_gcm_impl, user_sel_impl); gcm_impl_initialized = B_TRUE; @@ -902,9 +1223,12 @@ static const struct { } gcm_impl_opts[] = { { "cycle", IMPL_CYCLE }, { "fastest", IMPL_FASTEST }, -#ifdef CAN_USE_GCM_ASM +#ifdef CAN_USE_GCM_ASM_AVX { "avx", IMPL_AVX }, #endif +#ifdef CAN_USE_GCM_ASM + { "sse4_1", IMPL_SSE4_1 }, +#endif }; /* @@ -934,16 +1258,24 @@ gcm_impl_set(const char *val) strlcpy(req_name, val, GCM_IMPL_NAME_MAX); while (i > 0 && isspace(req_name[i-1])) i--; + req_name[i] = '\0'; /* Check mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM + /* Ignore sse implementation if it won't work. */ + if (gcm_impl_opts[i].sel == IMPL_SSE4_1 && + !gcm_sse_will_work()) { + continue; + } +#ifdef CAN_USE_GCM_ASM_AVX /* Ignore avx implementation if it won't work. 
*/ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } -#endif +#endif /* ifdef CAN_USE_GCM_ASM_AVX */ +#endif /* ifdef CAN_USE_GCM_ASM */ if (strcmp(req_name, gcm_impl_opts[i].name) == 0) { impl = gcm_impl_opts[i].sel; err = 0; @@ -964,16 +1296,23 @@ gcm_impl_set(const char *val) } #ifdef CAN_USE_GCM_ASM /* - * Use the avx implementation if available and the requested one is - * avx or fastest. + * Use the requested SIMD implementation if available. + * If the requested one is fastest, use the fastest SIMD impl. */ + gcm_simd_impl_t simd_impl = GSI_NONE; + + if (gcm_sse_will_work() == B_TRUE && + (impl == IMPL_SSE4_1 || impl == IMPL_FASTEST)) { + simd_impl = GSI_ISALC_SSE; + } +#ifdef CAN_USE_GCM_ASM_AVX if (gcm_avx_will_work() == B_TRUE && (impl == IMPL_AVX || impl == IMPL_FASTEST)) { - gcm_set_avx(B_TRUE); - } else { - gcm_set_avx(B_FALSE); + simd_impl = GSI_OSSL_AVX; } -#endif +#endif /* ifdef CAN_USE_GCM_ASM_AVX */ + gcm_set_simd_impl(simd_impl); +#endif /* ifdef CAN_USE_GCM_ASM */ if (err == 0) { if (gcm_impl_initialized) @@ -1005,11 +1344,17 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) /* list mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM + if (gcm_impl_opts[i].sel == IMPL_SSE4_1 && + !gcm_sse_will_work()) { + continue; + } +#ifdef CAN_USE_GCM_ASM_AVX /* Ignore avx implementation if it won't work. */ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } -#endif +#endif /* ifdef CAN_USE_GCM_ASM_AVX */ +#endif /* ifdef CAN_USE_GCM_ASM */ fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s "; cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, gcm_impl_opts[i].name); @@ -1028,10 +1373,122 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get, NULL, 0644); MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); -#endif /* defined(__KERNEL) */ +#endif /* defined(__KERNEL) && defined(__linux__) */ + #ifdef CAN_USE_GCM_ASM + +static inline boolean_t +gcm_sse_will_work(void) +{ + /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */ + return (kfpu_allowed() && + zfs_sse4_1_available() && zfs_aes_available() && + zfs_pclmulqdq_available()); +} + +static inline size_t +gcm_simd_get_htab_size(gcm_simd_impl_t simd_mode) +{ + switch (simd_mode) { + case GSI_NONE: + return (0); + break; + case GSI_OSSL_AVX: + return (2 * 6 * 2 * sizeof (uint64_t)); + break; + case GSI_ISALC_SSE: + return (2 * 8 * 2 * sizeof (uint64_t)); + break; + default: +#ifdef _KERNEL + cmn_err(CE_WARN, "Undefined simd_mode %d!", (int)simd_mode); +#endif + return (0); + } +} + +/* TODO: it's an enum now: adapt */ +static inline void +gcm_set_simd_impl(gcm_simd_impl_t val) +{ + atomic_swap_32(&gcm_simd_impl, val); +} + +/* + * Cycle through all supported SIMD implementations, used by IMPL_CYCLE. + * The cycle must be done atomically since multiple threads may try to do it + * concurrently. So we do a atomic compare and swap for each possible value, + * trying n_tries times to cycle the value. + * + * Please note that since higher level SIMD instruction sets include the lower + * level ones, the code for newer ones must be placed at the top of this + * function. 
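
The two table sizes returned by gcm_simd_get_htab_size above are easier to audit when written in terms of 16-byte entries; a sketch of the same arithmetic, with an illustrative helper name:

static size_t
htab_bytes(unsigned nentries)
{
        return (nentries * 2 * sizeof (uint64_t));      /* 16 bytes per entry */
}
/*
 * htab_bytes(12) == 192 for GSI_OSSL_AVX (the 6x aggregated openssl code),
 * htab_bytes(16) == 256 for GSI_ISALC_SSE (HashKey^1..^8 plus the eight
 * Karatsuba XOR terms stored by PRECOMPUTE further down).
 */
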
+ */ +static inline gcm_simd_impl_t +gcm_cycle_simd_impl(void) +{ + int n_tries = 10; + + /* TODO: Add here vaes and avx2 with vaes beeing top most */ + +#ifdef CAN_USE_GCM_ASM_AVX + if (gcm_avx_will_work() == B_TRUE) { + for (int i = 0; i < n_tries; ++i) { + if (atomic_cas_32(&GCM_SIMD_IMPL_READ, + GSI_NONE, GSI_ISALC_SSE) == GSI_NONE) + return (GSI_ISALC_SSE); + + if (atomic_cas_32(&GCM_SIMD_IMPL_READ, + GSI_ISALC_SSE, GSI_OSSL_AVX) == GSI_ISALC_SSE) + return (GSI_OSSL_AVX); + + if (atomic_cas_32(&GCM_SIMD_IMPL_READ, + GSI_OSSL_AVX, GSI_NONE) == GSI_OSSL_AVX) + return (GSI_NONE); + } + /* We failed to cycle, return current value. */ + return (GCM_SIMD_IMPL_READ); + } +#endif +#ifdef CAN_USE_GCM_ASM_SSE + if (gcm_sse_will_work() == B_TRUE) { + for (int i = 0; i < n_tries; ++i) { + if (atomic_cas_32(&GCM_SIMD_IMPL_READ, + GSI_NONE, GSI_ISALC_SSE) == GSI_NONE) + return (GSI_ISALC_SSE); + + if (atomic_cas_32(&GCM_SIMD_IMPL_READ, + GSI_ISALC_SSE, GSI_NONE) == GSI_ISALC_SSE) + return (GSI_NONE); + + } + /* We failed to cycle, return current value. */ + return (GCM_SIMD_IMPL_READ); + } +#endif + /* No supported SIMD implementations. */ + return (GSI_NONE); +} + +#define GCM_ISALC_MIN_CHUNK_SIZE 1024 /* 64 16 byte blocks */ +#define GCM_ISALC_MAX_CHUNK_SIZE 1024*1024 /* XXXXXX */ +/* Get the chunk size module parameter. */ +#define GCM_ISALC_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_isalc_chunk_size + +/* + * Module parameter: number of bytes to process at once while owning the FPU. + * Rounded down to the next multiple of 512 bytes and ensured to be greater + * or equal to GCM_ISALC_MIN_CHUNK_SIZE and less or equal to + * GCM_ISALC_MAX_CHUNK_SIZE. It defaults to 32 kiB. + */ +static uint32_t gcm_isalc_chunk_size = 32 * 1024; + + + +#ifdef CAN_USE_GCM_ASM_AVX #define GCM_BLOCK_LEN 16 + /* * The openssl asm routines are 6x aggregated and need that many bytes * at minimum. @@ -1054,7 +1511,7 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) /* Get the chunk size module parameter. */ -#define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size +#define GCM_AVX_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size /* * Module parameter: number of bytes to process at once while owning the FPU. @@ -1079,6 +1536,15 @@ extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); + +/* XXXX: DEBUG: don't disable preemption while debugging */ +#if 0 +#undef kfpu_begin +#undef kfpu_end +#define kfpu_begin() +#define kfpu_end() +#endif + static inline boolean_t gcm_avx_will_work(void) { @@ -1088,37 +1554,6 @@ gcm_avx_will_work(void) zfs_pclmulqdq_available()); } -static inline void -gcm_set_avx(boolean_t val) -{ - if (gcm_avx_will_work() == B_TRUE) { - atomic_swap_32(&gcm_use_avx, val); - } -} - -static inline boolean_t -gcm_toggle_avx(void) -{ - if (gcm_avx_will_work() == B_TRUE) { - return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); - } else { - return (B_FALSE); - } -} - -static inline size_t -gcm_simd_get_htab_size(boolean_t simd_mode) -{ - switch (simd_mode) { - case B_TRUE: - return (2 * 6 * 2 * sizeof (uint64_t)); - - default: - return (0); - } -} - - /* Increment the GCM counter block by n. */ static inline void gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) @@ -1137,14 +1572,14 @@ gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) * if possible. 
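
Regarding GCM_ISALC_CHUNK_SIZE_READ above: the volatile cast forces one fresh load of the tunable, so each operation takes a single snapshot and is not affected by a concurrent write to the module parameter. Typical usage, as in the update functions below:

/* One snapshot per call; later parameter updates only affect new calls. */
size_t chunk_size = (size_t)GCM_ISALC_CHUNK_SIZE_READ;

The setter shown further down rounds the requested value down to a multiple of 512 and rejects anything outside the [GCM_ISALC_MIN_CHUNK_SIZE, GCM_ISALC_MAX_CHUNK_SIZE] range.
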
While processing a chunk the FPU is "locked". */ static int -gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, +gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, const uint8_t *data, size_t length, crypto_data_t *out, size_t block_size) { size_t bleft = length; size_t need = 0; size_t done = 0; uint8_t *datap = (uint8_t *)data; - size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + size_t chunk_size = (size_t)GCM_AVX_CHUNK_SIZE_READ; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); uint64_t *ghash = ctx->gcm_ghash; uint64_t *cb = ctx->gcm_cb; @@ -1276,6 +1711,36 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, out: clear_fpu_regs(); kfpu_end(); + +#ifdef DEBUG_GCM_ASM + if (ctx->gcm_shadow_ctx != NULL) { + gcm_ctx_t *sc = ctx->gcm_shadow_ctx; + + (void) gcm_mode_encrypt_contiguous_blocks_isalc( + sc, data, length, NULL); + + if (ctx->gcm_remainder_len != sc->gcm_remainder_len) { + cmn_err(CE_WARN, + "AVX vs SSE: encrypt: remainder_len differs!"); + } + /* + * Handling of partial GCM blocks differ between AVX and SSE, + * so the tags will not match in this case. + */ + if (ctx->gcm_remainder_len == 0) { + /* Byte swap the SSE tag, it is in host byte order. */ + uint64_t shadow_ghash[2]; + shadow_ghash[0] = htonll(sc->gcm_ghash[1]); + shadow_ghash[1] = htonll(sc->gcm_ghash[0]); + + if (memcmp(ghash, shadow_ghash, ctx->gcm_tag_len)) { + cmn_err(CE_WARN, + "AVX vs SSE: encrypt: tags differ!"); + } + } + } +#endif + out_nofpu: if (ct_buf != NULL) { vmem_free(ct_buf, chunk_size); @@ -1331,6 +1796,15 @@ gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) clear_fpu_regs(); kfpu_end(); +#ifdef DEBUG_GCM_ASM + if (ctx->gcm_shadow_ctx != NULL) { + (void) gcm_encrypt_final_isalc(ctx->gcm_shadow_ctx, NULL); + if (memcmp(ghash, ctx->gcm_shadow_ctx->gcm_ghash, + ctx->gcm_tag_len)) { + cmn_err(CE_WARN, "AVX vs SSE: enc_final: tags differ!"); + } + } +#endif /* Output remainder. */ if (rem_len > 0) { rv = crypto_put_output_data(remainder, out, rem_len); @@ -1359,7 +1833,35 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==, B_FALSE); - size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; +#ifdef DEBUG_GCM_ASM + /* Copy over the plaintext buf to the shadow context. */ + if (ctx->gcm_shadow_ctx != NULL) { + gcm_ctx_t *sc = ctx->gcm_shadow_ctx; + size_t sc_buf_len = ctx->gcm_pt_buf_len; + uint8_t *sc_pt_buf = vmem_alloc(sc_buf_len, KM_SLEEP); + + if (sc_pt_buf != NULL) { + memcpy(sc_pt_buf, ctx->gcm_pt_buf, sc_buf_len); + sc->gcm_pt_buf = sc_pt_buf; + sc->gcm_pt_buf_len = sc_buf_len; + sc->gcm_processed_data_len = sc_buf_len; + /* Not strictly needed, for completeness. */ + sc->gcm_remainder_len = 0; + } else { + /* + * Memory allocation failed, just drop this shadow + * context and leave a note in the log. + */ + gcm_clear_ctx(sc); + kmem_free(sc, sizeof (gcm_ctx_t)); + ctx->gcm_shadow_ctx = NULL; + cmn_err(CE_WARN, + "Failed to alloc pt_buf for shadow context!"); + } + } +#endif /* DEBUG_GCM_ASM */ + + size_t chunk_size = (size_t)GCM_AVX_CHUNK_SIZE_READ; size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; uint8_t *datap = ctx->gcm_pt_buf; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); @@ -1428,6 +1930,7 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) datap += block_size; bleft -= block_size; } + /* TODO: Remove later, we don't set rv up to here. 
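
The bulk path above advances the counter block by the number of blocks handed to the assembler. For reference, GCM counter arithmetic only ever touches the last 32 bits of the counter block, big-endian and wrapping mod 2^32; an illustrative standalone version, which is not the ICP's internal representation (the ICP keeps the block in two uint64_t words):

static void
gcm_ctr_inc32(uint8_t block[16], uint32_t n)
{
        uint32_t ctr = ((uint32_t)block[12] << 24) | ((uint32_t)block[13] << 16) |
            ((uint32_t)block[14] << 8) | (uint32_t)block[15];

        ctr += n;       /* wraps mod 2^32 as required by GCM */
        block[12] = (uint8_t)(ctr >> 24);
        block[13] = (uint8_t)(ctr >> 16);
        block[14] = (uint8_t)(ctr >> 8);
        block[15] = (uint8_t)ctr;
}
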
*/ if (rv != CRYPTO_SUCCESS) { clear_fpu_regs(); kfpu_end(); @@ -1445,6 +1948,21 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) clear_fpu_regs(); kfpu_end(); +#ifdef DEBUG_GCM_ASM + if (ctx->gcm_shadow_ctx != NULL) { + (void) gcm_decrypt_final_isalc(ctx->gcm_shadow_ctx, NULL); + /* Ensure decrypted plaintext and tag are identical. */ + if (memcmp(ctx->gcm_pt_buf, ctx->gcm_shadow_ctx->gcm_pt_buf, + pt_len)) { + cmn_err(CE_WARN, + "AVX vs SSE: decrypt: plaintexts differ!"); + } + if (memcmp(ghash, ctx->gcm_shadow_ctx->gcm_ghash, + ctx->gcm_tag_len)) { + cmn_err(CE_WARN, "AVX vs SSE: decrypt: tags differ!"); + } + } +#endif /* Compare the input authentication tag with what we calculated. */ if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { /* They don't match. */ @@ -1462,7 +1980,7 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) * Initialize the GCM params H, Htabtle and the counter block. Save the * initial counter block. */ -static int +static void gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, const uint8_t *auth_data, size_t auth_data_len, size_t block_size) { @@ -1471,7 +1989,7 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr; const uint8_t *datap = auth_data; - size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + size_t chunk_size = (size_t)GCM_AVX_CHUNK_SIZE_READ; size_t bleft; ASSERT(block_size == GCM_BLOCK_LEN); @@ -1539,10 +2057,291 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, } clear_fpu_regs(); kfpu_end(); +#ifdef DEBUG_GCM_ASM + if (gcm_duplicate_ctx(ctx) != NULL) { + gcm_init_isalc(ctx->gcm_shadow_ctx, iv, iv_len, auth_data, + auth_data_len); + + if (memcmp(ctx->gcm_J0, ctx->gcm_shadow_ctx->gcm_J0, 16)) { + cmn_err(CE_WARN, "AVX vs SSE: init: ICBs differ!"); + } + if (memcmp(ctx->gcm_H, ctx->gcm_shadow_ctx->gcm_H, 16)) { + cmn_err(CE_WARN, + "AVX vs SSE: init: hash keys differ!"); + } + } +#endif + +} +#endif /* ifdef CAN_USE_GCM_ASM_AVX */ + +/* + * Initialize the GCM params H, Htabtle and the counter block. Save the + * initial counter block. + * + */ + +static inline void +gcm_init_isalc(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, + const uint8_t *auth_data, size_t auth_data_len) +{ + /* + * We know that iv_len must be 12 since that's the only iv_len isalc + * supports, and we made sure it's 12 before calling here. + */ + ASSERT3U(iv_len, ==, 12UL); + + const uint8_t *aad = auth_data; + size_t aad_len = auth_data_len; + size_t tag_len = ctx->gcm_tag_len; + + int impl = get_isalc_gcm_impl_index((const gcm_ctx_t *)ctx); + int keylen = get_isalc_gcm_keylen_index((const gcm_ctx_t *)ctx); + + kfpu_begin(); + (*(isalc_ops.igo_precomp[impl][keylen]))(ctx); /* Init H and Htab */ + (*(isalc_ops.igo_init[impl][keylen]))(ctx, iv, aad, aad_len, tag_len); + kfpu_end(); +} + + +/* + * Encrypt multiple blocks of data in GCM mode. + * This is done in gcm_isalc_chunk_size chunks, utilizing ported Intel(R) + * Intelligent Storage Acceleration Library Crypto Version SIMD assembler + * routines. While processing a chunk the FPU is "locked". 
+ */ +static inline int +gcm_mode_encrypt_contiguous_blocks_isalc(gcm_ctx_t *ctx, const uint8_t *data, + size_t length, crypto_data_t *out) +{ + size_t bleft = length; + size_t chunk_size = (size_t)GCM_ISALC_CHUNK_SIZE_READ; + uint8_t *ct_buf = NULL; + int ct_buf_size; + + /* + * XXXX: It may make sense to allocate a multiple of 'chunk_size' + * up to 'length' to reduce the overhead of crypto_put_output_data() + * and to keep the caches warm. + */ + /* Allocate a buffer to encrypt to. */ + if (bleft >= chunk_size) { + ct_buf_size = chunk_size; + } else { + ct_buf_size = bleft; + } + ct_buf = vmem_alloc(ct_buf_size, KM_SLEEP); + if (ct_buf == NULL) { + return (CRYPTO_HOST_MEMORY); + } + + /* Do the bulk encryption in chunk_size blocks. */ + int impl = get_isalc_gcm_impl_index((const gcm_ctx_t *)ctx); + int keylen = get_isalc_gcm_keylen_index((const gcm_ctx_t *)ctx); + const uint8_t *datap = data; + int rv = CRYPTO_SUCCESS; + + for (; bleft >= chunk_size; bleft -= chunk_size) { + kfpu_begin(); + (*(isalc_ops.igo_enc_update[impl][keylen]))( + ctx, ct_buf, datap, chunk_size); + + kfpu_end(); + datap += chunk_size; +#ifdef DEBUG_GCM_ASM + if (ctx->gcm_is_shadow == B_TRUE) { + continue; + } +#endif + rv = crypto_put_output_data(ct_buf, out, chunk_size); + if (rv != CRYPTO_SUCCESS) { + /* Indicate that we're done. */ + bleft = 0; + break; + } + out->cd_offset += chunk_size; + + } + /* Check if we are already done. */ + if (bleft > 0) { + /* Bulk encrypt the remaining data. */ + kfpu_begin(); + (*(isalc_ops.igo_enc_update[impl][keylen]))( + ctx, ct_buf, datap, bleft); + + kfpu_end(); + +#ifdef DEBUG_GCM_ASM + if (ctx->gcm_is_shadow == B_TRUE) { + if (ct_buf != NULL) { + vmem_free(ct_buf, ct_buf_size); + } + return (CRYPTO_SUCCESS); + + } +#endif + rv = crypto_put_output_data(ct_buf, out, bleft); + if (rv == CRYPTO_SUCCESS) { + out->cd_offset += bleft; + } + } + if (ct_buf != NULL) { + vmem_free(ct_buf, ct_buf_size); + } + return (rv); +} + +/* + * XXXX: IIRC inplace ops have a performance penalty in isalc but I can't + * find it anymore + */ +/* + * Finalize decryption: We just have accumulated crypto text, so now we + * decrypt it here inplace. + */ +static inline int +gcm_decrypt_final_isalc(gcm_ctx_t *ctx, crypto_data_t *out) +{ + ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len); + + size_t chunk_size = (size_t)GCM_ISALC_CHUNK_SIZE_READ; + size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; + uint8_t *datap = ctx->gcm_pt_buf; + + /* + * The isalc routines will increment ctx->gcm_processed_data_len + * on decryption, so reset it. + */ + ctx->gcm_processed_data_len = 0; + + int impl = get_isalc_gcm_impl_index((const gcm_ctx_t *)ctx); + int keylen = get_isalc_gcm_keylen_index((const gcm_ctx_t *)ctx); + + /* Decrypt in chunks of gcm_avx_chunk_size. */ + size_t bleft; + for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) { + kfpu_begin(); + (*(isalc_ops.igo_dec_update[impl][keylen]))( + ctx, datap, datap, chunk_size); + kfpu_end(); + datap += chunk_size; + } + /* + * Decrypt remainder, which is less than chunk size, in one go and + * finish the tag. Since this won't consume much time, do it in a + * single kfpu block. dec_update() will handle a zero bleft properly. 
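
To make the pointer arithmetic in gcm_decrypt_final_isalc easier to follow, this is the layout of the accumulated buffer when the final call runs; a sketch only, mirroring the computations in the code:

/*
 * gcm_pt_buf at decrypt-final time:
 *
 *   [ 0 .. pt_len - 1 ]                   ciphertext, decrypted in place
 *   [ pt_len .. pt_len + tag_len - 1 ]    expected tag, compared to gcm_ghash
 *
 * with pt_len = gcm_processed_data_len - gcm_tag_len on entry.
 */
size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
const uint8_t *expected_tag = ctx->gcm_pt_buf + pt_len;
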
+ */ + kfpu_begin(); + (*(isalc_ops.igo_dec_update[impl][keylen]))(ctx, datap, datap, bleft); + datap += bleft; + (*(isalc_ops.igo_dec_finalize[impl][keylen]))(ctx); + kfpu_end(); + + ASSERT3U(ctx->gcm_processed_data_len, ==, pt_len); + + /* + * Compare the input authentication tag with what we calculated. + * datap points to the expected tag at the end of ctx->gcm_pt_buf. + */ + if (memcmp(datap, ctx->gcm_ghash, ctx->gcm_tag_len)) { + /* They don't match. */ + return (CRYPTO_INVALID_MAC); + } +#ifdef DEBUG_GCM_ASM + if (ctx->gcm_is_shadow == B_TRUE) { + return (CRYPTO_SUCCESS); + } +#endif + int rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); + if (rv != CRYPTO_SUCCESS) { + return (rv); + } + out->cd_offset += pt_len; + /* io/aes.c asserts this, so be nice and meet expectations. */ + ctx->gcm_remainder_len = 0; + + /* Sensitive data in the context is cleared on ctx destruction. */ + return (CRYPTO_SUCCESS); +} + +/* + * Finalize the encryption: We have already written out all encrypted data. + * We update the hash with the last incomplete block, calculate + * len(A) || len (C), encrypt gcm->gcm_J0 (initial counter block), calculate + * the tag and store it in gcm->ghash and finally output the tag. + */ +static inline int +gcm_encrypt_final_isalc(gcm_ctx_t *ctx, crypto_data_t *out) +{ + uint64_t tag_len = ctx->gcm_tag_len; + +/* For security measures we pass NULL as the out pointer for shadow contexts. */ +#ifndef DEBUG_GCM_ASM + if (out->cd_length < tag_len) { + return (CRYPTO_DATA_LEN_RANGE); + } +#endif + + int impl = get_isalc_gcm_impl_index((const gcm_ctx_t *)ctx); + int keylen = get_isalc_gcm_keylen_index((const gcm_ctx_t *)ctx); + + kfpu_begin(); + (*(isalc_ops.igo_enc_finalize[impl][keylen]))(ctx); + kfpu_end(); + +#ifdef DEBUG_GCM_ASM + if (ctx->gcm_is_shadow == B_TRUE) { + return (CRYPTO_SUCCESS); + } +#endif + + /* Write the tag out. */ + uint8_t *ghash = (uint8_t *)ctx->gcm_ghash; + int rv = crypto_put_output_data(ghash, out, tag_len); + + if (rv != CRYPTO_SUCCESS) + return (rv); + + out->cd_offset += tag_len; + /* io/aes.c asserts this, so be nice and meet expectations. */ + ctx->gcm_remainder_len = 0; + + /* Sensitive data in the context is cleared on ctx destruction. 
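
Pulling the pieces together, one complete isalc encrypt operation drives the function pointer tables in this order (error handling, FPU bracketing and chunking omitted; impl and keylen are the indices computed by the helpers near the top of this hunk):

/* Illustrative call sequence only; see the functions above for the details. */
(*isalc_ops.igo_precomp[impl][keylen])(ctx);            /* H and Htable      */
(*isalc_ops.igo_init[impl][keylen])(ctx, iv, aad, aad_len, tag_len);
(*isalc_ops.igo_enc_update[impl][keylen])(ctx, out, in, len);   /* 0..n times */
(*isalc_ops.igo_enc_finalize[impl][keylen])(ctx);       /* tag -> gcm_ghash  */

Decryption differs only in using igo_dec_update/igo_dec_finalize and comparing the trailing tag of gcm_pt_buf against gcm_ghash.
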
*/ return (CRYPTO_SUCCESS); } #if defined(_KERNEL) + +static int +icp_gcm_isalc_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) +{ + unsigned long val; + char val_rounded[16]; + int error = 0; + + error = kstrtoul(buf, 0, &val); + if (error) + return (error); + + /* XXXX; introduce #def */ + val = val & ~(512UL - 1UL); + + if (val < GCM_ISALC_MIN_CHUNK_SIZE || val > GCM_ISALC_MAX_CHUNK_SIZE) + return (-EINVAL); + + snprintf(val_rounded, 16, "%u", (uint32_t)val); + error = param_set_uint(val_rounded, kp); + return (error); +} + +module_param_call(icp_gcm_isalc_chunk_size, icp_gcm_isalc_set_chunk_size, + param_get_uint, &gcm_isalc_chunk_size, 0644); + +MODULE_PARM_DESC(icp_gcm_isalc_chunk_size, + "The number of bytes the isalc routines process while owning the FPU"); + +#ifdef CAN_USE_GCM_ASM_AVX static int icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) { @@ -1568,7 +2367,8 @@ module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size, param_get_uint, &gcm_avx_chunk_size, 0644); MODULE_PARM_DESC(icp_gcm_avx_chunk_size, - "How many bytes to process while owning the FPU"); + "The number of bytes the avx routines process while owning the FPU"); +#endif /* ifdef CAN_USE_GCM_ASM_AVX */ #endif /* defined(__KERNEL) */ #endif /* ifdef CAN_USE_GCM_ASM */ diff --git a/module/icp/algs/modes/modes.c b/module/icp/algs/modes/modes.c index 6f6649b3b58b..31a19d2aa594 100644 --- a/module/icp/algs/modes/modes.c +++ b/module/icp/algs/modes/modes.c @@ -180,7 +180,7 @@ gcm_clear_ctx(gcm_ctx_t *ctx) explicit_memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder)); explicit_memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); #if defined(CAN_USE_GCM_ASM) - if (ctx->gcm_use_avx == B_TRUE) { + if (ctx->gcm_simd_impl != GSI_NONE) { ASSERT3P(ctx->gcm_Htable, !=, NULL); memset(ctx->gcm_Htable, 0, ctx->gcm_htab_len); kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len); @@ -193,4 +193,20 @@ gcm_clear_ctx(gcm_ctx_t *ctx) /* Optional */ explicit_memset(ctx->gcm_J0, 0, sizeof (ctx->gcm_J0)); explicit_memset(ctx->gcm_tmp, 0, sizeof (ctx->gcm_tmp)); + +#ifdef DEBUG_GCM_ASM + if (ctx->gcm_shadow_ctx != NULL) { + /* No need to clear data while debugging, just free memory. */ + gcm_ctx_t *sc = ctx->gcm_shadow_ctx; + + if (sc->gcm_Htable != NULL) { + kmem_free(sc->gcm_Htable, sc->gcm_htab_len); + } + if (sc->gcm_pt_buf != NULL) { + vmem_free(sc->gcm_pt_buf, sc->gcm_pt_buf_len); + } + kmem_free(sc, sizeof (gcm_ctx_t)); + ctx->gcm_shadow_ctx = NULL; + } +#endif } diff --git a/module/icp/asm-x86_64/modes/isalc_gcm128_sse.S b/module/icp/asm-x86_64/modes/isalc_gcm128_sse.S index f552d8630073..0d924cf6428f 100644 --- a/module/icp/asm-x86_64/modes/isalc_gcm128_sse.S +++ b/module/icp/asm-x86_64/modes/isalc_gcm128_sse.S @@ -27,5 +27,10 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //####################################################################### +#if defined(__x86_64__) && defined(HAVE_SSE4_1) && defined(HAVE_AES) && \ + defined(HAVE_PCLMULQDQ) + #define GCM128_MODE 1 -#include "isalc_gcm_sse_att.S" +#include "isalc_gcm_sse.S" + +#endif diff --git a/module/icp/asm-x86_64/modes/isalc_gcm192_sse.S b/module/icp/asm-x86_64/modes/isalc_gcm192_sse.S new file mode 100644 index 000000000000..851837a34dd5 --- /dev/null +++ b/module/icp/asm-x86_64/modes/isalc_gcm192_sse.S @@ -0,0 +1,36 @@ +//####################################################################### +// Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in +// the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, +// DATA, OR PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//####################################################################### + +#if defined(__x86_64__) && defined(HAVE_SSE4_1) && defined(HAVE_AES) && \ + defined(HAVE_PCLMULQDQ) + +#define GCM192_MODE 1 +#include "isalc_gcm_sse.S" + +#endif diff --git a/module/icp/asm-x86_64/modes/isalc_gcm256_sse.S b/module/icp/asm-x86_64/modes/isalc_gcm256_sse.S index c88cb0ed055f..75b99f664348 100644 --- a/module/icp/asm-x86_64/modes/isalc_gcm256_sse.S +++ b/module/icp/asm-x86_64/modes/isalc_gcm256_sse.S @@ -27,5 +27,10 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
////////////////////////////////////////////////////////////////////////// +#if defined(__x86_64__) && defined(HAVE_SSE4_1) && defined(HAVE_AES) && \ + defined(HAVE_PCLMULQDQ) + #define GCM256_MODE 1 -#include "isalc_gcm_sse_att.S" +#include "isalc_gcm_sse.S" + +#endif diff --git a/module/icp/asm-x86_64/modes/isalc_gcm_defines.S b/module/icp/asm-x86_64/modes/isalc_gcm_defines.S index 00ec4c654d9f..825f46a52dc6 100644 --- a/module/icp/asm-x86_64/modes/isalc_gcm_defines.S +++ b/module/icp/asm-x86_64/modes/isalc_gcm_defines.S @@ -36,10 +36,10 @@ // Vinodh Gopal // James Guilford +// Port to GNU as, translation to GNU as att-syntax and adoptions for the ICP +// Copyright(c) 2023 Attila Fülöp -//////////// - -.section .rodata +SECTION_STATIC .balign 16 POLY: .quad 0x0000000000000001, 0xC200000000000000 @@ -181,76 +181,146 @@ mask_out_top_block: .section .text +// #define KEYSCHED_LEN (15 * GCM_BLOCKSIZE) +// #define AES_KEY_LEN (2 * KEYSCHED_LEN + 16 + 8 + 4 + 4) // 512 -////define the fields of gcm_data struct -//typedef struct gcm_data -//{ -// u8 expanded_keys[16*15]// -// u8 shifted_hkey_1[16]// // store HashKey <<1 mod poly here -// u8 shifted_hkey_2[16]// // store HashKey^2 <<1 mod poly here -// u8 shifted_hkey_3[16]// // store HashKey^3 <<1 mod poly here -// u8 shifted_hkey_4[16]// // store HashKey^4 <<1 mod poly here -// u8 shifted_hkey_5[16]// // store HashKey^5 <<1 mod poly here -// u8 shifted_hkey_6[16]// // store HashKey^6 <<1 mod poly here -// u8 shifted_hkey_7[16]// // store HashKey^7 <<1 mod poly here -// u8 shifted_hkey_8[16]// // store HashKey^8 <<1 mod poly here -// u8 shifted_hkey_1_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) -// u8 shifted_hkey_2_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) -// u8 shifted_hkey_3_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) -// u8 shifted_hkey_4_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) -// u8 shifted_hkey_5_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) -// u8 shifted_hkey_6_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) -// u8 shifted_hkey_7_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) -// u8 shifted_hkey_8_k[16]// // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) -//} gcm_data// +// Offsets into struct gcm_ctx: +// +// typedef struct gcm_ctx { +// void *gcm_keysched; OFFSET: 0 = 0 +// size_t gcm_keysched_len; OFFSET: 1*8 = 8 +// uint64_t gcm_cb[2]; OFFSET: 2*8 = 16 +// uint64_t gcm_remainder[2]; OFFSET: 4*8 = 32 +// size_t gcm_remainder_len; OFFSET: 6*8 = 48 +// uint8_t *gcm_lastp; OFFSET: 7*8 = 56 +// uint8_t *gcm_copy_to; OFFSET: 8*8 = 64 +// uint32_t gcm_flags; OFFSET: 9*8 = 72 +// size_t gcm_tag_len; OFFSET: 10*8 = 80 +// size_t gcm_processed_data_len; OFFSET: 11*8 = 88 +// size_t gcm_pt_buf_len; OFFSET: 12*8 = 96 +// uint32_t gcm_tmp[4]; OFFSET: 13*8 = 104 +// uint64_t gcm_ghash[2]; OFFSET: 15*8 = 120 +// uint64_t gcm_H[2]; OFFSET: 17*8 = 136 +// uint64_t *gcm_Htable; OFFSET: 19*8 = 152 +// size_t gcm_htab_len; OFFSET: 20*8 = 160 +// uint64_t gcm_J0[2]; OFFSET: 21*8 = 168 +// uint64_t gcm_len_a_len_c[2]; OFFSET: 23*8 = 184 +// uint8_t *gcm_pt_buf; 
OFFSET: 25*8 = 200 +// gcm_simd_impl_t gcm_simd_impl; OFFSET: 26*8 = 208 +// } gcm_ctx_t; SIZE: = 216 + +// AadHash: +// Store current Hash of data which has been input: gcm_ctx->ghash. +// +// AadLen: +// Store length of input data which will not be encrypted or decrypted: +// gcm_ctx->gcm_tag_len. +// +// InLen: +// Store length of input data which will be encrypted or decrypted: +// gcm_ctx->gcm_processed_data_len. +// +// PBlockEncKey: +// Encryption key for the partial block at the end of the previous update: +// no real match, use: gcm_ctx->gcm_remainder. +// +// OrigIV: +// The initial counter: 12 bytes IV with (int32_t) 1 appended: +// gcm_ctx->gcm_J0. +// +// CurCount: +// Current counter for generation of encryption key: gcm_ctx->gcm_cb. +// +// PBlockLen: +// Length of partial block at the end of the previous update: +// gcm_ctx->gcm_remainder_len. + +#define KeySched 0 // gcm_ctx->gcm_keysched +#define AadHash (15*8) // gcm_ctx->gcm_ghash +#define AadLen (23*8) // gcm_ctx->gcm_len_a_len_c[0] +#define TagLen (10*8) // gcm_ctx->gcm_tag_len +#define InLen (11*8) // gcm_ctx->gcm_processed_data_len +#define PBlockEncKey (4*8) // gcm_ctx->gcm_remainder +#define OrigIV (21*8) // gcm_ctx->gcm_J0 +#define CurCount (2*8) // gcm_ctx->gcm_cb +#define PBlockLen (6*8) // gcm_ctx->gcm_remainder_len +#define GcmH (17*8) // gcm_ctx->gcm_H +#define GcmHtab (19*8) // gcm_ctx->gcm_Htable +#define LenALenC (23*8) // gcm_ctx->gcm_len_a_len_c + +// Define the offsets into gcm_ctx of the fields fields of gcm_htab. +// u8 shifted_hkey_1[16] store HashKey <<1 mod poly here +// u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here +// u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here +// u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here +// u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here +// u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here +// u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here +// u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here +// u8 shifted_hkey_1_k[16] store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_2_k[16] store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_3_k[16] store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_4_k[16] store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_5_k[16] store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_6_k[16] store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_7_k[16] store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +// u8 shifted_hkey_8_k[16] store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) + +#define GCM_BLOCKSIZE 16 #ifndef GCM_KEYS_VAES_AVX512_INCLUDED -#define HashKey 16*15 // store HashKey <<1 mod poly here -#define HashKey_1 16*15 // store HashKey <<1 mod poly here -#define HashKey_2 16*16 // store HashKey^2 <<1 mod poly here -#define HashKey_3 16*17 // store HashKey^3 <<1 mod poly here -#define HashKey_4 16*18 // store HashKey^4 <<1 mod poly here -#define HashKey_5 16*19 // store HashKey^5 <<1 mod poly here -#define HashKey_6 16*20 // store HashKey^6 <<1 mod poly here -#define HashKey_7 16*21 // store HashKey^7 <<1 
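
Since these byte offsets duplicate the gcm_ctx_t layout by hand, it may be worth pinning them on the C side; a sketch of such compile-time checks, assuming they would live next to the struct definition (placement and use of _Static_assert are assumptions, not part of this patch):

#include <stddef.h>

_Static_assert(offsetof(gcm_ctx_t, gcm_remainder_len) == 6 * 8,  "PBlockLen");
_Static_assert(offsetof(gcm_ctx_t, gcm_ghash)         == 15 * 8, "AadHash");
_Static_assert(offsetof(gcm_ctx_t, gcm_Htable)        == 19 * 8, "GcmHtab");
_Static_assert(offsetof(gcm_ctx_t, gcm_J0)            == 21 * 8, "OrigIV");
_Static_assert(offsetof(gcm_ctx_t, gcm_len_a_len_c)   == 23 * 8, "LenALenC");
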
mod poly here -#define HashKey_8 16*22 // store HashKey^8 <<1 mod poly here -#define HashKey_k 16*23 // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) -#define HashKey_2_k 16*24 // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) -#define HashKey_3_k 16*25 // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) -#define HashKey_4_k 16*26 // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) -#define HashKey_5_k 16*27 // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) -#define HashKey_6_k 16*28 // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) -#define HashKey_7_k 16*29 // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) -#define HashKey_8_k 16*30 // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +#define HashKey (GCM_BLOCKSIZE * 0) +#define HashKey_1 (GCM_BLOCKSIZE * 0) +#define HashKey_2 (GCM_BLOCKSIZE * 1) +#define HashKey_3 (GCM_BLOCKSIZE * 2) +#define HashKey_4 (GCM_BLOCKSIZE * 3) +#define HashKey_5 (GCM_BLOCKSIZE * 4) +#define HashKey_6 (GCM_BLOCKSIZE * 5) +#define HashKey_7 (GCM_BLOCKSIZE * 6) +#define HashKey_8 (GCM_BLOCKSIZE * 7) +#define HashKey_k (GCM_BLOCKSIZE * 8) +#define HashKey_2_k (GCM_BLOCKSIZE * 9) +#define HashKey_3_k (GCM_BLOCKSIZE * 10) +#define HashKey_4_k (GCM_BLOCKSIZE * 11) +#define HashKey_5_k (GCM_BLOCKSIZE * 12) +#define HashKey_6_k (GCM_BLOCKSIZE * 13) +#define HashKey_7_k (GCM_BLOCKSIZE * 14) +#define HashKey_8_k (GCM_BLOCKSIZE * 15) #endif -#define AadHash 16*0 // store current Hash of data which has been input -#define AadLen 16*1 // store length of input data which will not be encrypted or decrypted -#define InLen (16*1)+8 // store length of input data which will be encrypted or decrypted -#define PBlockEncKey 16*2 // encryption key for the partial block at the end of the previous update -#define OrigIV 16*3 // input IV -#define CurCount 16*4 // Current counter for generation of encryption key -#define PBlockLen 16*5 // length of partial block at the end of the previous update - .macro xmmreg name, num .set xmm\name, %xmm\num .endm +// Push a 64 bit register to the stack and generate the needed CFI directives. +.macro CFI_PUSHQ REG, OFFS + pushq \REG + .cfi_adjust_cfa_offset 8 + .cfi_offset \REG, \OFFS +.endm + +// Pop a 64 bit register from the stack and generate the needed CFI directives. +.macro CFI_POPQ REG + popq \REG + .cfi_restore \REG + .cfi_adjust_cfa_offset -8 +.endm + #define arg(x) (STACK_OFFSET + 8*(x))(%r14) +/* +.macro STACK_FRAME_NON_STANDARD func:req + .pushsection .discard.func_stack_frame_non_standard, "aw" +- .long \func - . 
++#ifdef CONFIG_64BIT ++ .quad \func ++#else ++ .long \func ++#endif + .popsection +.endm +*/ -#if __OUTPUT_FORMAT__ != elf64 -#define arg1 %rcx -#define arg2 %rdx -#define arg3 %r8 -#define arg4 %r9 -#define arg5 %rsi -#define arg6 (STACK_OFFSET + 8*6)(%r14) -#define arg7 (STACK_OFFSET + 8*7)(%r14) -#define arg8 (STACK_OFFSET + 8*8)(%r14) -#define arg9 (STACK_OFFSET + 8*9)(%r14) -#define arg10 (STACK_OFFSET + 8*10)(%r14) -#else #define arg1 %rdi #define arg2 %rsi #define arg3 %rdx @@ -261,7 +331,6 @@ mask_out_top_block: #define arg8 ((STACK_OFFSET) + 8*2)(%r14) #define arg9 ((STACK_OFFSET) + 8*3)(%r14) #define arg10 ((STACK_OFFSET) + 8*4)(%r14) -#endif #ifdef NT_LDST #define NT_LD diff --git a/module/icp/asm-x86_64/modes/isalc_gcm_sse.S b/module/icp/asm-x86_64/modes/isalc_gcm_sse.S index 5d5be5068904..fab97e7d8408 100644 --- a/module/icp/asm-x86_64/modes/isalc_gcm_sse.S +++ b/module/icp/asm-x86_64/modes/isalc_gcm_sse.S @@ -116,30 +116,59 @@ // for GHASH part, two tabs is for AES part. // -// .altmacro +// Port to GNU as, translation to GNU as att-syntax and adoptions for the ICP +// Copyright(c) 2023 Attila Fülöp + .att_syntax prefix -#include "isalc_reg_sizes_att.S" -#include "isalc_gcm_defines_att.S" +#define _ASM +#include -#if !defined(GCM128_MODE) && !defined(GCM256_MODE) +#if !defined(GCM128_MODE) && !defined(GCM192_MODE) && !defined(GCM256_MODE) #error "No GCM mode selected for gcm_sse.S!" #endif -#if defined(FUNCT_EXTENSION) -#error "No support for non-temporal versions yet!" +#if 0 +#ifdef GCM128_MODE +#define FN_NAME(x,y) ENTRY_NP(icp_isalc_gcm_ ## x ## _128 ## y ## sse) +//#define FN_NAME(x,y) aes_gcm_ ## x ## _128 ## y ## sse: +#define NROUNDS 9 +#endif + +#ifdef GCM192_MODE +#define FN_NAME(x,y) ENTRY(icp_isalc_gcm_ ## x ## _192 ## y ## sse) +#define NROUNDS 11 #endif -#define _nt 1 +#ifdef GCM256_MODE +#define FN_NAME(x,y) ENTRY(icp_isalc_gcm_ ## x ## _256 ## y ## sse) +#define NROUNDS 13 +#endif +#else #ifdef GCM128_MODE -#define FN_NAME(x,y) aes_gcm_ ## x ## _128 ## y ## sse +#define FN_NAME(x,y) icp_isalc_gcm_ ## x ## _128 ## y ## sse #define NROUNDS 9 #endif +#ifdef GCM192_MODE +#define FN_NAME(x,y) icp_isalc_gcm_ ## x ## _192 ## y ## sse +#define NROUNDS 11 +#endif + #ifdef GCM256_MODE -#define FN_NAME(x,y) aes_gcm_ ## x ## _256 ## y ## sse +#define FN_NAME(x,y) icp_isalc_gcm_ ## x ## _256 ## y ## sse #define NROUNDS 13 #endif +#endif + +#include "isalc_reg_sizes.S" +#include "isalc_gcm_defines.S" + + +#if defined(FUNCT_EXTENSION) +#error "No support for non-temporal versions yet!" +#endif +#define _nt 1 // need to push 5 registers into stack to maintain @@ -235,59 +264,59 @@ //////////////////////////////////////////////////////////////////////////////// // PRECOMPUTE: Precompute HashKey_{2..8} and HashKey{,_{2..8}}_k. -// HasKey_i_k holds XORed values of the low and high parts of the Haskey_i. +// HashKey_i_k holds XORed values of the low and high parts of the Haskey_i. 
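
For readers unfamiliar with the Karatsuba trick used here: storing the XOR of the high and low 64-bit halves of every hash-key power saves one carry-less multiply per block. A compact sketch of the identity in C, using a hypothetical clmul64() stand-in for PCLMULQDQ (not a real kernel interface):

/*
 * In GF(2^128), with A = a1*x^64 + a0 and B = b1*x^64 + b0 (XOR is addition):
 *
 *   A*B = a1*b1*x^128 ^ ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)*x^64 ^ a0*b0
 *
 * so precomputing bk = b1 ^ b0 (the HashKey_i_k entries) leaves only three
 * 64x64 carry-less multiplies per block instead of four.
 */
typedef struct { uint64_t lo, hi; } u128_t;
u128_t clmul64(uint64_t a, uint64_t b);         /* hypothetical: 64x64 -> 128 */

static void
ghash_karatsuba(uint64_t a1, uint64_t a0, uint64_t b1, uint64_t b0, uint64_t bk,
    u128_t *hi, u128_t *mid, u128_t *lo)
{
        *hi  = clmul64(a1, b1);                 /* coefficient of x^128 */
        *lo  = clmul64(a0, b0);                 /* coefficient of x^0   */
        *mid = clmul64(a1 ^ a0, bk);            /* caller still XORs in */
                                                /* *hi and *lo          */
}
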
//////////////////////////////////////////////////////////////////////////////// -.macro PRECOMPUTE GDATA, HK, T1, T2, T3, T4, T5, T6 +.macro PRECOMPUTE HTAB, HK, T1, T2, T3, T4, T5, T6 movdqa \HK, \T4 pshufd $0b01001110, \HK, \T1 pxor \HK, \T1 - movdqu \T1, HashKey_k(\GDATA) + movdqu \T1, HashKey_k(\HTAB) GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^2<<1 mod poly - movdqu \T4, HashKey_2(\GDATA) // [HashKey_2] = HashKey^2<<1 mod poly + movdqu \T4, HashKey_2(\HTAB) // [HashKey_2] = HashKey^2<<1 mod poly pshufd $0b01001110, \T4, \T1 pxor \T4, \T1 - movdqu \T1, HashKey_2_k(\GDATA) + movdqu \T1, HashKey_2_k(\HTAB) GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^3<<1 mod poly - movdqu \T4, HashKey_3(\GDATA) + movdqu \T4, HashKey_3(\HTAB) pshufd $0b01001110, \T4, \T1 pxor \T4, \T1 - movdqu \T1, HashKey_3_k(\GDATA) + movdqu \T1, HashKey_3_k(\HTAB) GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^4<<1 mod poly - movdqu \T4, HashKey_4(\GDATA) + movdqu \T4, HashKey_4(\HTAB) pshufd $0b01001110, \T4, \T1 pxor \T4, \T1 - movdqu \T1, HashKey_4_k(\GDATA) + movdqu \T1, HashKey_4_k(\HTAB) GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^5<<1 mod poly - movdqu \T4, HashKey_5(\GDATA) + movdqu \T4, HashKey_5(\HTAB) pshufd $0b01001110, \T4, \T1 pxor \T4, \T1 - movdqu \T1, HashKey_5_k(\GDATA) + movdqu \T1, HashKey_5_k(\HTAB) GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^6<<1 mod poly - movdqu \T4, HashKey_6(\GDATA) + movdqu \T4, HashKey_6(\HTAB) pshufd $0b01001110, \T4, \T1 pxor \T4, \T1 - movdqu \T1, HashKey_6_k(\GDATA) + movdqu \T1, HashKey_6_k(\HTAB) GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^7<<1 mod poly - movdqu \T4, HashKey_7(\GDATA) + movdqu \T4, HashKey_7(\HTAB) pshufd $0b01001110, \T4, \T1 pxor \T4, \T1 - movdqu \T1, HashKey_7_k(\GDATA) + movdqu \T1, HashKey_7_k(\HTAB) GHASH_MUL \T4, \HK, \T1, \T2, \T3, \T5, \T6 // \T4 = HashKey^8<<1 mod poly - movdqu \T4, HashKey_8(\GDATA) + movdqu \T4, HashKey_8(\HTAB) pshufd $0b01001110, \T4, \T1 pxor \T4, \T1 - movdqu \T1, HashKey_8_k(\GDATA) + movdqu \T1, HashKey_8_k(\HTAB) .endm // PRECOMPUTE @@ -397,7 +426,7 @@ _CALC_AAD_done_\@: //////////////////////////////////////////////////////////////////////////////// // PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks // between update calls. Requires the input data be at least 1 byte long. -// Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text +// Input: gcm_key_data (GCM_HTAB), gcm_context_data (GDATA_CTX), input text // (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN), the current data offset // (DATA_OFFSET), and whether encoding or decoding (ENC_DEC). 
// Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated @@ -405,7 +434,7 @@ _CALC_AAD_done_\@: // Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, // xmm10, xmm11, xmm13 //////////////////////////////////////////////////////////////////////////////// -.macro PARTIAL_BLOCK GDATA_KEY, GDATA_CTX, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ +.macro PARTIAL_BLOCK GCM_HTAB, GDATA_CTX, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ PLAIN_CYPH_LEN, DATA_OFFSET, AAD_HASH, ENC_DEC // clang compat: no local support @@ -432,7 +461,7 @@ _data_read_\@: //Finished reading in data movdqu PBlockEncKey(\GDATA_CTX), %xmm9 //xmm9 = ctx_data.partial_block_enc_key - movdqu HashKey(\GDATA_KEY), %xmm13 + movdqu HashKey(\GCM_HTAB), %xmm13 lea SHIFT_MASK(%rip), %r12 @@ -440,7 +469,7 @@ _data_read_\@: //Finished reading in data movdqu (%r12), %xmm2 // get the appropriate shuffle mask pshufb %xmm2, %xmm9 // shift right r13 bytes - .ifc \ENC_DEC, DEC + .ifc \ENC_DEC, DEC // We are decrypting. movdqa %xmm1, %xmm3 pxor %xmm1, %xmm9 // Cyphertext XOR E(K, Yn) @@ -473,7 +502,7 @@ _partial_incomplete_1_\@: _dec_done_\@: movdqu \AAD_HASH, AadHash(\GDATA_CTX) - .else // .ifc \ENC_DEC, DEC + .else // .ifc \ENC_DEC, DEC; We are encrypting. pxor %xmm1, %xmm9 // Plaintext XOR E(K, Yn) @@ -542,11 +571,11 @@ _partial_block_done_\@: // INITIAL_BLOCKS: If a = number of total plaintext bytes; b = floor(a/16); // \num_initial_blocks = b mod 8; encrypt the initial \num_initial_blocks // blocks and apply ghash on the ciphertext. -// \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, r14 are used as a +// \KEYSCHED, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, r14 are used as a // pointer only, not modified. // Updated AAD_HASH is returned in \T3. //////////////////////////////////////////////////////////////////////////////// -.macro INITIAL_BLOCKS GDATA_KEY, GDATA_CTX, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ +.macro INITIAL_BLOCKS KEYSCHED, GDATA_CTX, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ LENGTH, DATA_OFFSET, num_initial_blocks, T1, HASH_KEY, \ T3, T4, T5, CTR, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, \ XMM7, XMM8, T6, T_key, ENC_DEC @@ -566,13 +595,13 @@ _partial_block_done_\@: .set i, (9-\num_initial_blocks) .rept \num_initial_blocks xmmreg i, %i - paddd ONE(%rip), \CTR // INCR Y0 + paddd ONE(%rip), \CTR // INCR Y0 movdqa \CTR, xmmi - pshufb SHUF_MASK(%rip), xmmi // perform a 16Byte swap + pshufb SHUF_MASK(%rip), xmmi // perform a 16Byte swap .set i, (i+1) .endr -movdqu 16*0(\GDATA_KEY), \T_key +movdqu 16*0(\KEYSCHED), \T_key .set i, (9-\num_initial_blocks) .rept \num_initial_blocks xmmreg i, %i @@ -581,8 +610,8 @@ movdqu 16*0(\GDATA_KEY), \T_key .endr .set j, 1 -.rept NROUNDS // encrypt N blocks with 13 key rounds (11 for GCM192) -movdqu 16*j(\GDATA_KEY), \T_key +.rept NROUNDS // encrypt N blocks with 13 key rounds (11 for GCM192) +movdqu 16*j(\KEYSCHED), \T_key .set i, (9-\num_initial_blocks) .rept \num_initial_blocks xmmreg i, %i @@ -593,7 +622,7 @@ movdqu 16*j(\GDATA_KEY), \T_key .set j, (j+1) .endr -movdqu 16*j(\GDATA_KEY), \T_key // encrypt with last (14th) key round (12 for GCM192) +movdqu 16*j(\KEYSCHED), \T_key // encrypt with last (14th) key round (12 for GCM192) .set i, (9-\num_initial_blocks) .rept \num_initial_blocks xmmreg i, %i @@ -668,7 +697,7 @@ movdqu 16*j(\GDATA_KEY), \T_key // encrypt with last (14th) key round (12 for movdqa \CTR, \XMM8 pshufb SHUF_MASK(%rip), \XMM8 // perform a 16Byte swap - movdqu 16*0(\GDATA_KEY), \T_key + movdqu 16*0(\KEYSCHED), \T_key pxor \T_key, \XMM1 pxor \T_key, \XMM2 pxor 
\T_key, \XMM3 @@ -680,7 +709,7 @@ movdqu 16*j(\GDATA_KEY), \T_key // encrypt with last (14th) key round (12 for .set i, 1 .rept NROUNDS // do early (13) rounds (11 for GCM192) - movdqu 16*i(\GDATA_KEY), \T_key + movdqu 16*i(\KEYSCHED), \T_key aesenc \T_key, \XMM1 aesenc \T_key, \XMM2 aesenc \T_key, \XMM3 @@ -692,7 +721,7 @@ movdqu 16*j(\GDATA_KEY), \T_key // encrypt with last (14th) key round (12 for .set i, (i+1) .endr - movdqu 16*i(\GDATA_KEY), \T_key // do final key round + movdqu 16*i(\KEYSCHED), \T_key // do final key round aesenclast \T_key, \XMM1 aesenclast \T_key, \XMM2 aesenclast \T_key, \XMM3 @@ -780,14 +809,14 @@ _initial_blocks_done_\@: //////////////////////////////////////////////////////////////////////////////// // GHASH_8_ENCRYPT_8_PARALLEL: Encrypt 8 blocks at a time and ghash the 8 // previously encrypted ciphertext blocks. -// \GDATA (KEY), \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN are used as pointers only, -// not modified. +// \KEYSCHED, \GCM_HTAB, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN are used as pointers +// only, not modified. // \DATA_OFFSET is the data offset value //////////////////////////////////////////////////////////////////////////////// -.macro GHASH_8_ENCRYPT_8_PARALLEL GDATA, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ - DATA_OFFSET, T1, T2, T3, T4, T5, T6, CTR, \ - XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, \ - XMM8, T7, loop_idx, ENC_DEC +.macro GHASH_8_ENCRYPT_8_PARALLEL KEYSCHED, GCM_HTAB, CYPH_PLAIN_OUT, \ + PLAIN_CYPH_IN, DATA_OFFSET, T1, T2, T3, T4, \ + T5, T6, CTR, XMM1, XMM2, XMM3, XMM4, XMM5, \ + XMM6, XMM7, XMM8, T7, loop_idx, ENC_DEC movdqa \XMM1, \T7 @@ -810,10 +839,10 @@ _initial_blocks_done_\@: .else paddd ONEf(%rip), \CTR // INCR CNT .endif - movdqu HashKey_8(\GDATA), \T5 + movdqu HashKey_8(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T4 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \T7 // \T7 = a0*b0 - movdqu HashKey_8_k(\GDATA), \T5 + movdqu HashKey_8_k(\GCM_HTAB), \T5 pclmulqdq $0x00, \T5, \T6 // \T2 = (a1+a0)*(b1+b0) movdqa \CTR, \XMM1 @@ -875,7 +904,7 @@ _initial_blocks_done_\@: .endif // .ifc \loop_idx, in_order //////////////////////////////////////////////////////////////////////// - movdqu 16*0(\GDATA), \T1 + movdqu 16*0(\KEYSCHED), \T1 pxor \T1, \XMM1 pxor \T1, \XMM2 pxor \T1, \XMM3 @@ -894,16 +923,16 @@ _initial_blocks_done_\@: pshufd $0b01001110, \T3, \T2 pxor \T3, \T2 - movdqu HashKey_7(\GDATA), \T5 + movdqu HashKey_7(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 - movdqu HashKey_7_k(\GDATA), \T5 + movdqu HashKey_7_k(\GCM_HTAB), \T5 pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part pxor \T3, \T7 pxor \T2, \T6 - movdqu 16*1(\GDATA), \T1 + movdqu 16*1(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -913,7 +942,7 @@ _initial_blocks_done_\@: aesenc \T1, \XMM7 aesenc \T1, \XMM8 - movdqu 16*2(\GDATA), \T1 + movdqu 16*2(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -930,16 +959,16 @@ _initial_blocks_done_\@: pshufd $0b01001110, \T3, \T2 pxor \T3, \T2 - movdqu HashKey_6(\GDATA), \T5 + movdqu HashKey_6(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 - movdqu HashKey_6_k(\GDATA), \T5 + movdqu HashKey_6_k(\GCM_HTAB), \T5 pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part pxor \T3, \T7 pxor \T2, \T6 - movdqu 16*3(\GDATA), \T1 + movdqu 16*3(\KEYSCHED), \T1 aesenc \T1, \XMM1 
aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -954,16 +983,16 @@ _initial_blocks_done_\@: pshufd $0b01001110, \T3, \T2 pxor \T3, \T2 - movdqu HashKey_5(\GDATA), \T5 + movdqu HashKey_5(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 - movdqu HashKey_5_k(\GDATA), \T5 + movdqu HashKey_5_k(\GCM_HTAB), \T5 pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part pxor \T3, \T7 pxor \T2, \T6 - movdqu 16*4(\GDATA), \T1 + movdqu 16*4(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -973,7 +1002,7 @@ _initial_blocks_done_\@: aesenc \T1, \XMM7 aesenc \T1, \XMM8 - movdqu 16*5(\GDATA), \T1 + movdqu 16*5(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -988,16 +1017,16 @@ _initial_blocks_done_\@: pshufd $0b01001110, \T3, \T2 pxor \T3, \T2 - movdqu HashKey_4(\GDATA), \T5 + movdqu HashKey_4(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 - movdqu HashKey_4_k(\GDATA), \T5 + movdqu HashKey_4_k(\GCM_HTAB), \T5 pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part pxor \T3, \T7 pxor \T2, \T6 - movdqu 16*6(\GDATA), \T1 + movdqu 16*6(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -1013,16 +1042,16 @@ _initial_blocks_done_\@: pshufd $0b01001110, \T3, \T2 pxor \T3, \T2 - movdqu HashKey_3(\GDATA), \T5 + movdqu HashKey_3(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 - movdqu HashKey_3_k(\GDATA), \T5 + movdqu HashKey_3_k(\GCM_HTAB), \T5 pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part pxor \T3, \T7 pxor \T2, \T6 - movdqu 16*7(\GDATA), \T1 + movdqu 16*7(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -1037,16 +1066,16 @@ _initial_blocks_done_\@: pshufd $0b01001110, \T3, \T2 pxor \T3, \T2 - movdqu HashKey_2(\GDATA), \T5 + movdqu HashKey_2(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 - movdqu HashKey_2_k(\GDATA), \T5 + movdqu HashKey_2_k(\GCM_HTAB), \T5 pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part pxor \T3, \T7 pxor \T2, \T6 - movdqu 16*8(\GDATA), \T1 + movdqu 16*8(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -1066,15 +1095,15 @@ _initial_blocks_done_\@: pshufd $0b01001110, \T3, \T2 pxor \T3, \T2 - movdqu HashKey(\GDATA), \T5 + movdqu HashKey(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \T3 // \T3 = a0*b0 - movdqu HashKey_k(\GDATA), \T5 + movdqu HashKey_k(\GCM_HTAB), \T5 pclmulqdq $0x00, \T5, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T3, \T7 pxor \T1, \T4 // accumulate the results in \T4:\T7, \T6 holds the middle part - movdqu 16*9(\GDATA), \T1 + movdqu 16*9(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -1086,10 +1115,10 @@ _initial_blocks_done_\@: #ifdef GCM128_MODE - movdqu 16*10(\GDATA), \T5 + movdqu 16*10(\KEYSCHED), \T5 #endif #ifdef GCM192_MODE - movdqu 16*10(\GDATA), \T1 + movdqu 16*10(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -1099,7 +1128,7 @@ _initial_blocks_done_\@: aesenc \T1, \XMM7 aesenc \T1, \XMM8 - movdqu 16*11(\GDATA), \T1 + movdqu 16*11(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc 
\T1, \XMM3 @@ -1109,10 +1138,10 @@ _initial_blocks_done_\@: aesenc \T1, \XMM7 aesenc \T1, \XMM8 - movdqu 16*12(\GDATA), \T5 // finish last key round + movdqu 16*12(\KEYSCHED), \T5 // finish last key round #endif #ifdef GCM256_MODE - movdqu 16*10(\GDATA), \T1 + movdqu 16*10(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -1122,7 +1151,7 @@ _initial_blocks_done_\@: aesenc \T1, \XMM7 aesenc \T1, \XMM8 - movdqu 16*11(\GDATA), \T1 + movdqu 16*11(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -1132,7 +1161,7 @@ _initial_blocks_done_\@: aesenc \T1, \XMM7 aesenc \T1, \XMM8 - movdqu 16*12(\GDATA), \T1 + movdqu 16*12(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -1142,7 +1171,7 @@ _initial_blocks_done_\@: aesenc \T1, \XMM7 aesenc \T1, \XMM8 - movdqu 16*13(\GDATA), \T1 + movdqu 16*13(\KEYSCHED), \T1 aesenc \T1, \XMM1 aesenc \T1, \XMM2 aesenc \T1, \XMM3 @@ -1152,7 +1181,7 @@ _initial_blocks_done_\@: aesenc \T1, \XMM7 aesenc \T1, \XMM8 - movdqu 16*14(\GDATA), \T5 // finish last key round + movdqu 16*14(\KEYSCHED), \T5 // finish last key round #endif .altmacro @@ -1242,7 +1271,7 @@ _initial_blocks_done_\@: //////////////////////////////////////////////////////////////////////////////// // GHASH_LAST_8: GHASH the last 8 ciphertext blocks. //////////////////////////////////////////////////////////////////////////////// -.macro GHASH_LAST_8 GDATA, T1, T2, T3, T4, T5, T6, T7, \ +.macro GHASH_LAST_8 GCM_HTAB, T1, T2, T3, T4, T5, T6, T7, \ XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8 @@ -1250,11 +1279,11 @@ _initial_blocks_done_\@: movdqa \XMM1, \T6 pshufd $0b01001110, \XMM1, \T2 pxor \XMM1, \T2 - movdqu HashKey_8(\GDATA), \T5 + movdqu HashKey_8(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T6 // \T6 = a1*b1 pclmulqdq $0x00, \T5, \XMM1 // \XMM1 = a0*b0 - movdqu HashKey_8_k(\GDATA), \T4 + movdqu HashKey_8_k(\GCM_HTAB), \T4 pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) movdqa \XMM1, \T7 @@ -1264,11 +1293,11 @@ _initial_blocks_done_\@: movdqa \XMM2, \T1 pshufd $0b01001110, \XMM2, \T2 pxor \XMM2, \T2 - movdqu HashKey_7(\GDATA), \T5 + movdqu HashKey_7(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \XMM2 // \XMM2 = a0*b0 - movdqu HashKey_7_k(\GDATA), \T4 + movdqu HashKey_7_k(\GCM_HTAB), \T4 pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T6 @@ -1279,11 +1308,11 @@ _initial_blocks_done_\@: movdqa \XMM3, \T1 pshufd $0b01001110, \XMM3, \T2 pxor \XMM3, \T2 - movdqu HashKey_6(\GDATA), \T5 + movdqu HashKey_6(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \XMM3 // \XMM3 = a0*b0 - movdqu HashKey_6_k(\GDATA), \T4 + movdqu HashKey_6_k(\GCM_HTAB), \T4 pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T6 @@ -1294,11 +1323,11 @@ _initial_blocks_done_\@: movdqa \XMM4, \T1 pshufd $0b01001110, \XMM4, \T2 pxor \XMM4, \T2 - movdqu HashKey_5(\GDATA), \T5 + movdqu HashKey_5(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \XMM4 // \XMM4 = a0*b0 - movdqu HashKey_5_k(\GDATA), \T4 + movdqu HashKey_5_k(\GCM_HTAB), \T4 pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T6 @@ -1309,11 +1338,11 @@ _initial_blocks_done_\@: movdqa \XMM5, \T1 pshufd $0b01001110, \XMM5, \T2 pxor \XMM5, \T2 - movdqu HashKey_4(\GDATA), \T5 + movdqu HashKey_4(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \XMM5 // \XMM5 = a0*b0 - movdqu HashKey_4_k(\GDATA), \T4 + movdqu HashKey_4_k(\GCM_HTAB), \T4 pclmulqdq $0x00, \T4, \T2 // 
\T2 = (a1+a0)*(b1+b0) pxor \T1, \T6 @@ -1324,11 +1353,11 @@ _initial_blocks_done_\@: movdqa \XMM6, \T1 pshufd $0b01001110, \XMM6, \T2 pxor \XMM6, \T2 - movdqu HashKey_3(\GDATA), \T5 + movdqu HashKey_3(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \XMM6 // \XMM6 = a0*b0 - movdqu HashKey_3_k(\GDATA), \T4 + movdqu HashKey_3_k(\GCM_HTAB), \T4 pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T6 @@ -1339,11 +1368,11 @@ _initial_blocks_done_\@: movdqa \XMM7, \T1 pshufd $0b01001110, \XMM7, \T2 pxor \XMM7, \T2 - movdqu HashKey_2(\GDATA), \T5 + movdqu HashKey_2(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \XMM7 // \XMM7 = a0*b0 - movdqu HashKey_2_k(\GDATA), \T4 + movdqu HashKey_2_k(\GCM_HTAB), \T4 pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T6 @@ -1355,11 +1384,11 @@ _initial_blocks_done_\@: movdqa \XMM8, \T1 pshufd $0b01001110, \XMM8, \T2 pxor \XMM8, \T2 - movdqu HashKey(\GDATA), \T5 + movdqu HashKey(\GCM_HTAB), \T5 pclmulqdq $0x11, \T5, \T1 // \T1 = a1*b1 pclmulqdq $0x00, \T5, \XMM8 // \XMM8 = a0*b0 - movdqu HashKey_k(\GDATA), \T4 + movdqu HashKey_k(\GCM_HTAB), \T4 pclmulqdq $0x00, \T4, \T2 // \T2 = (a1+a0)*(b1+b0) pxor \T1, \T6 @@ -1414,19 +1443,19 @@ _initial_blocks_done_\@: //////////////////////////////////////////////////////////////////////////////// // ENCRYPT_SINGLE_BLOCK: Encrypt a single block. //////////////////////////////////////////////////////////////////////////////// -.macro ENCRYPT_SINGLE_BLOCK GDATA, ST, T1 +.macro ENCRYPT_SINGLE_BLOCK KEYSCHED, ST, T1 - movdqu 16*0(\GDATA), \T1 + movdqu 16*0(\KEYSCHED), \T1 pxor \T1, \ST .set i, 1 .rept NROUNDS - movdqu 16*i(\GDATA), \T1 + movdqu 16*i(\KEYSCHED), \T1 aesenc \T1, \ST .set i, (i+1) .endr - movdqu 16*i(\GDATA), \T1 + movdqu 16*i(\KEYSCHED), \T1 aesenclast \T1, \ST .endm // ENCRYPT_SINGLE_BLOCK @@ -1437,92 +1466,67 @@ _initial_blocks_done_\@: .macro FUNC_SAVE //// Required for Update/GMC_ENC //the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 - push %rsi - mov %rsp, %r14 + CFI_PUSHQ %r12, -16 + CFI_PUSHQ %r13, -24 + CFI_PUSHQ %r14, -32 + CFI_PUSHQ %r15, -40 + CFI_PUSHQ %rsi, -48 // XXXX Why push %rsi ???? + mov %rsp, %r14 + .cfi_def_cfa_register %r14 sub $(VARIABLE_OFFSET), %rsp and $~63, %rsp -#if __OUTPUT_FORMAT__ == win64 - // xmm6:xmm15 need to be maintained for Windows - movdqu %xmm6, (LOCAL_STORAGE + 0*16)(%rsp) - movdqu %xmm7, (LOCAL_STORAGE + 1*16)(%rsp) - movdqu %xmm8, (LOCAL_STORAGE + 2*16)(%rsp) - movdqu %xmm9, (LOCAL_STORAGE + 3*16)(%rsp) - movdqu %xmm10, (LOCAL_STORAGE + 4*16)(%rsp) - movdqu %xmm11, (LOCAL_STORAGE + 5*16)(%rsp) - movdqu %xmm12, (LOCAL_STORAGE + 6*16)(%rsp) - movdqu %xmm13, (LOCAL_STORAGE + 7*16)(%rsp) - movdqu %xmm14, (LOCAL_STORAGE + 8*16)(%rsp) - movdqu %xmm15, (LOCAL_STORAGE + 9*16)(%rsp) - - mov arg(5), arg5 // XXXX [r14 + STACK_OFFSET + 8*5] -#endif .endm // FUNC_SAVE //////////////////////////////////////////////////////////////////////////////// // FUNC_RESTORE: Restore clobbered regs from the stack. 
//////////////////////////////////////////////////////////////////////////////// .macro FUNC_RESTORE - -#if __OUTPUT_FORMAT__ == win64 - movdqu (LOCAL_STORAGE + 9*16)(%rsp), %xmm15 - movdqu (LOCAL_STORAGE + 8*16)(%rsp), %xmm14 - movdqu (LOCAL_STORAGE + 7*16)(%rsp), %xmm13 - movdqu (LOCAL_STORAGE + 6*16)(%rsp), %xmm12 - movdqu (LOCAL_STORAGE + 5*16)(%rsp), %xmm11 - movdqu (LOCAL_STORAGE + 4*16)(%rsp), %xmm10 - movdqu (LOCAL_STORAGE + 3*16)(%rsp), %xmm9 - movdqu (LOCAL_STORAGE + 2*16)(%rsp), %xmm8 - movdqu (LOCAL_STORAGE + 1*16)(%rsp), %xmm7 - movdqu (LOCAL_STORAGE + 0*16)(%rsp), %xmm6 -#endif - // Required for Update/GMC_ENC - mov %r14, %rsp - pop %rsi - pop %r15 - pop %r14 - pop %r13 - pop %r12 + mov %r14, %rsp + .cfi_def_cfa_register %rsp + CFI_POPQ %rsi + CFI_POPQ %r15 + CFI_POPQ %r14 + CFI_POPQ %r13 + CFI_POPQ %r12 .endm // FUNC_RESTORE //////////////////////////////////////////////////////////////////////////////// // GCM_INIT: Initializes a gcm_context_data struct to prepare for // encoding/decoding. -// Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, +// Input: gcm_ctx->gcm_Htable *(GCM_HTAB), gcm_ctx_t *(GDATA_CTX), IV, // Additional Authentication data (A_IN), Additional Data length (A_LEN). // Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized -// other parts of GDATA. +// other parts of GDATA_CTX. // Clobbers rax, r10-r13 and xmm0-xmm6 //////////////////////////////////////////////////////////////////////////////// -.macro GCM_INIT GDATA_KEY, GDATA_CTX, IV, A_IN, A_LEN +.macro GCM_INIT GCM_HTAB, GDATA_CTX, IV, A_IN, A_LEN, TAG_LEN #define AAD_HASH %xmm0 #define SUBHASH %xmm1 - movdqu HashKey(\GDATA_KEY), SUBHASH + movdqu HashKey(\GCM_HTAB), SUBHASH CALC_AAD_HASH \A_IN, \A_LEN, AAD_HASH, SUBHASH, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %r10, %r11, %r12, %r13, %rax pxor %xmm3, %xmm2 - mov \A_LEN, %r10 + movq \A_LEN, %r10 // %r10 = AAD length - movdqu AAD_HASH, AadHash(\GDATA_CTX) // ctx_data.aad hash = aad_hash - mov %r10, AadLen(\GDATA_CTX) // ctx_data.aad_length = aad_length + movdqu AAD_HASH, AadHash(\GDATA_CTX) // gcm_ctx.gcm_ghash = aad_hash + movq %r10, AadLen(\GDATA_CTX) // gcm_ctx->gcm_len_a_len_c[0] = aad_length + movq \TAG_LEN, %r10 // %r10 = aad_tag_len + movq %r10, TagLen(\GDATA_CTX) // gcm_ctx->gcm_tag_len = aad_tag_len xor %r10, %r10 - mov %r10, InLen(\GDATA_CTX) // ctx_data.in_length = 0 - mov %r10, PBlockLen(\GDATA_CTX) // ctx_data.partial_block_length = 0 - movdqu %xmm2, PBlockEncKey(\GDATA_CTX) // ctx_data.partial_block_enc_key = 0 - mov \IV, %r10 - movdqa ONEf(%rip), %xmm2 // read 12 IV bytes and pad with 0x00000001 + movq %r10, InLen(\GDATA_CTX) // gcm_ctx.gcm_processed_data_len = 0 + movq %r10, PBlockLen(\GDATA_CTX) // gcm_ctx.gcm_remainder_len = 0 + movdqu %xmm2, PBlockEncKey(\GDATA_CTX) // XXXX last counter block ???? gcm_ctx.gcm_remainder = 0 + movq \IV, %r10 + movdqa ONEf(%rip), %xmm2 // read 12 IV bytes and pad with 0x00000001 pinsrq $0, (%r10), %xmm2 pinsrd $2, 8(%r10), %xmm2 - movdqu %xmm2, OrigIV(\GDATA_CTX) // ctx_data.orig_IV = iv + movdqu %xmm2, OrigIV(\GDATA_CTX) // gcm_ctx.gcm_J0 = CTR0 pshufb SHUF_MASK(%rip), %xmm2 @@ -1535,15 +1539,15 @@ _initial_blocks_done_\@: // gcm_context_data struct has been initialized by GCM_INIT. // Requires the input data be at least 1 byte long because of // READ_SMALL_INPUT_DATA. 
-// Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), +// Input: gcm_key_data * (KEYSCHED, GCM_HTAB), gcm_context_data (GDATA_CTX), // input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN) and whether // encoding or decoding (ENC_DEC). // Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated // GDATA_CTX // Clobbers rax, r10-r15, and xmm0-xmm15 //////////////////////////////////////////////////////////////////////////////// -.macro GCM_ENC_DEC GDATA_KEY, GDATA_CTX, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, \ - PLAIN_CYPH_LEN, ENC_DEC +.macro GCM_ENC_DEC KEYSCHED, GCM_HTAB, GDATA_CTX, CYPH_PLAIN_OUT, \ + PLAIN_CYPH_IN, PLAIN_CYPH_LEN, ENC_DEC #define DATA_OFFSET %r11 @@ -1567,11 +1571,10 @@ _initial_blocks_done_\@: xor DATA_OFFSET, DATA_OFFSET add \PLAIN_CYPH_LEN, InLen(\GDATA_CTX) //Update length of data processed - movdqu HashKey(\GDATA_KEY), %xmm13 // xmm13 = HashKey + movdqu HashKey(\GCM_HTAB), %xmm13 // xmm13 = HashKey movdqu AadHash(\GDATA_CTX), %xmm8 - - PARTIAL_BLOCK \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, \PLAIN_CYPH_LEN, DATA_OFFSET, %xmm8, \ENC_DEC + PARTIAL_BLOCK \GCM_HTAB, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, \PLAIN_CYPH_LEN, DATA_OFFSET, %xmm8, \ENC_DEC mov \PLAIN_CYPH_LEN, %r13 // save the number of bytes of plaintext/ciphertext sub DATA_OFFSET, %r13 @@ -1600,42 +1603,42 @@ _initial_blocks_done_\@: jmp _initial_num_blocks_is_1_\@ _initial_num_blocks_is_7_\@: - INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + INITIAL_BLOCKS \KEYSCHED, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $(16*7), %r13 jmp _initial_blocks_encrypted_\@ _initial_num_blocks_is_6_\@: - INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + INITIAL_BLOCKS \KEYSCHED, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $(16*6), %r13 jmp _initial_blocks_encrypted_\@ _initial_num_blocks_is_5_\@: - INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + INITIAL_BLOCKS \KEYSCHED, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $(16*5), %r13 jmp _initial_blocks_encrypted_\@ _initial_num_blocks_is_4_\@: - INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + INITIAL_BLOCKS \KEYSCHED, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $(16*4), %r13 jmp _initial_blocks_encrypted_\@ _initial_num_blocks_is_3_\@: - 
INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + INITIAL_BLOCKS \KEYSCHED, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $(16*3), %r13 jmp _initial_blocks_encrypted_\@ _initial_num_blocks_is_2_\@: - INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + INITIAL_BLOCKS \KEYSCHED, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $(16*2), %r13 jmp _initial_blocks_encrypted_\@ _initial_num_blocks_is_1_\@: - INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + INITIAL_BLOCKS \KEYSCHED, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $(16*1), %r13 jmp _initial_blocks_encrypted_\@ _initial_num_blocks_is_0_\@: - INITIAL_BLOCKS \GDATA_KEY, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + INITIAL_BLOCKS \KEYSCHED, \GDATA_CTX, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, %r13, DATA_OFFSET, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC _initial_blocks_encrypted_\@: cmp $0, %r13 @@ -1654,7 +1657,7 @@ _encrypt_by_8_new_\@: jg _encrypt_by_8_\@ add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL \GDATA_KEY, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, DATA_OFFSET, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + GHASH_8_ENCRYPT_8_PARALLEL \KEYSCHED, \GCM_HTAB, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, DATA_OFFSET, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC add $128, DATA_OFFSET sub $128, %r13 jne _encrypt_by_8_new_\@ @@ -1666,7 +1669,7 @@ _encrypt_by_8_\@: pshufb SHUF_MASK(%rip), %xmm9 add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL \GDATA_KEY, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, DATA_OFFSET, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + GHASH_8_ENCRYPT_8_PARALLEL \KEYSCHED, \GCM_HTAB, \CYPH_PLAIN_OUT, \PLAIN_CYPH_IN, DATA_OFFSET, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC pshufb SHUF_MASK(%rip), %xmm9 add $128, DATA_OFFSET sub $128, %r13 @@ -1677,12 +1680,12 @@ _encrypt_by_8_\@: _eight_cipher_left_\@: - GHASH_LAST_8 \GDATA_KEY, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + GHASH_LAST_8 \GCM_HTAB, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, 
%xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 _zero_cipher_left_\@: - movdqu %xmm14, AadHash(\GDATA_CTX) - movdqu %xmm9, CurCount(\GDATA_CTX) + movdqu %xmm14, AadHash(\GDATA_CTX) + movdqu %xmm9, CurCount(\GDATA_CTX) mov %r10, %r13 and $15, %r13 // r13 = (\PLAIN_CYPH_LEN mod 16) @@ -1695,7 +1698,7 @@ _zero_cipher_left_\@: paddd ONE(%rip), %xmm9 // INCR CNT to get Yn movdqu %xmm9, CurCount(\GDATA_CTX) // my_ctx.data.current_counter = xmm9 pshufb SHUF_MASK(%rip), %xmm9 - ENCRYPT_SINGLE_BLOCK \GDATA_KEY, %xmm9, %xmm2 // E(K, Yn) + ENCRYPT_SINGLE_BLOCK \KEYSCHED, %xmm9, %xmm2 // E(K, Yn) movdqu %xmm9, PBlockEncKey(\GDATA_CTX) // my_ctx_data.partial_block_enc_key = xmm9 cmp $16, \PLAIN_CYPH_LEN @@ -1774,13 +1777,12 @@ _multiple_of_16_bytes_\@: //////////////////////////////////////////////////////////////////////////////// // GCM_COMPLETE: Finishes Encyrption/Decryption of last partial block after // GCM_UPDATE finishes. -// Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and -// whether encoding or decoding (ENC_DEC). -// Output: Authorization Tag (AUTH_TAG) and Authorization Tag length -// (AUTH_TAG_LEN) +// Input: A gcm_key_data * (KEYSCHED, GCM_HTAB), gcm_context_data * (GDATA_CTX) +// and whether encoding or decoding (ENC_DEC). +// Output: Authorization Tag (AUTH_TAG) stored in gcm_ctx.gcm_ghash // Clobbers %rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 //////////////////////////////////////////////////////////////////////////////// -.macro GCM_COMPLETE GDATA_KEY, GDATA_CTX, AUTH_TAG, AUTH_TAG_LEN, ENC_DEC +.macro GCM_COMPLETE KEYSCHED, GCM_HTAB, GDATA_CTX, ENC_DEC #define PLAIN_CYPH_LEN %rax @@ -1789,7 +1791,7 @@ _multiple_of_16_bytes_\@: mov PBlockLen(\GDATA_CTX), %r12 // r12 = aadLen (number of bytes) movdqu AadHash(\GDATA_CTX), %xmm14 - movdqu HashKey(\GDATA_KEY), %xmm13 + movdqu HashKey(\GCM_HTAB), %xmm13 cmp $0, %r12 @@ -1803,26 +1805,32 @@ _partial_done_\@: mov AadLen(\GDATA_CTX), %r12 // r12 = aadLen (number of bytes) mov InLen(\GDATA_CTX), PLAIN_CYPH_LEN - shl $3, %r12 // convert into number of bits + shl $3, %r12 // convert into number of bits movd %r12d, %xmm15 // len(A) in xmm15 shl $3, PLAIN_CYPH_LEN // len(C) in bits (*128) movq PLAIN_CYPH_LEN, %xmm1 pslldq $8, %xmm15 // xmm15 = len(A)|| 0x0000000000000000 pxor %xmm1, %xmm15 // xmm15 = len(A)||len(C) - +#ifdef DEBUG + pshufb SHUF_MASK(%rip), %xmm15 // perform a 16Byte swap + movdqu %xmm15, LenALenC(\GDATA_CTX) + pshufb SHUF_MASK(%rip), %xmm15 // undo 16Byte swap +#endif pxor %xmm15, %xmm14 GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 // final GHASH computation pshufb SHUF_MASK(%rip), %xmm14 // perform a 16Byte swap movdqu OrigIV(\GDATA_CTX), %xmm9 // xmm9 = Y0 - ENCRYPT_SINGLE_BLOCK \GDATA_KEY, %xmm9, %xmm2 // E(K, Y0) + ENCRYPT_SINGLE_BLOCK \KEYSCHED, %xmm9, %xmm2 // E(K, Y0) pxor %xmm14, %xmm9 _return_T_\@: - mov \AUTH_TAG, %r10 // r10 = authTag - mov \AUTH_TAG_LEN, %r11 // r11 = auth_tag_len + // mov \AUTH_TAG, %r10 // r10 = authTag + // mov \AUTH_TAG_LEN, %r11 // r11 = auth_tag_len + lea AadHash(\GDATA_CTX), %r10 // r10 = authTag + movq TagLen(\GDATA_CTX), %r11 // r11 = auth_tag_len cmp $16, %r11 je _T_16_\@ @@ -1830,14 +1838,14 @@ _return_T_\@: cmp $12, %r11 je _T_12_\@ -_T_8_\@: - movq %xmm9, %rax - mov %rax, (%r10) +_T_8_\@: // XXXX: Why use intermediate reg %rax/%eax? 
+ movq %xmm9, %rax // %rax (ret val) contains 8 bytes tag + movq %rax, (%r10) jmp _return_T_done_\@ _T_12_\@: - movq %xmm9, %rax - mov %rax, (%r10) + movq %xmm9, %rax // %rax (ret val) contains upper and lower 4 bytes of tag + movq %rax, (%r10) psrldq $8, %xmm9 movd %xmm9, %eax mov %eax, 8(%r10) @@ -1850,37 +1858,34 @@ _return_T_done_\@: .endm //GCM_COMPLETE -#if 1 - - .balign 16 //////////////////////////////////////////////////////////////////////////////// -//void aes_gcm_precomp_{128,256}_sse -// (struct gcm_key_data *key_data); +// void icp_isalc_gcm_precomp_{128,192,256}_sse( +// gcm_ctx_t *context_data /* arg1 */ +// ); //////////////////////////////////////////////////////////////////////////////// #if FUNCT_EXTENSION != _nt -.global FN_NAME(precomp,_) -FN_NAME(precomp,_): - endbranch +ENTRY_NP(FN_NAME(precomp,_)) +.cfi_startproc + ENDBR - push %r12 - push %r13 - push %r14 - push %r15 - - mov %rsp, %r14 + CFI_PUSHQ %r12, -16 + CFI_PUSHQ %r13, -24 + CFI_PUSHQ %r14, -32 + CFI_PUSHQ %r15, -40 + mov %rsp, %r14 + .cfi_def_cfa_register %r14 sub $(VARIABLE_OFFSET), %rsp - and $(~63), %rsp // align rsp to 64 bytes - -#if __OUTPUT_FORMAT__ == win64 - // only xmm6 needs to be maintained - movdqu %xmm6, (LOCAL_STORAGE + 0*16)(%rsp) -#endif + and $(~63), %rsp // align rsp to 64 bytes + mov KeySched(arg1), arg2 // arg2 = gcm_ctx->gcm_keysched + mov GcmHtab(arg1), arg3 // arg3 = gcm_ctx->gcm_Htable pxor %xmm6, %xmm6 - ENCRYPT_SINGLE_BLOCK arg1, %xmm6, %xmm2 // xmm6 = HashKey - + ENCRYPT_SINGLE_BLOCK arg2, %xmm6, %xmm2 // xmm6 = HashKey +#ifdef DEBUG + movdqu %xmm6, GcmH(arg1) // Save hash key to context. +#endif pshufb SHUF_MASK(%rip), %xmm6 /////////////// PRECOMPUTATION of HashKey<<1 mod poly from the HashKey movdqa %xmm6, %xmm2 @@ -1897,254 +1902,218 @@ FN_NAME(precomp,_): pand POLY(%rip), %xmm2 pxor %xmm2, %xmm6 // xmm6 holds the HashKey<<1 mod poly /////////////////////////////////////////////////////////////////////// - movdqu %xmm6, HashKey(arg1) // store HashKey<<1 mod poly + movdqu %xmm6, HashKey(arg3) // store HashKey<<1 mod poly - PRECOMPUTE arg1, %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + PRECOMPUTE arg3, %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 -#if __OUTPUT_FORMAT__ == win64 - movdqu (LOCAL_STORAGE + 0*16)(%rsp), %xmm6 -#endif - mov %r14, %rsp + mov %r14, %rsp + .cfi_def_cfa_register %rsp + CFI_POPQ %r15 + CFI_POPQ %r14 + CFI_POPQ %r13 + CFI_POPQ %r12 + RET +.cfi_endproc +SET_SIZE(FN_NAME(precomp,_)) - pop %r15 - pop %r14 - pop %r13 - pop %r12 - ret #endif // _nt //////////////////////////////////////////////////////////////////////////////// -//void aes_gcm_init_128_sse / aes_gcm_init_256_sse ( -// const struct gcm_key_data *key_data, -// struct gcm_context_data *context_data, -// u8 *iv, -// const u8 *aad, -// u64 aad_len); +// void icp_isalc_gcm_init_{128,192,256}_sse +// gcm_ctx_t *context_data, /* arg1 */ +// const uint8_t *iv, /* arg2 */ +// const uint8_t *aad, /* arg3 */ +// uint64_t aad_len /* arg4 */ +// uint64_t tag_len /* arg5 */ +// ); //////////////////////////////////////////////////////////////////////////////// #if FUNCT_EXTENSION != _nt -.global FN_NAME(init,_) -FN_NAME(init,_): - endbranch - - push %r12 - push %r13 -#if __OUTPUT_FORMAT__ == win64 - push arg5 - sub $(1*16), %rsp - movdqu %xmm6, (0*16)(%rsp) - mov (1*16 + 8*3 + 8*5)(%rsp), arg5 -#endif +ENTRY_NP(FN_NAME(init,_)) +.cfi_startproc + ENDBR - GCM_INIT arg1, arg2, arg3, arg4, arg5 + CFI_PUSHQ %r12, -16 + CFI_PUSHQ %r13, -24 -#if __OUTPUT_FORMAT__ == win64 - movdqu (0*16)(%rsp), %xmm6 - add 
$(1*16), %rsp - pop arg5 -#endif - pop %r13 - pop %r12 - ret -#endif // _nt + mov GcmHtab(arg1), arg6 // arg5 = gcm_ctx->gcm_Htable + GCM_INIT arg6, arg1, arg2, arg3, arg4, arg5 + CFI_POPQ %r13 + CFI_POPQ %r12 + RET +.cfi_endproc +SET_SIZE(FN_NAME(init,_)) +#endif // _nt //////////////////////////////////////////////////////////////////////////////// -//void aes_gcm_enc_128_update_sse / aes_gcm_enc_256_update_sse -// const struct gcm_key_data *key_data, -// struct gcm_context_data *context_data, -// u8 *out, -// const u8 *in, -// u64 plaintext_len); +// void icp_isalc_gcm_enc_{128,192,256}_update_sse( +// gcm_ctx_t *context_data, /* arg1 */ +// uint8_t *out, /* arg2 */ +// const uint8_t *in, /* arg3 */ +// uint64_t plaintext_len /* arg4 */ +// ); //////////////////////////////////////////////////////////////////////////////// -.global FN_NAME(enc,_update_) -FN_NAME(enc,_update_): - endbranch +ENTRY_NP(FN_NAME(enc,_update_)) +.cfi_startproc + ENDBR FUNC_SAVE - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + movq KeySched(arg1), arg5 // arg5 = gcm_ctx->gcm_keysched + movq GcmHtab(arg1), arg6 // arg6 = gcm_ctx->gcm_Htable - FUNC_RESTORE + GCM_ENC_DEC arg5, arg6, arg1, arg2, arg3, arg4, ENC - ret + FUNC_RESTORE + RET +.cfi_endproc +SET_SIZE(FN_NAME(enc,_update_)) //////////////////////////////////////////////////////////////////////////////// -//void aes_gcm_dec_256_update_sse / aes_gcm_dec_256_update_sse -// const struct gcm_key_data *key_data, -// struct gcm_context_data *context_data, -// u8 *out, -// const u8 *in, -// u64 plaintext_len); +// void icp_isalc_gcm_dec_{128,192,256}_update_sse( +// gcm_ctx_t *context_data, /* arg1 */ +// uint8_t *out, /* arg2 */ +// const uint8_t *in, /* arg3 */ +// uint64_t plaintext_len /* arg4 */ +// ); //////////////////////////////////////////////////////////////////////////////// -.global FN_NAME(dec,_update_) -FN_NAME(dec,_update_): - endbranch +ENTRY_NP(FN_NAME(dec,_update_)) +.cfi_startproc + ENDBR FUNC_SAVE - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + mov KeySched(arg1), arg5 // arg5 = gcm_ctx->gcm_keysched + mov GcmHtab(arg1), arg6 // arg6 = gcm_ctx->gcm_Htable + + GCM_ENC_DEC arg5, arg6, arg1, arg2, arg3, arg4, DEC FUNC_RESTORE - ret + RET +.cfi_endproc +SET_SIZE(FN_NAME(dec,_update_)) //////////////////////////////////////////////////////////////////////////////// -//void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_256_finalize_sse -// const struct gcm_key_data *key_data, -// struct gcm_context_data *context_data, -// u8 *auth_tag, -// u64 auth_tag_len); +// void icp_isalc_gcm_enc_{128,192,256}_finalize_sse( +// gcm_ctx_t *context_data, /* arg1 */ +// ); //////////////////////////////////////////////////////////////////////////////// #if FUNCT_EXTENSION != _nt -.global FN_NAME(enc,_finalize_) -FN_NAME(enc,_finalize_): +ENTRY_NP(FN_NAME(enc,_finalize_)) +.cfi_startproc + ENDBR - endbranch + CFI_PUSHQ %r12, -16 - push %r12 + movq KeySched(arg1), arg2 // arg4 = gcm_ctx->gcm_keysched + movq GcmHtab(arg1), arg3 // arg5 = gcm_ctx->gcm_Htable -#if __OUTPUT_FORMAT__ == win64 - // xmm6:xmm15 need to be maintained for Windows - sub $(5*16), %rsp - movdqu %xmm6, (0*16)(%rsp) - movdqu %xmm9, (1*16)(%rsp) - movdqu %xmm11, (2*16)(%rsp) - movdqu %xmm14, (3*16)(%rsp) - movdqu %xmm15, (4*16)(%rsp) -#endif - GCM_COMPLETE arg1, arg2, arg3, arg4, ENC - -#if __OUTPUT_FORMAT__ == win64 - movdqu (4*16)(%rsp), %xmm15 - movdqu (3*16)(%rsp), %xmm14 - movdqu (2*16)(%rsp), %xmm11 - movdqu (1*16)(%rsp), %xmm9 - movdqu (0*16)(%rsp), %xmm6 - add $(5*16), %rsp -#endif + 
GCM_COMPLETE arg2, arg3, arg1, ENC - pop %r12 - ret + CFI_POPQ %r12 + RET +.cfi_endproc +SET_SIZE(FN_NAME(enc,_finalize_)) #endif // _nt //////////////////////////////////////////////////////////////////////////////// -//void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_256_finalize_sse -// const struct gcm_key_data *key_data, -// struct gcm_context_data *context_data, -// u8 *auth_tag, -// u64 auth_tag_len); +// void icp_isalc_gcm_dec_{128,129,256}_finalize_sse( +// gcm_ctx_t *context_data, /* arg1 */ +// ); //////////////////////////////////////////////////////////////////////////////// #if FUNCT_EXTENSION != _nt -.global FN_NAME(dec,_finalize_) -FN_NAME(dec,_finalize_): +ENTRY_NP(FN_NAME(dec,_finalize_)) +.cfi_startproc + ENDBR - endbranch + CFI_PUSHQ %r12, -16 - push %r12 + movq KeySched(arg1), arg2 // arg4 = gcm_ctx->gcm_keysched + movq GcmHtab(arg1), arg3 // arg5 = gcm_ctx->gcm_Htable -#if __OUTPUT_FORMAT == win64 - // xmm6:xmm15 need to be maintained for Windows - sub $(5*16), %rsp - movdqu %xmm6, (0*16)(%rsp) - movdqu %xmm9, (1*16)(%rsp) - movdqu %xmm11, (2*16)(%rsp) - movdqu %xmm14, (3*16)(%rsp) - movdqu %xmm15, (4*16)(%rsp) -#endif - GCM_COMPLETE arg1, arg2, arg3, arg4, DEC - -#if __OUTPUT_FORMAT__ == win64 - movdqu (4*16)(%rsp), %xmm15 - movdqu (3*16)(%rsp), %xmm14 - movdqu (2*16)(%rsp), %xmm11 - movdqu (1*16)(%rsp), %xmm9 - movdqu (0*16)(%rsp), %xmm6 - add $(5*16), %rsp -#endif + GCM_COMPLETE arg2, arg3, arg1, DEC - pop %r12 - ret + CFI_POPQ %r12 + RET +.cfi_endproc +SET_SIZE(FN_NAME(dec,_finalize_)) #endif // _nt - //////////////////////////////////////////////////////////////////////////////// -//void aes_gcm_enc_128_sse / aes_gcm_enc_256_sse -// const struct gcm_key_data *key_data, -// struct gcm_context_data *context_data, -// u8 *out, -// const u8 *in, -// u64 plaintext_len, -// u8 *iv, -// const u8 *aad, -// u64 aad_len, -// u8 *auth_tag, -// u64 auth_tag_len)// +// void icp_isalc_gcm_enc_{128,192,256}_sse( +// gcm_ctx_t *context_data, /* arg1 */ +// uint8_t *out, /* arg2 */ +// const uint8_t *in, /* arg3 */ +// uint64_t plaintext_len, /* arg4 */ +// const uint8_t *iv, /* arg5 */ +// const uint8_t *aad, /* arg6 */ +// uint64_t aad_len, /* arg7 */ +// uint64_t tag_len, /* arg8 */ +// ); //////////////////////////////////////////////////////////////////////////////// -.global FN_NAME(enc,_) -FN_NAME(enc,_): - endbranch +ENTRY_NP(FN_NAME(enc,_)) +.cfi_startproc + ENDBR FUNC_SAVE - GCM_INIT arg1, arg2, arg6, arg7, arg8 + pushq arg2 + movq GcmHtab(arg1), arg2 // arg2 = gcm_ctx->gcm_Htable - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + GCM_INIT arg2, arg1, arg5, arg6, arg7, arg8 + + popq arg2 + mov KeySched(arg1), arg5 // arg5 = gcm_ctx->gcm_keysched + mov GcmHtab(arg1), arg6 // arg6 = gcm_ctx->gcm_Htable + + GCM_ENC_DEC arg5, arg6, arg1, arg2, arg3, arg4, ENC + GCM_COMPLETE arg5, arg6, arg1, ENC - GCM_COMPLETE arg1, arg2, arg9, arg10, ENC FUNC_RESTORE - ret + RET +.cfi_endproc +SET_SIZE(FN_NAME(enc,_)) //////////////////////////////////////////////////////////////////////////////// -//void aes_gcm_dec_128_sse / aes_gcm_dec_256_sse -// const struct gcm_key_data *key_data, -// struct gcm_context_data *context_data, -// u8 *out, -// const u8 *in, -// u64 plaintext_len, -// u8 *iv, -// const u8 *aad, -// u64 aad_len, -// u8 *auth_tag, -// u64 auth_tag_len)// +// void icp_isalc_gcm_dec_{128,192,256}_sse( +// gcm_ctx_t *context_data, /* arg1 */ +// u8 *out, /* arg2 */ +// const u8 *in, /* arg3 */ +// u64 plaintext_len, /* arg4 */ +// u8 *iv, /* arg5 */ +// const u8 *aad, /* arg6 */ 
+// u64 aad_len, /* arg7 */ +// u64 tag_len, /* arg8 */ +// ); //////////////////////////////////////////////////////////////////////////////// -.global FN_NAME(dec,_) -FN_NAME(dec,_): - endbranch +ENTRY_NP(FN_NAME(dec,_)) +.cfi_startproc + ENDBR FUNC_SAVE - GCM_INIT arg1, arg2, arg6, arg7, arg8 + pushq arg2 + movq GcmHtab(arg1), arg2 // arg2 = gcm_ctx->gcm_Htable - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + GCM_INIT arg2, arg1, arg5, arg6, arg7, arg8 - GCM_COMPLETE arg1, arg2, arg9, arg10, DEC - FUNC_RESTORE + popq arg2 + mov KeySched(arg1), arg5 // arg5 = gcm_ctx->gcm_keysched + mov GcmHtab(arg1), arg6 // arg6 = gcm_ctx->gcm_Htable - ret + GCM_ENC_DEC arg5, arg6, arg1, arg2, arg3, arg4, DEC + GCM_COMPLETE arg5, arg6, arg1, DEC -.global FN_NAME(this_is_gas,_) -FN_NAME(this_is_gas,_): - endbranch - FUNC_SAVE FUNC_RESTORE - ret -#else - // GAS doesnt't provide the linenuber in the macro - //////////////////////// - // GHASH_MUL xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 - // PRECOMPUTE rax, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 - // READ_SMALL_DATA_INPUT xmm1, r10, 8, rax, r12, r15 - // ENCRYPT_SINGLE_BLOCK rax, xmm0, xmm1 - // INITIAL_BLOCKS rdi,rsi,rdx,rcx,r13,r11,7,xmm12,xmm13,xmm14,xmm15,xmm11,xmm9,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8,xmm10,xmm0,ENC - // CALC_AAD_HASH [r14+8*5+8*1],[r14+8*5+8*2],xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,r10,r11,r12,r13,rax - // READ_SMALL_DATA_INPUT xmm2, r10, r11, r12, r13, rax - // PARTIAL_BLOCK rdi,rsi,rdx,rcx,r8,r11,xmm8,ENC - // GHASH_8_ENCRYPT_8_PARALLEL rdi,rdx,rcx,r11,xmm0,xmm10,xmm11,xmm12,xmm13,xmm14,xmm9,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8,xmm15,out_order,ENC - //GHASH_LAST_8 rdi,xmm0,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8 -#endif + RET +.cfi_endproc +SET_SIZE(FN_NAME(dec,_)) + +// -eof- diff --git a/module/icp/asm-x86_64/modes/isalc_reg_sizes.S b/module/icp/asm-x86_64/modes/isalc_reg_sizes.S index d77291ce58a1..3475264d2e78 100644 --- a/module/icp/asm-x86_64/modes/isalc_reg_sizes.S +++ b/module/icp/asm-x86_64/modes/isalc_reg_sizes.S @@ -1,4 +1,4 @@ -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // Copyright(c) 2011-2019 Intel Corporation All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -25,7 +25,10 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +// Port to GNU as, translation to GNU as att-syntax and adoptions for the ICP +// Copyright(c) 2023 Attila Fülöp #ifndef _REG_SIZES_ASM_ #define _REG_SIZES_ASM_ @@ -204,12 +207,6 @@ #endif -#ifdef __x86_64__ -#define endbranch .byte 0xf3, 0x0f, 0x1e, 0xfa -#else -#define endbranch .byte 0xf3, 0x0f, 0x1e, 0xfb -#endif - #ifdef REL_TEXT #define WRT_OPT #elif __OUTPUT_FORMAT__ == elf64 diff --git a/module/icp/include/modes/modes.h b/module/icp/include/modes/modes.h index 23bf46ab51a0..81e66e178896 100644 --- a/module/icp/include/modes/modes.h +++ b/module/icp/include/modes/modes.h @@ -36,14 +36,28 @@ extern "C" { /* * Does the build chain support all instructions needed for the GCM assembler - * routines. AVX support should imply AES-NI and PCLMULQDQ, but make sure - * anyhow. + * routines. */ -#if defined(__x86_64__) && defined(HAVE_AVX) && \ - defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) +#if defined(__x86_64__) && defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) +/* XXXX: does AES + PCLMULQDQ really imply at least SSE4_1? */ #define CAN_USE_GCM_ASM + +#ifdef DEBUG +/* Defines this to the gcm_simd_impl_t to debug. */ +#define DEBUG_GCM_ASM GSI_ISALC_SSE +#endif +#if defined(HAVE_SSE4_1) +#define CAN_USE_GCM_ASM_SSE +#endif +#if defined(HAVE_AVX) +#define CAN_USE_GCM_ASM_AVX extern boolean_t gcm_avx_can_use_movbe; #endif +#if defined(HAVE_AVX2) +#define CAN_USE_GCM_ASM_AVX2 +#endif +/* TODO: Add VAES/AVX512 */ +#endif /* defined(__x86_64__) && defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) */ #define ECB_MODE 0x00000002 #define CBC_MODE 0x00000004 @@ -183,6 +197,35 @@ typedef struct ccm_ctx { #define ccm_copy_to ccm_common.cc_copy_to #define ccm_flags ccm_common.cc_flags +#if defined(CAN_USE_GCM_ASM) +/* + * enum gcm_simd_impl holds the types of the implemented gcm asm routines for + * the various x86 SIMD extensions. Please note that other parts of the code + * depends on the order given below, so do not change the order and append new + * implementations at the end, but before GSI_NUM_IMPL. + */ +typedef enum gcm_simd_impl { + GSI_NONE, + GSI_OSSL_AVX, + GSI_ISALC_SSE, + GSI_NUM_IMPL +} gcm_simd_impl_t; + +#define GSI_ISALC_FIRST_IMPL ((int)GSI_ISALC_SSE) +#define GSI_ISALC_LAST_IMPL ((int)GSI_ISALC_SSE) + +/* + * XXXX: Serves as a template to remind us what to do if adding an isalc impl + * #ifdef CAN_USE_GCM_ASM_AVX2 + * #undef GSI_ISALC_LAST_IMPL + * #define GSI_ISALC_LAST_IMPL ((int)GSI_ISALC_AVX2) + * #endif + */ + +#define GSI_ISALC_NUM_IMPL (GSI_ISALC_LAST_IMPL - GSI_ISALC_FIRST_IMPL +1) + +#endif /* if defined(CAN_USE_GCM_ASM) */ + /* * gcm_tag_len: Length of authentication tag. * @@ -228,7 +271,11 @@ typedef struct gcm_ctx { uint64_t gcm_len_a_len_c[2]; uint8_t *gcm_pt_buf; #ifdef CAN_USE_GCM_ASM - boolean_t gcm_use_avx; + gcm_simd_impl_t gcm_simd_impl; +#ifdef DEBUG_GCM_ASM + struct gcm_ctx *gcm_shadow_ctx; + boolean_t gcm_is_shadow; +#endif #endif } gcm_ctx_t; diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c index d6f01304f56b..0e146a53d522 100644 --- a/module/icp/io/aes.c +++ b/module/icp/io/aes.c @@ -1095,7 +1095,6 @@ aes_decrypt_atomic(crypto_mechanism_t *mechanism, } else if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE)) { gcm_clear_ctx((gcm_ctx_t *)&aes_ctx); } - return (ret); }
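Taken together, the hunks above move the routines from isa-l_crypto's (key_data, context_data, ..., auth_tag, auth_tag_len) calling convention to a single gcm_ctx_t argument that carries the key schedule (gcm_keysched), the hash-key table (gcm_Htable), the tag length and, after finalize, the tag itself (gcm_ghash). The following caller-side sketch only restates the prototypes spelled out in the comment blocks above the ENTRY_NP()s; the one-shot wrapper, its name, the forward-declared gcm_ctx_t and the 12-byte-IV/16-byte-tag choices are illustrative assumptions, not part of this patch.

/*
 * Illustration only -- not part of this patch. Prototypes mirror the
 * comment blocks in isalc_gcm_sse.S; the wrapper and its parameters are
 * assumptions.
 */
#include <stdint.h>

typedef struct gcm_ctx gcm_ctx_t;	/* as declared in modes/modes.h */

extern void icp_isalc_gcm_precomp_128_sse(gcm_ctx_t *ctx);
extern void icp_isalc_gcm_init_128_sse(gcm_ctx_t *ctx, const uint8_t *iv,
    const uint8_t *aad, uint64_t aad_len, uint64_t tag_len);
extern void icp_isalc_gcm_enc_128_update_sse(gcm_ctx_t *ctx, uint8_t *out,
    const uint8_t *in, uint64_t plaintext_len);
extern void icp_isalc_gcm_enc_128_finalize_sse(gcm_ctx_t *ctx);

static void
isalc_gcm128_encrypt_one_shot(gcm_ctx_t *ctx, const uint8_t iv[12],
    const uint8_t *aad, uint64_t aad_len,
    const uint8_t *pt, uint64_t pt_len, uint8_t *ct)
{
	/*
	 * ctx->gcm_keysched must already hold the expanded AES key and
	 * ctx->gcm_Htable must point to the hash-key table storage;
	 * precomp derives HashKey<<1 mod poly and its powers from them.
	 */
	icp_isalc_gcm_precomp_128_sse(ctx);
	icp_isalc_gcm_init_128_sse(ctx, iv, aad, aad_len, 16);
	icp_isalc_gcm_enc_128_update_sse(ctx, ct, pt, pt_len);
	/*
	 * Unlike the upstream isa-l_crypto API there is no auth_tag/
	 * auth_tag_len argument pair: finalize leaves the ctx->gcm_tag_len
	 * byte tag in ctx->gcm_ghash, where the ICP expects to find it.
	 */
	icp_isalc_gcm_enc_128_finalize_sse(ctx);
}

The same shape applies to the _192/_256 and dec variants; streaming callers would simply call the _update_ routine repeatedly before finalize, as GCM_ENC_DEC keeps the running state (AadHash, CurCount, PBlockLen) in the context between calls.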