; RetroZilla/security/nss/lib/freebl/intel-gcm-x64-masm.asm

; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com
.DATA
ALIGN 16
Lone dq 1,0
Ltwo dq 2,0
Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
Lpoly dq 01h, 0c200000000000000h
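; Lone / Ltwo are the vpaddd increments for the counter blocks, and
; Lbswap_mask converts between the counter/plaintext byte order and the
; reflected convention GHASH works in. Lpoly holds the GCM reduction
; constant in the (1, 0c2...h) folded form consumed by the vpclmulqdq
; reduction steps below. Lshuff_mask appears unused in this file.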
.CODE
GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
vpclmulqdq TMP1, SRC2, SRC1, 0h
vpclmulqdq TMP4, SRC2, SRC1, 011h
vpshufd TMP2, SRC2, 78
vpshufd TMP3, SRC1, 78
vpxor TMP2, TMP2, SRC2
vpxor TMP3, TMP3, SRC1
vpclmulqdq TMP2, TMP2, TMP3, 0h
vpxor TMP2, TMP2, TMP1
vpxor TMP2, TMP2, TMP4
vpslldq TMP3, TMP2, 8
vpsrldq TMP2, TMP2, 8
vpxor TMP1, TMP1, TMP3
vpxor TMP4, TMP4, TMP2
vpclmulqdq TMP2, TMP1, [Lpoly], 010h
vpshufd TMP3, TMP1, 78
vpxor TMP1, TMP2, TMP3
vpclmulqdq TMP2, TMP1, [Lpoly], 010h
vpshufd TMP3, TMP1, 78
vpxor TMP1, TMP2, TMP3
vpxor DST, TMP1, TMP4
ENDM
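; GFMUL computes DST = SRC1 * SRC2 in GF(2^128) under GCM's reflected
; representation: one Karatsuba step (three 64x64 carry-less multiplies)
; followed by a two-fold reduction against Lpoly. A hedged C sketch of the
; same dataflow, where clmul64 is a hypothetical 64x64 -> 128 carry-less
; multiply helper and u128 a hypothetical 128-bit type (neither is part
; of NSS):
;
;   u128 lo  = clmul64(a.lo, b.lo);               /* vpclmulqdq 000h   */
;   u128 hi  = clmul64(a.hi, b.hi);               /* vpclmulqdq 011h   */
;   u128 mid = clmul64(a.lo ^ a.hi, b.lo ^ b.hi); /* Karatsuba middle  */
;   mid ^= lo ^ hi;                               /* Karatsuba fixup   */
;   lo  ^= mid << 64;                             /* vpslldq by 8      */
;   hi  ^= mid >> 64;                             /* vpsrldq by 8      */
;   /* two folds of lo against Lpoly, then DST = folded_lo ^ hi */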
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
; unsigned char *Tp,
; unsigned int Mlen,
; unsigned int Alen,
; unsigned char *X0,
; unsigned char *TAG);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
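; Dataflow sketch (a hedged reading of the code below, with GF128_mul
; standing in for the GFMUL macro above):
;
;   lenblk = (u128)(Mlen << 3) | ((u128)(Alen << 3) << 64); /* bit lengths */
;   T      = GF128_mul(*Tp ^ lenblk, Htbl[0]);              /* final GHASH */
;   *TAG   = byteswap(T) ^ *X0;     /* X0: encrypted initial counter block */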
ALIGN 16
intel_aes_gcmTAG PROC
Htbl textequ <rcx>
Tp textequ <rdx>
Mlen textequ <r8>
Alen textequ <r9>
X0 textequ <r10>
TAG textequ <r11>
T textequ <xmm0>
TMP0 textequ <xmm1>
mov X0, [rsp + 1*8 + 4*8]
mov TAG, [rsp + 1*8 + 5*8]
vzeroupper
vmovdqu T, XMMWORD PTR[Tp]
vpxor TMP0, TMP0, TMP0
shl Mlen, 3
shl Alen, 3
; The natural encoding would be:
;   vpinsrq TMP0, TMP0, Mlen, 0
;   vpinsrq TMP0, TMP0, Alen, 1
; but ml64.exe mishandles vpinsrq, so each 64-bit insert is emulated with
; two vpinsrd: the low dword first, then the high dword after shr by 32.
vpinsrd TMP0, TMP0, r8d, 0
vpinsrd TMP0, TMP0, r9d, 2
shr Mlen, 32
shr Alen, 32
vpinsrd TMP0, TMP0, r8d, 1
vpinsrd TMP0, TMP0, r9d, 3
vpxor T, T, TMP0
vmovdqu TMP0, XMMWORD PTR[Htbl]
GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
vpshufb T, T, [Lbswap_mask]
vpxor T, T, [X0]
vmovdqu XMMWORD PTR[TAG], T
vzeroupper
ret
intel_aes_gcmTAG ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
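; Htbl layout as built below (inferred from the stores in this routine):
;   Htbl[i]    , i = 0..7 : H^(i+1), where H = byteswap(AES-ENC(key, 0))
;                           pre-doubled by the "GFMUL(H, 2)" step
;   Htbl[8 + i], i = 0..7 : low64(Htbl[i]) ^ high64(Htbl[i]), the
;                           precomputed Karatsuba middle operands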
ALIGN 16
intel_aes_gcmINIT PROC
Htbl textequ <rcx>
KS textequ <rdx>
NR textequ <r8d>
T textequ <xmm0>
TMP0 textequ <xmm1>
vzeroupper
; AES-ENC(0)
vmovdqu T, XMMWORD PTR[KS]
lea KS, [16 + KS]
dec NR
Lenc_loop:
vaesenc T, T, [KS]
lea KS, [16 + KS]
dec NR
jnz Lenc_loop
vaesenclast T, T, [KS]
vpshufb T, T, [Lbswap_mask]
; Calculate H' = GFMUL(H, 2)
vpsrad xmm3, T, 31
vpshufd xmm3, xmm3, 0ffh
vpand xmm5, xmm3, [Lpoly]
vpsrld xmm3, T, 31
vpslld xmm4, T, 1
vpslldq xmm3, xmm3, 4
vpxor T, xmm4, xmm3
vpxor T, T, xmm5
vmovdqu TMP0, T
vmovdqu XMMWORD PTR[Htbl + 0*16], T
vpshufd xmm2, T, 78
vpxor xmm2, xmm2, T
vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
i = 1
WHILE i LT 8
GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
vmovdqu XMMWORD PTR[Htbl + i*16], T
vpshufd xmm2, T, 78
vpxor xmm2, xmm2, T
vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
i = i+1
ENDM
vzeroupper
ret
intel_aes_gcmINIT ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
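; Strategy (a reading of the code below): the AAD is consumed in aggregates
; of eight blocks with a single deferred reduction per aggregate; when the
; block count is not a multiple of eight, the first (count mod 8) blocks
; are hashed as a prefix. Per aggregate, with T the running hash:
;
;   acc = (blk[0] ^ T)*H^8 ^ blk[1]*H^7 ^ ... ^ blk[7]*H^1
;   T   = reduce(acc)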
ALIGN 16
intel_aes_gcmAAD PROC
Htbl textequ <rcx>
inp textequ <rdx>
len textequ <r8>
Tp textequ <r9>
hlp0 textequ <r10>
DATA textequ <xmm0>
T textequ <xmm1>
TMP0 textequ <xmm2>
TMP1 textequ <xmm3>
TMP2 textequ <xmm4>
TMP3 textequ <xmm5>
TMP4 textequ <xmm6>
Xhi textequ <xmm7>
KARATSUBA_AAD MACRO i
vpclmulqdq TMP3, DATA, [Htbl + i*16], 0h
vpxor TMP0, TMP0, TMP3
vpclmulqdq TMP3, DATA, [Htbl + i*16], 011h
vpxor TMP1, TMP1, TMP3
vpshufd TMP3, DATA, 78
vpxor TMP3, TMP3, DATA
vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
vpxor TMP2, TMP2, TMP3
ENDM
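; KARATSUBA_AAD folds one byte-swapped block into three running sums:
;   TMP0 ^= lo64(DATA) * lo64(H^(i+1))
;   TMP1 ^= hi64(DATA) * hi64(H^(i+1))
;   TMP2 ^= (lo64(DATA) ^ hi64(DATA)) * (lo64 ^ hi64)(H^(i+1))
; Lred1 and the "Karatsuba fixup" below combine the three sums and reduce.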
test len, len
jnz LbeginAAD
ret
LbeginAAD:
vzeroupper
sub rsp, 2*16
vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
vpxor Xhi, Xhi, Xhi
vmovdqu T, XMMWORD PTR[Tp]
; We hash 8 blocks per iteration; if the total number of blocks is not a
; multiple of 8, the first (count mod 8) blocks are hashed first.
mov hlp0, len
and hlp0, 128-1
jz Lmod_loop
and len, -128
sub hlp0, 16
; Prefix block
vmovdqu DATA, XMMWORD PTR[inp]
vpshufb DATA, DATA, [Lbswap_mask]
vpxor DATA, DATA, T
vpclmulqdq TMP0, DATA, [Htbl + hlp0], 0h
vpclmulqdq TMP1, DATA, [Htbl + hlp0], 011h
vpshufd TMP3, DATA, 78
vpxor TMP3, TMP3, DATA
vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h
lea inp, [inp+16]
test hlp0, hlp0
jnz Lpre_loop
jmp Lred1
; hash the remaining prefix blocks (up to 7 prefix blocks in total)
Lpre_loop:
sub hlp0, 16
vmovdqu DATA, XMMWORD PTR[inp]
vpshufb DATA, DATA, [Lbswap_mask]
vpclmulqdq TMP3, DATA, [Htbl + hlp0], 0h
vpxor TMP0, TMP0, TMP3
vpclmulqdq TMP3, DATA, [Htbl + hlp0], 011h
vpxor TMP1, TMP1, TMP3
vpshufd TMP3, DATA, 78
vpxor TMP3, TMP3, DATA
vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
vpxor TMP2, TMP2, TMP3
test hlp0, hlp0
lea inp, [inp+16]
jnz Lpre_loop
Lred1:
vpxor TMP2, TMP2, TMP0
vpxor TMP2, TMP2, TMP1
vpsrldq TMP3, TMP2, 8
vpslldq TMP2, TMP2, 8
vpxor Xhi, TMP1, TMP3
vpxor T, TMP0, TMP2
Lmod_loop:
sub len, 16*8
jb Ldone
; Block #0
vmovdqu DATA, XMMWORD PTR[inp + 16*7]
vpshufb DATA, DATA, [Lbswap_mask]
vpclmulqdq TMP0, DATA, [Htbl + 0*16], 0h
vpclmulqdq TMP1, DATA, [Htbl + 0*16], 011h
vpshufd TMP3, DATA, 78
vpxor TMP3, TMP3, DATA
vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h
; Block #1
vmovdqu DATA, XMMWORD PTR[inp + 16*6]
vpshufb DATA, DATA, [Lbswap_mask]
KARATSUBA_AAD 1
; Block #2
vmovdqu DATA, XMMWORD PTR[inp + 16*5]
vpshufb DATA, DATA, [Lbswap_mask]
vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 1a
vpalignr T, T, T, 8
KARATSUBA_AAD 2
vpxor T, T, TMP4 ;reduction stage 1b
; Block #3
vmovdqu DATA, XMMWORD PTR[inp + 16*4]
vpshufb DATA, DATA, [Lbswap_mask]
KARATSUBA_AAD 3
; Block #4
vmovdqu DATA, XMMWORD PTR[inp + 16*3]
vpshufb DATA, DATA, [Lbswap_mask]
vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 2a
vpalignr T, T, T, 8
KARATSUBA_AAD 4
vpxor T, T, TMP4 ;reduction stage 2b
; Block #5
vmovdqu DATA, XMMWORD PTR[inp + 16*2]
vpshufb DATA, DATA, [Lbswap_mask]
KARATSUBA_AAD 5
vpxor T, T, Xhi ;reduction finalize
; Block #6
vmovdqu DATA, XMMWORD PTR[inp + 16*1]
vpshufb DATA, DATA, [Lbswap_mask]
KARATSUBA_AAD 6
; Block #7
vmovdqu DATA, XMMWORD PTR[inp + 16*0]
vpshufb DATA, DATA, [Lbswap_mask]
vpxor DATA, DATA, T
KARATSUBA_AAD 7
; 8 blocks aggregated, now the Karatsuba fixup
vpxor TMP2, TMP2, TMP0
vpxor TMP2, TMP2, TMP1
vpsrldq TMP3, TMP2, 8
vpslldq TMP2, TMP2, 8
vpxor Xhi, TMP1, TMP3
vpxor T, TMP0, TMP2
lea inp, [inp + 16*8]
jmp Lmod_loop
Ldone:
vpclmulqdq TMP4, T, [Lpoly], 010h
vpalignr T, T, T, 8
vpxor T, T, TMP4
vpclmulqdq TMP4, T, [Lpoly], 010h
vpalignr T, T, T, 8
vpxor T, T, TMP4
vpxor T, T, Xhi
vmovdqu XMMWORD PTR[Tp], T
vzeroupper
vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
add rsp, 16*2
ret
intel_aes_gcmAAD ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
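; Gctx offsets as read by this routine (a sketch inferred from the loads
; below; the authoritative layout is the context struct in intel-gcm.h):
;
;   typedef struct {
;       unsigned char Htbl[16*16]; /* powers of H + Karatsuba halves    */
;       unsigned char X0[16];      /* +16*16+0*16, not touched here     */
;       unsigned char T[16];       /* +16*16+1*16, GHASH accumulator    */
;       unsigned char CTR[16];     /* +16*16+2*16, counter block        */
;       void *aes_context;         /* +16*16+3*16, round count at +4,   */
;   } GCMContext_sketch;           /*   round keys starting at +48      */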
ALIGN 16
intel_aes_gcmENC PROC
PT textequ <rcx>
CT textequ <rdx>
Htbl textequ <r8>
Gctx textequ <r8>
len textequ <r9>
KS textequ <r10>
NR textequ <eax>
aluCTR textequ <r11d>
aluKSl textequ <r12d>
aluTMP textequ <r13d>
T textequ <xmm0>
TMP0 textequ <xmm1>
TMP1 textequ <xmm2>
TMP2 textequ <xmm3>
TMP3 textequ <xmm4>
TMP4 textequ <xmm5>
TMP5 textequ <xmm6>
CTR0 textequ <xmm7>
CTR1 textequ <xmm8>
CTR2 textequ <xmm9>
CTR3 textequ <xmm10>
CTR4 textequ <xmm11>
CTR5 textequ <xmm12>
CTR6 textequ <xmm13>
CTR7 textequ <xmm14>
BSWAPMASK textequ <xmm15>
ROUND MACRO i
vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
vaesenc CTR0, CTR0, TMP3
vaesenc CTR1, CTR1, TMP3
vaesenc CTR2, CTR2, TMP3
vaesenc CTR3, CTR3, TMP3
vaesenc CTR4, CTR4, TMP3
vaesenc CTR5, CTR5, TMP3
vaesenc CTR6, CTR6, TMP3
vaesenc CTR7, CTR7, TMP3
ENDM
ROUNDMUL MACRO i
vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
vaesenc CTR0, CTR0, TMP3
vaesenc CTR1, CTR1, TMP3
vaesenc CTR2, CTR2, TMP3
vaesenc CTR3, CTR3, TMP3
vpshufd TMP4, TMP5, 78
vpxor TMP4, TMP4, TMP5
vaesenc CTR4, CTR4, TMP3
vaesenc CTR5, CTR5, TMP3
vaesenc CTR6, CTR6, TMP3
vaesenc CTR7, CTR7, TMP3
vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
vpxor TMP0, TMP0, TMP3
vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
vpclmulqdq TMP3, TMP5, TMP4, 011h
vpxor TMP1, TMP1, TMP3
vpclmulqdq TMP3, TMP5, TMP4, 000h
vpxor TMP2, TMP2, TMP3
ENDM
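; ROUNDMUL i: one AES round over all eight counter blocks interleaved with
; the Karatsuba multiply of TMP5 by H^(i+1), hiding vpclmulqdq latency
; behind vaesenc. Accumulator roles here: TMP0 = middle products,
; TMP1 = high halves, TMP2 = low halves (combined after ROUND 8).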
KARATSUBA MACRO i
vpshufd TMP4, TMP5, 78
vpxor TMP4, TMP4, TMP5
vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
vpxor TMP0, TMP0, TMP3
vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
vpclmulqdq TMP3, TMP5, TMP4, 011h
vpxor TMP1, TMP1, TMP3
vpclmulqdq TMP3, TMP5, TMP4, 000h
vpxor TMP2, TMP2, TMP3
ENDM
NEXTCTR MACRO i
add aluCTR, 1
mov aluTMP, aluCTR
xor aluTMP, aluKSl
bswap aluTMP
mov [3*4 + 8*16 + i*16 + rsp], aluTMP
ENDM
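; Counter maintenance trick: the stacked counter blocks are stored already
; xored with round key 0 (see the TMP0 setup below), so encryption can
; start at ROUND 1. Only the last dword differs between consecutive
; blocks, so NEXTCTR recomputes just that dword on the integer ALU: bump
; the host-endian counter aluCTR, xor in the byte-swapped low dword of
; round key 0 (aluKSl), byte-swap, and store the result into slot i.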
test len, len
jnz LbeginENC
ret
LbeginENC:
vzeroupper
push r11
push r12
push r13
push rbp
sub rsp, 10*16
vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
mov rbp, rsp
sub rsp, 16*16
and rsp, -16
vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
mov KS, [16*16 + 3*16 + Gctx]
mov NR, [4 + KS]
lea KS, [48 + KS]
vpshufb CTR0, CTR0, BSWAPMASK
mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
mov aluKSl, [3*4 + KS]
bswap aluCTR
bswap aluKSl
vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0
cmp len, 128
jb LEncDataSingles
; Prepare the "top" counters
vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0
; Encrypt the initial 8 blocks
sub len, 128
vpaddd CTR1, CTR0, XMMWORD PTR[Lone]
vpaddd CTR2, CTR0, XMMWORD PTR[Ltwo]
vpaddd CTR3, CTR2, XMMWORD PTR[Lone]
vpaddd CTR4, CTR2, XMMWORD PTR[Ltwo]
vpaddd CTR5, CTR4, XMMWORD PTR[Lone]
vpaddd CTR6, CTR4, XMMWORD PTR[Ltwo]
vpaddd CTR7, CTR6, XMMWORD PTR[Lone]
vpshufb CTR0, CTR0, BSWAPMASK
vpshufb CTR1, CTR1, BSWAPMASK
vpshufb CTR2, CTR2, BSWAPMASK
vpshufb CTR3, CTR3, BSWAPMASK
vpshufb CTR4, CTR4, BSWAPMASK
vpshufb CTR5, CTR5, BSWAPMASK
vpshufb CTR6, CTR6, BSWAPMASK
vpshufb CTR7, CTR7, BSWAPMASK
vmovdqu TMP3, XMMWORD PTR[0*16 + KS]
vpxor CTR0, CTR0, TMP3
vpxor CTR1, CTR1, TMP3
vpxor CTR2, CTR2, TMP3
vpxor CTR3, CTR3, TMP3
vpxor CTR4, CTR4, TMP3
vpxor CTR5, CTR5, TMP3
vpxor CTR6, CTR6, TMP3
vpxor CTR7, CTR7, TMP3
ROUND 1
add aluCTR, 8
mov aluTMP, aluCTR
xor aluTMP, aluKSl
bswap aluTMP
mov [8*16 + 0*16 + 3*4 + rsp], aluTMP
ROUND 2
NEXTCTR 1
ROUND 3
NEXTCTR 2
ROUND 4
NEXTCTR 3
ROUND 5
NEXTCTR 4
ROUND 6
NEXTCTR 5
ROUND 7
NEXTCTR 6
ROUND 8
NEXTCTR 7
ROUND 9
vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f
ROUND 10
ROUND 11
vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f
ROUND 12
ROUND 13
vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT]
vaesenclast CTR0, CTR0, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT]
vaesenclast CTR1, CTR1, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT]
vaesenclast CTR2, CTR2, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT]
vaesenclast CTR3, CTR3, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT]
vaesenclast CTR4, CTR4, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT]
vaesenclast CTR5, CTR5, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT]
vaesenclast CTR6, CTR6, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT]
vaesenclast CTR7, CTR7, TMP3
vmovdqu XMMWORD PTR[0*16 + CT], CTR0
vpshufb CTR0, CTR0, BSWAPMASK
vmovdqu XMMWORD PTR[1*16 + CT], CTR1
vpshufb CTR1, CTR1, BSWAPMASK
vmovdqu XMMWORD PTR[2*16 + CT], CTR2
vpshufb CTR2, CTR2, BSWAPMASK
vmovdqu XMMWORD PTR[3*16 + CT], CTR3
vpshufb CTR3, CTR3, BSWAPMASK
vmovdqu XMMWORD PTR[4*16 + CT], CTR4
vpshufb CTR4, CTR4, BSWAPMASK
vmovdqu XMMWORD PTR[5*16 + CT], CTR5
vpshufb CTR5, CTR5, BSWAPMASK
vmovdqu XMMWORD PTR[6*16 + CT], CTR6
vpshufb CTR6, CTR6, BSWAPMASK
vmovdqu XMMWORD PTR[7*16 + CT], CTR7
vpshufb TMP5, CTR7, BSWAPMASK
vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
lea CT, [8*16 + CT]
lea PT, [8*16 + PT]
jmp LEncDataOctets
LEncDataOctets:
cmp len, 128
jb LEndEncOctets
sub len, 128
vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]
vpshufd TMP4, TMP5, 78
vpxor TMP4, TMP4, TMP5
vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
vpclmulqdq TMP1, TMP5, TMP4, 011h
vpclmulqdq TMP2, TMP5, TMP4, 000h
vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
ROUNDMUL 1
NEXTCTR 0
vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
ROUNDMUL 2
NEXTCTR 1
vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
ROUNDMUL 3
NEXTCTR 2
vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
ROUNDMUL 4
NEXTCTR 3
vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
ROUNDMUL 5
NEXTCTR 4
vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
ROUNDMUL 6
NEXTCTR 5
vpxor TMP5, T, XMMWORD PTR[7*16 + rsp]
ROUNDMUL 7
NEXTCTR 6
ROUND 8
NEXTCTR 7
vpxor TMP0, TMP0, TMP1
vpxor TMP0, TMP0, TMP2
vpsrldq TMP3, TMP0, 8
vpxor TMP4, TMP1, TMP3
vpslldq TMP3, TMP0, 8
vpxor T, TMP2, TMP3
vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1
ROUND 9
vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1
vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f
ROUND 10
ROUND 11
vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f
ROUND 12
ROUND 13
vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT]
vaesenclast CTR0, CTR0, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT]
vaesenclast CTR1, CTR1, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT]
vaesenclast CTR2, CTR2, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT]
vaesenclast CTR3, CTR3, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT]
vaesenclast CTR4, CTR4, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT]
vaesenclast CTR5, CTR5, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT]
vaesenclast CTR6, CTR6, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT]
vaesenclast CTR7, CTR7, TMP3
vmovdqu XMMWORD PTR[0*16 + CT], CTR0
vpshufb CTR0, CTR0, BSWAPMASK
vmovdqu XMMWORD PTR[1*16 + CT], CTR1
vpshufb CTR1, CTR1, BSWAPMASK
vmovdqu XMMWORD PTR[2*16 + CT], CTR2
vpshufb CTR2, CTR2, BSWAPMASK
vmovdqu XMMWORD PTR[3*16 + CT], CTR3
vpshufb CTR3, CTR3, BSWAPMASK
vmovdqu XMMWORD PTR[4*16 + CT], CTR4
vpshufb CTR4, CTR4, BSWAPMASK
vmovdqu XMMWORD PTR[5*16 + CT], CTR5
vpshufb CTR5, CTR5, BSWAPMASK
vmovdqu XMMWORD PTR[6*16 + CT], CTR6
vpshufb CTR6, CTR6, BSWAPMASK
vmovdqu XMMWORD PTR[7*16 + CT], CTR7
vpshufb TMP5, CTR7, BSWAPMASK
vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
vpxor T, T, TMP4
lea CT, [8*16 + CT]
lea PT, [8*16 + PT]
jmp LEncDataOctets
LEndEncOctets:
vpshufd TMP4, TMP5, 78
vpxor TMP4, TMP4, TMP5
vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
vpclmulqdq TMP1, TMP5, TMP4, 011h
vpclmulqdq TMP2, TMP5, TMP4, 000h
vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
KARATSUBA 1
vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
KARATSUBA 2
vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
KARATSUBA 3
vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
KARATSUBA 4
vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
KARATSUBA 5
vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
KARATSUBA 6
vpxor TMP5, T, XMMWORD PTR[7*16 + rsp]
KARATSUBA 7
vpxor TMP0, TMP0, TMP1
vpxor TMP0, TMP0, TMP2
vpsrldq TMP3, TMP0, 8
vpxor TMP4, TMP1, TMP3
vpslldq TMP3, TMP0, 8
vpxor T, TMP2, TMP3
vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1
vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1
vpxor T, T, TMP4
sub aluCTR, 7
LEncDataSingles:
cmp len, 16
jb LEncDataTail
sub len, 16
vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
NEXTCTR 0
vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
vaesenclast TMP1, TMP1, TMP2
vpxor TMP1, TMP1, XMMWORD PTR[PT]
vmovdqu XMMWORD PTR[CT], TMP1
lea PT, [16+PT]
lea CT, [16+CT]
vpshufb TMP1, TMP1, BSWAPMASK
vpxor T, T, TMP1
vmovdqu TMP0, XMMWORD PTR[Htbl]
GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
jmp LEncDataSingles
LEncDataTail:
test len, len
jz LEncDataEnd
vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
vaesenclast TMP1, TMP1, TMP2
; zero a temp location
vpxor TMP2, TMP2, TMP2
vmovdqa XMMWORD PTR[rsp], TMP2
; copy as many bytes as needed
xor KS, KS
@@:
cmp len, KS
je @f
mov al, [PT + KS]
mov [rsp + KS], al
inc KS
jmp @b
@@:
vpxor TMP1, TMP1, XMMWORD PTR[rsp]
vmovdqa XMMWORD PTR[rsp], TMP1
xor KS, KS
@@:
cmp len, KS
je @f
mov al, [rsp + KS]
mov [CT + KS], al
inc KS
jmp @b
@@:
cmp KS, 16
je @f
mov BYTE PTR[rsp + KS], 0
inc KS
jmp @b
@@:
BAIL:
vmovdqa TMP1, XMMWORD PTR[rsp]
vpshufb TMP1, TMP1, BSWAPMASK
vpxor T, T, TMP1
vmovdqu TMP0, XMMWORD PTR[Htbl]
GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
LEncDataEnd:
vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
bswap aluCTR
mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR
mov rsp, rbp
vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
add rsp, 10*16
pop rbp
pop r13
pop r12
pop r11
vzeroupper
ret
intel_aes_gcmENC ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Decrypt and Authenticate
; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
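; Note: this PROC reuses the textequ register names defined for
; intel_aes_gcmENC and only swaps PT/CT, so rcx (the first argument) is
; the ciphertext input and rdx (the second) the plaintext output.
; NEXTCTR is also redefined because the decrypt frame keeps its counter
; blocks at rsp + 0*16 instead of rsp + 8*16.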
ALIGN 16
intel_aes_gcmDEC PROC
NEXTCTR MACRO i
add aluCTR, 1
mov aluTMP, aluCTR
xor aluTMP, aluKSl
bswap aluTMP
mov [3*4 + i*16 + rsp], aluTMP
ENDM
PT textequ <rdx>
CT textequ <rcx>
test len, len
jnz LbeginDEC
ret
LbeginDEC:
vzeroupper
push r11
push r12
push r13
push rbp
sub rsp, 10*16
vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
mov rbp, rsp
sub rsp, 8*16
and rsp, -16
vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
mov KS, [16*16 + 3*16 + Gctx]
mov NR, [4 + KS]
lea KS, [48 + KS]
vpshufb CTR0, CTR0, BSWAPMASK
mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
mov aluKSl, [3*4 + KS]
bswap aluCTR
bswap aluKSl
vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
vmovdqu XMMWORD PTR[0*16 + rsp], TMP0
cmp len, 128
jb LDecDataSingles
; Prepare the "top" counters
vmovdqu XMMWORD PTR[1*16 + rsp], TMP0
vmovdqu XMMWORD PTR[2*16 + rsp], TMP0
vmovdqu XMMWORD PTR[3*16 + rsp], TMP0
vmovdqu XMMWORD PTR[4*16 + rsp], TMP0
vmovdqu XMMWORD PTR[5*16 + rsp], TMP0
vmovdqu XMMWORD PTR[6*16 + rsp], TMP0
vmovdqu XMMWORD PTR[7*16 + rsp], TMP0
NEXTCTR 1
NEXTCTR 2
NEXTCTR 3
NEXTCTR 4
NEXTCTR 5
NEXTCTR 6
NEXTCTR 7
LDecDataOctets:
cmp len, 128
jb LEndDecOctets
sub len, 128
vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]
vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
vpshufd TMP4, TMP5, 78
vpxor TMP4, TMP4, TMP5
vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
vpclmulqdq TMP1, TMP5, TMP4, 011h
vpclmulqdq TMP2, TMP5, TMP4, 000h
vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 1
NEXTCTR 0
vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 2
NEXTCTR 1
vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 3
NEXTCTR 2
vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 4
NEXTCTR 3
vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 5
NEXTCTR 4
vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
ROUNDMUL 6
NEXTCTR 5
vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
vpshufb TMP5, TMP5, BSWAPMASK
vpxor TMP5, TMP5, T
ROUNDMUL 7
NEXTCTR 6
ROUND 8
NEXTCTR 7
vpxor TMP0, TMP0, TMP1
vpxor TMP0, TMP0, TMP2
vpsrldq TMP3, TMP0, 8
vpxor TMP4, TMP1, TMP3
vpslldq TMP3, TMP0, 8
vpxor T, TMP2, TMP3
vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1
ROUND 9
vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
vpalignr T,T,T,8
vpxor T, T, TMP1
vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f
ROUND 10
ROUND 11
vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f
ROUND 12
ROUND 13
vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
vpxor TMP3, TMP5, XMMWORD PTR[0*16 + CT]
vaesenclast CTR0, CTR0, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[1*16 + CT]
vaesenclast CTR1, CTR1, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[2*16 + CT]
vaesenclast CTR2, CTR2, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[3*16 + CT]
vaesenclast CTR3, CTR3, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[4*16 + CT]
vaesenclast CTR4, CTR4, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[5*16 + CT]
vaesenclast CTR5, CTR5, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[6*16 + CT]
vaesenclast CTR6, CTR6, TMP3
vpxor TMP3, TMP5, XMMWORD PTR[7*16 + CT]
vaesenclast CTR7, CTR7, TMP3
vmovdqu XMMWORD PTR[0*16 + PT], CTR0
vmovdqu XMMWORD PTR[1*16 + PT], CTR1
vmovdqu XMMWORD PTR[2*16 + PT], CTR2
vmovdqu XMMWORD PTR[3*16 + PT], CTR3
vmovdqu XMMWORD PTR[4*16 + PT], CTR4
vmovdqu XMMWORD PTR[5*16 + PT], CTR5
vmovdqu XMMWORD PTR[6*16 + PT], CTR6
vmovdqu XMMWORD PTR[7*16 + PT], CTR7
vpxor T, T, TMP4
lea CT, [8*16 + CT]
lea PT, [8*16 + PT]
jmp LDecDataOctets
LEndDecOctets:
sub aluCTR, 7
LDecDataSingles:
cmp len, 16
jb LDecDataTail
sub len, 16
vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
NEXTCTR 0
vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
vaesenclast TMP1, TMP1, TMP2
vmovdqu TMP2, XMMWORD PTR[CT]
vpxor TMP1, TMP1, TMP2
vmovdqu XMMWORD PTR[PT], TMP1
lea PT, [16+PT]
lea CT, [16+CT]
vpshufb TMP2, TMP2, BSWAPMASK
vpxor T, T, TMP2
vmovdqu TMP0, XMMWORD PTR[Htbl]
GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
jmp LDecDataSingles
LDecDataTail:
test len, len
jz LDecDataEnd
vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
inc aluCTR
vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
cmp NR, 10
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
cmp NR, 12
je @f
vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
vaesenclast TMP1, TMP1, TMP2
; copy as many bytes as needed
xor KS, KS
@@:
cmp len, KS
je @f
mov al, [CT + KS]
mov [rsp + KS], al
inc KS
jmp @b
@@:
cmp KS, 16
je @f
mov BYTE PTR[rsp + KS], 0
inc KS
jmp @b
@@:
vmovdqa TMP2, XMMWORD PTR[rsp]
vpshufb TMP2, TMP2, BSWAPMASK
vpxor T, T, TMP2
vmovdqu TMP0, XMMWORD PTR[Htbl]
GFMUL T, T, TMP0, TMP5, TMP2, TMP3, TMP4
vpxor TMP1, TMP1, XMMWORD PTR[rsp]
vmovdqa XMMWORD PTR[rsp], TMP1
xor KS, KS
@@:
cmp len, KS
je @f
mov al, [rsp + KS]
mov [PT + KS], al
inc KS
jmp @b
@@:
LDecDataEnd:
vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
bswap aluCTR
mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR
mov rsp, rbp
vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
add rsp, 10*16
pop rbp
pop r13
pop r12
pop r11
vzeroupper
ret
intel_aes_gcmDEC ENDP
END