; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com

.DATA
ALIGN 16
Lone            dq 1,0
Ltwo            dq 2,0
Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
Lpoly           dq 01h, 0c200000000000000h
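
; Lone/Ltwo are little-endian qword-pair increments for the counter
; blocks, Lbswap_mask reverses a 16-byte lane into the byte-reflected
; GHASH domain, and Lpoly encodes the GCM reduction polynomial
; x^128 + x^7 + x^2 + x + 1 in that reflected representation.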

.CODE

GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
        vpclmulqdq      TMP1, SRC2, SRC1, 0h
        vpclmulqdq      TMP4, SRC2, SRC1, 011h

        vpshufd         TMP2, SRC2, 78
        vpshufd         TMP3, SRC1, 78
        vpxor           TMP2, TMP2, SRC2
        vpxor           TMP3, TMP3, SRC1

        vpclmulqdq      TMP2, TMP2, TMP3, 0h
        vpxor           TMP2, TMP2, TMP1
        vpxor           TMP2, TMP2, TMP4

        vpslldq         TMP3, TMP2, 8
        vpsrldq         TMP2, TMP2, 8

        vpxor           TMP1, TMP1, TMP3
        vpxor           TMP4, TMP4, TMP2

        vpclmulqdq      TMP2, TMP1, [Lpoly], 010h
        vpshufd         TMP3, TMP1, 78
        vpxor           TMP1, TMP2, TMP3

        vpclmulqdq      TMP2, TMP1, [Lpoly], 010h
        vpshufd         TMP3, TMP1, 78
        vpxor           TMP1, TMP2, TMP3

        vpxor           DST, TMP1, TMP4
ENDM
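
; GFMUL computes DST = SRC1 * SRC2 in GF(2^128) (byte-reflected GHASH
; convention) using Karatsuba: two vpclmulqdq for the low and high
; 64x64 products plus one for (a0+a1)*(b0+b1), which is folded back to
; recover the middle term. The resulting 256-bit product is reduced by
; two vpclmulqdq passes against Lpoly, each folding the lower half
; upward, before the final XOR with the high product.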

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
;                       unsigned char *Tp,
;                       unsigned int Mlen,
;                       unsigned int Alen,
;                       unsigned char *X0,
;                       unsigned char *TAG);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmTAG PROC

Htbl    textequ <rcx>
Tp      textequ <rdx>
Mlen    textequ <r8>
Alen    textequ <r9>
X0      textequ <r10>
TAG     textequ <r11>

T       textequ <xmm0>
TMP0    textequ <xmm1>

        mov     X0, [rsp + 1*8 + 4*8]
        mov     TAG, [rsp + 1*8 + 5*8]

        vzeroupper
        vmovdqu T, XMMWORD PTR[Tp]
        vpxor   TMP0, TMP0, TMP0

        shl     Mlen, 3
        shl     Alen, 3

        ;vpinsrq  TMP0, TMP0, Mlen, 0
        ;vpinsrq  TMP0, TMP0, Alen, 1
        ; workaround the ml64.exe vpinsrq issue
        vpinsrd TMP0, TMP0, r8d, 0
        vpinsrd TMP0, TMP0, r9d, 2
        shr     Mlen, 32
        shr     Alen, 32
        vpinsrd TMP0, TMP0, r8d, 1
        vpinsrd TMP0, TMP0, r9d, 3
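
        ; TMP0 now holds the GHASH lengths block: the message length in
        ; bits in the low qword and the AAD length in bits in the high
        ; qword, assembled from 32-bit halves because ml64.exe
        ; mis-assembles vpinsrq.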

        vpxor   T, T, TMP0
        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5

        vpshufb T, T, [Lbswap_mask]
        vpxor   T, T, [X0]
        vmovdqu XMMWORD PTR[TAG], T
        vzeroupper

        ret

intel_aes_gcmTAG ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmINIT PROC

Htbl    textequ <rcx>
KS      textequ <rdx>
NR      textequ <r8d>

T       textequ <xmm0>
TMP0    textequ <xmm1>

        vzeroupper
        ; AES-ENC(0)
        vmovdqu T, XMMWORD PTR[KS]
        lea     KS, [16 + KS]
        dec     NR
Lenc_loop:
        vaesenc T, T, [KS]
        lea     KS, [16 + KS]
        dec     NR
        jnz     Lenc_loop

        vaesenclast T, T, [KS]
        vpshufb T, T, [Lbswap_mask]

        ; Calculate H' = GFMUL(H, 2)
        vpsrad  xmm3, T, 31
        vpshufd xmm3, xmm3, 0ffh
        vpand   xmm5, xmm3, [Lpoly]
        vpsrld  xmm3, T, 31
        vpslld  xmm4, T, 1
        vpslldq xmm3, xmm3, 4
        vpxor   T, xmm4, xmm3
        vpxor   T, T, xmm5
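
        ; The sequence above doubles H in the byte-reflected field:
        ; vpslld/vpsrld/vpslldq implement a 128-bit left shift by one
        ; (carrying bits across dword lanes), while vpsrad+vpshufd
        ; broadcast the top bit into a full-width mask so Lpoly is
        ; XORed in only when the shift overflows, i.e. a conditional
        ; reduction.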

        vmovdqu TMP0, T
        vmovdqu XMMWORD PTR[Htbl + 0*16], T

        vpshufd xmm2, T, 78
        vpxor   xmm2, xmm2, T
        vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

i = 1
WHILE i LT 8
        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
        vmovdqu XMMWORD PTR[Htbl + i*16], T
        vpshufd xmm2, T, 78
        vpxor   xmm2, xmm2, T
        vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
i = i+1
ENDM
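
; After this loop, Htbl slots 0..7 hold H^1..H^8 and slots 8..15 hold
; the XOR of the two 64-bit halves of the matching power, precomputed so
; the bulk loops can skip the vpshufd/vpxor on the table operand of the
; Karatsuba middle multiplication.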
        vzeroupper
        ret
intel_aes_gcmINIT ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmAAD PROC

Htbl    textequ <rcx>
inp     textequ <rdx>
len     textequ <r8>
Tp      textequ <r9>
hlp0    textequ <r10>

DATA    textequ <xmm0>
T       textequ <xmm1>
TMP0    textequ <xmm2>
TMP1    textequ <xmm3>
TMP2    textequ <xmm4>
TMP3    textequ <xmm5>
TMP4    textequ <xmm6>
Xhi     textequ <xmm7>

KARATSUBA_AAD MACRO i
        vpclmulqdq      TMP3, DATA, [Htbl + i*16], 0h
        vpxor           TMP0, TMP0, TMP3
        vpclmulqdq      TMP3, DATA, [Htbl + i*16], 011h
        vpxor           TMP1, TMP1, TMP3
        vpshufd         TMP3, DATA, 78
        vpxor           TMP3, TMP3, DATA
        vpclmulqdq      TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
        vpxor           TMP2, TMP2, TMP3
ENDM
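
; KARATSUBA_AAD folds one block into three running sums: TMP0/TMP1
; accumulate the low and high 64x64 products against H^(i+1) and TMP2
; the Karatsuba middle products, so eight blocks are hashed with a
; single deferred reduction instead of one per block.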

        test    len, len
        jnz     LbeginAAD
        ret

LbeginAAD:
        vzeroupper

        sub     rsp, 2*16
        vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
        vmovdqu XMMWORD PTR[rsp + 1*16], xmm7

        vpxor   Xhi, Xhi, Xhi

        vmovdqu T, XMMWORD PTR[Tp]
        ; We hash 8 blocks each iteration; if the total number of blocks is
        ; not a multiple of 8, we hash the first len mod 8 blocks first.
        mov     hlp0, len
        and     hlp0, 128-1
        jz      Lmod_loop

        and     len, -128
        sub     hlp0, 16

        ; Prefix block
        vmovdqu DATA, XMMWORD PTR[inp]
        vpshufb DATA, DATA, [Lbswap_mask]
        vpxor   DATA, DATA, T

        vpclmulqdq      TMP0, DATA, [Htbl + hlp0], 0h
        vpclmulqdq      TMP1, DATA, [Htbl + hlp0], 011h
        vpshufd         TMP3, DATA, 78
        vpxor           TMP3, TMP3, DATA
        vpclmulqdq      TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h

        lea     inp, [inp+16]
        test    hlp0, hlp0
        jnz     Lpre_loop
        jmp     Lred1

; Hash remaining prefix blocks (up to 7 total prefix blocks).
Lpre_loop:
        sub     hlp0, 16

        vmovdqu DATA, XMMWORD PTR[inp]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq      TMP3, DATA, [Htbl + hlp0], 0h
        vpxor           TMP0, TMP0, TMP3
        vpclmulqdq      TMP3, DATA, [Htbl + hlp0], 011h
        vpxor           TMP1, TMP1, TMP3
        vpshufd         TMP3, DATA, 78
        vpxor           TMP3, TMP3, DATA
        vpclmulqdq      TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
        vpxor           TMP2, TMP2, TMP3

        test    hlp0, hlp0
        lea     inp, [inp+16]
        jnz     Lpre_loop

Lred1:
        vpxor   TMP2, TMP2, TMP0
        vpxor   TMP2, TMP2, TMP1
        vpsrldq TMP3, TMP2, 8
        vpslldq TMP2, TMP2, 8

        vpxor   Xhi, TMP1, TMP3
        vpxor   T, TMP0, TMP2
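
        ; Karatsuba fixup: fold the middle sum into the low/high sums to
        ; form the 256-bit product, leaving its low half in T and its
        ; high half in Xhi. Lmod_loop below keeps this pair unreduced
        ; and interleaves the two reduction steps of the previous
        ; iteration with the hashing of the next eight blocks.
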
Lmod_loop:
        sub     len, 16*8
        jb      Ldone
        ; Block #0
        vmovdqu DATA, XMMWORD PTR[inp + 16*7]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq      TMP0, DATA, [Htbl + 0*16], 0h
        vpclmulqdq      TMP1, DATA, [Htbl + 0*16], 011h
        vpshufd         TMP3, DATA, 78
        vpxor           TMP3, TMP3, DATA
        vpclmulqdq      TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h

        ; Block #1
        vmovdqu DATA, XMMWORD PTR[inp + 16*6]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 1

        ; Block #2
        vmovdqu DATA, XMMWORD PTR[inp + 16*5]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq      TMP4, T, [Lpoly], 010h  ; reduction stage 1a
        vpalignr        T, T, T, 8

        KARATSUBA_AAD 2

        vpxor   T, T, TMP4                      ; reduction stage 1b

        ; Block #3
        vmovdqu DATA, XMMWORD PTR[inp + 16*4]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 3
        ; Block #4
        vmovdqu DATA, XMMWORD PTR[inp + 16*3]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq      TMP4, T, [Lpoly], 010h  ; reduction stage 2a
        vpalignr        T, T, T, 8

        KARATSUBA_AAD 4

        vpxor   T, T, TMP4                      ; reduction stage 2b
        ; Block #5
        vmovdqu DATA, XMMWORD PTR[inp + 16*2]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 5

        vpxor   T, T, Xhi                       ; reduction finalize
        ; Block #6
        vmovdqu DATA, XMMWORD PTR[inp + 16*1]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 6
        ; Block #7
        vmovdqu DATA, XMMWORD PTR[inp + 16*0]
        vpshufb DATA, DATA, [Lbswap_mask]
        vpxor   DATA, DATA, T
        KARATSUBA_AAD 7
        ; Aggregated 8 blocks, now Karatsuba fixup
        vpxor   TMP2, TMP2, TMP0
        vpxor   TMP2, TMP2, TMP1
        vpsrldq TMP3, TMP2, 8
        vpslldq TMP2, TMP2, 8

        vpxor   Xhi, TMP1, TMP3
        vpxor   T, TMP0, TMP2

        lea     inp, [inp + 16*8]
        jmp     Lmod_loop

Ldone:
        vpclmulqdq      TMP4, T, [Lpoly], 010h
        vpalignr        T, T, T, 8
        vpxor           T, T, TMP4

        vpclmulqdq      TMP4, T, [Lpoly], 010h
        vpalignr        T, T, T, 8
        vpxor           T, T, TMP4

        vpxor   T, T, Xhi
        vmovdqu XMMWORD PTR[Tp], T
        vzeroupper

        vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
        vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
        add     rsp, 16*2

        ret

intel_aes_gcmAAD ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmENC PROC

PT      textequ <rcx>
CT      textequ <rdx>
Htbl    textequ <r8>
Gctx    textequ <r8>
len     textequ <r9>
KS      textequ <r10>
NR      textequ <eax>

aluCTR  textequ <r11d>
aluKSl  textequ <r12d>
aluTMP  textequ <r13d>

T       textequ <xmm0>
TMP0    textequ <xmm1>
TMP1    textequ <xmm2>
TMP2    textequ <xmm3>
TMP3    textequ <xmm4>
TMP4    textequ <xmm5>
TMP5    textequ <xmm6>
CTR0    textequ <xmm7>
CTR1    textequ <xmm8>
CTR2    textequ <xmm9>
CTR3    textequ <xmm10>
CTR4    textequ <xmm11>
CTR5    textequ <xmm12>
CTR6    textequ <xmm13>
CTR7    textequ <xmm14>
BSWAPMASK textequ <xmm15>

ROUND MACRO i
        vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
        vaesenc CTR0, CTR0, TMP3
        vaesenc CTR1, CTR1, TMP3
        vaesenc CTR2, CTR2, TMP3
        vaesenc CTR3, CTR3, TMP3
        vaesenc CTR4, CTR4, TMP3
        vaesenc CTR5, CTR5, TMP3
        vaesenc CTR6, CTR6, TMP3
        vaesenc CTR7, CTR7, TMP3
ENDM
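; ROUND applies AES round i (key-schedule entry i) to all eight counter
; blocks kept in registers.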

ROUNDMUL MACRO i
        vmovdqu TMP3, XMMWORD PTR[i*16 + KS]

        vaesenc CTR0, CTR0, TMP3
        vaesenc CTR1, CTR1, TMP3
        vaesenc CTR2, CTR2, TMP3
        vaesenc CTR3, CTR3, TMP3

        vpshufd TMP4, TMP5, 78
        vpxor   TMP4, TMP4, TMP5

        vaesenc CTR4, CTR4, TMP3
        vaesenc CTR5, CTR5, TMP3
        vaesenc CTR6, CTR6, TMP3
        vaesenc CTR7, CTR7, TMP3

        vpclmulqdq      TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
        vpxor           TMP0, TMP0, TMP3
        vmovdqu         TMP4, XMMWORD PTR[i*16 + Htbl]
        vpclmulqdq      TMP3, TMP5, TMP4, 011h
        vpxor           TMP1, TMP1, TMP3
        vpclmulqdq      TMP3, TMP5, TMP4, 000h
        vpxor           TMP2, TMP2, TMP3
ENDM
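
; ROUNDMUL interleaves one AES round over the eight counter blocks with
; one Karatsuba accumulation of an earlier ciphertext block (passed in
; TMP5) against the matching power of H, hiding pclmulqdq latency under
; the aesenc chain.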

KARATSUBA MACRO i
        vpshufd TMP4, TMP5, 78
        vpxor   TMP4, TMP4, TMP5
        vpclmulqdq      TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
        vpxor           TMP0, TMP0, TMP3
        vmovdqu         TMP4, XMMWORD PTR[i*16 + Htbl]
        vpclmulqdq      TMP3, TMP5, TMP4, 011h
        vpxor           TMP1, TMP1, TMP3
        vpclmulqdq      TMP3, TMP5, TMP4, 000h
        vpxor           TMP2, TMP2, TMP3
ENDM
NEXTCTR MACRO i
        add     aluCTR, 1
        mov     aluTMP, aluCTR
        xor     aluTMP, aluKSl
        bswap   aluTMP
        mov     [3*4 + 8*16 + i*16 + rsp], aluTMP
ENDM
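; NEXTCTR keeps the 32-bit counter in a general-purpose register: it
; increments the little-endian copy, XORs in the (byte-swapped) last
; dword of round key 0 and swaps back, so the counter block parked on
; the stack is already whitened with the first round key.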


        test    len, len
        jnz     LbeginENC
        ret

LbeginENC:
        vzeroupper
        push    r11
        push    r12
        push    r13
        push    rbp
        sub     rsp, 10*16
        vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
        vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
        vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
        vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
        vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
        vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
        vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
        vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
        vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
        vmovdqu XMMWORD PTR[rsp + 9*16], xmm15

        mov     rbp, rsp
        sub     rsp, 16*16
        and     rsp, -16

        vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
        vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
        vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
        mov     KS, [16*16 + 3*16 + Gctx]
        mov     NR, [4 + KS]
        lea     KS, [48 + KS]
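
        ; Layout implied by these offsets: Gctx starts with Htbl[16*16],
        ; followed by one block (X0), the running tag T and the CTR
        ; block, then a pointer to the expanded key schedule; the round
        ; count sits at offset 4 of that structure and the round keys
        ; start at offset 48.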

        vpshufb CTR0, CTR0, BSWAPMASK

        mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
        mov     aluKSl, [3*4 + KS]
        bswap   aluCTR
        bswap   aluKSl

        vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
        vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
        vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0

        cmp     len, 128
        jb      LEncDataSingles
        ; Prepare the "top" counters
        vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0

        ; Encrypt the initial 8 blocks
        sub     len, 128
        vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
        vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
        vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
        vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
        vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
        vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]
        vpaddd  CTR7, CTR6, XMMWORD PTR[Lone]

        vpshufb CTR0, CTR0, BSWAPMASK
        vpshufb CTR1, CTR1, BSWAPMASK
        vpshufb CTR2, CTR2, BSWAPMASK
        vpshufb CTR3, CTR3, BSWAPMASK
        vpshufb CTR4, CTR4, BSWAPMASK
        vpshufb CTR5, CTR5, BSWAPMASK
        vpshufb CTR6, CTR6, BSWAPMASK
        vpshufb CTR7, CTR7, BSWAPMASK

        vmovdqu TMP3, XMMWORD PTR[0*16 + KS]
        vpxor   CTR0, CTR0, TMP3
        vpxor   CTR1, CTR1, TMP3
        vpxor   CTR2, CTR2, TMP3
        vpxor   CTR3, CTR3, TMP3
        vpxor   CTR4, CTR4, TMP3
        vpxor   CTR5, CTR5, TMP3
        vpxor   CTR6, CTR6, TMP3
        vpxor   CTR7, CTR7, TMP3

        ROUND   1

        add     aluCTR, 8
        mov     aluTMP, aluCTR
        xor     aluTMP, aluKSl
        bswap   aluTMP
        mov     [8*16 + 0*16 + 3*4 + rsp], aluTMP

        ROUND   2
        NEXTCTR 1
        ROUND   3
        NEXTCTR 2
        ROUND   4
        NEXTCTR 3
        ROUND   5
        NEXTCTR 4
        ROUND   6
        NEXTCTR 5
        ROUND   7
        NEXTCTR 6
        ROUND   8
        NEXTCTR 7
        ROUND   9
        vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f

        ROUND   10
        ROUND   11
        vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f

        ROUND   12
        ROUND   13
        vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
        vpxor           TMP3, TMP5, XMMWORD PTR[0*16 + PT]
        vaesenclast     CTR0, CTR0, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[1*16 + PT]
        vaesenclast     CTR1, CTR1, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[2*16 + PT]
        vaesenclast     CTR2, CTR2, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[3*16 + PT]
        vaesenclast     CTR3, CTR3, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[4*16 + PT]
        vaesenclast     CTR4, CTR4, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[5*16 + PT]
        vaesenclast     CTR5, CTR5, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[6*16 + PT]
        vaesenclast     CTR6, CTR6, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[7*16 + PT]
        vaesenclast     CTR7, CTR7, TMP3

        vmovdqu XMMWORD PTR[0*16 + CT], CTR0
        vpshufb CTR0, CTR0, BSWAPMASK
        vmovdqu XMMWORD PTR[1*16 + CT], CTR1
        vpshufb CTR1, CTR1, BSWAPMASK
        vmovdqu XMMWORD PTR[2*16 + CT], CTR2
        vpshufb CTR2, CTR2, BSWAPMASK
        vmovdqu XMMWORD PTR[3*16 + CT], CTR3
        vpshufb CTR3, CTR3, BSWAPMASK
        vmovdqu XMMWORD PTR[4*16 + CT], CTR4
        vpshufb CTR4, CTR4, BSWAPMASK
        vmovdqu XMMWORD PTR[5*16 + CT], CTR5
        vpshufb CTR5, CTR5, BSWAPMASK
        vmovdqu XMMWORD PTR[6*16 + CT], CTR6
        vpshufb CTR6, CTR6, BSWAPMASK
        vmovdqu XMMWORD PTR[7*16 + CT], CTR7
        vpshufb TMP5, CTR7, BSWAPMASK

        vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
        vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
        vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
        vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
        vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
        vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
        vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
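
        ; The byte-swapped ciphertext blocks are parked on the stack
        ; (the newest in TMP5); their GHASH work is deferred to the next
        ; pass of LEncDataOctets so the multiplications can overlap the
        ; AES rounds of the following eight blocks.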

        lea     CT, [8*16 + CT]
        lea     PT, [8*16 + PT]
        jmp     LEncDataOctets

LEncDataOctets:
        cmp     len, 128
        jb      LEndEncOctets
        sub     len, 128

        vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
        vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
        vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
        vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
        vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
        vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
        vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
        vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]

        vpshufd TMP4, TMP5, 78
        vpxor   TMP4, TMP4, TMP5
        vpclmulqdq      TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
        vmovdqu         TMP4, XMMWORD PTR[0*16 + Htbl]
        vpclmulqdq      TMP1, TMP5, TMP4, 011h
        vpclmulqdq      TMP2, TMP5, TMP4, 000h

        vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
        ROUNDMUL 1
        NEXTCTR 0
        vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
        ROUNDMUL 2
        NEXTCTR 1
        vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
        ROUNDMUL 3
        NEXTCTR 2
        vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
        ROUNDMUL 4
        NEXTCTR 3
        vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
        ROUNDMUL 5
        NEXTCTR 4
        vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
        ROUNDMUL 6
        NEXTCTR 5
        vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]
        ROUNDMUL 7
        NEXTCTR 6

        ROUND   8
        NEXTCTR 7

        vpxor   TMP0, TMP0, TMP1
        vpxor   TMP0, TMP0, TMP2
        vpsrldq TMP3, TMP0, 8
        vpxor   TMP4, TMP1, TMP3
        vpslldq TMP3, TMP0, 8
        vpxor   T, TMP2, TMP3

        vpclmulqdq      TMP1, T, XMMWORD PTR[Lpoly], 010h
        vpalignr        T, T, T, 8
        vpxor           T, T, TMP1

        ROUND   9

        vpclmulqdq      TMP1, T, XMMWORD PTR[Lpoly], 010h
        vpalignr        T, T, T, 8
        vpxor           T, T, TMP1

        vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f

        ROUND   10
        ROUND   11
        vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f

        ROUND   12
        ROUND   13
        vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
        vpxor           TMP3, TMP5, XMMWORD PTR[0*16 + PT]
        vaesenclast     CTR0, CTR0, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[1*16 + PT]
        vaesenclast     CTR1, CTR1, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[2*16 + PT]
        vaesenclast     CTR2, CTR2, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[3*16 + PT]
        vaesenclast     CTR3, CTR3, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[4*16 + PT]
        vaesenclast     CTR4, CTR4, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[5*16 + PT]
        vaesenclast     CTR5, CTR5, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[6*16 + PT]
        vaesenclast     CTR6, CTR6, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[7*16 + PT]
        vaesenclast     CTR7, CTR7, TMP3

        vmovdqu XMMWORD PTR[0*16 + CT], CTR0
        vpshufb CTR0, CTR0, BSWAPMASK
        vmovdqu XMMWORD PTR[1*16 + CT], CTR1
        vpshufb CTR1, CTR1, BSWAPMASK
        vmovdqu XMMWORD PTR[2*16 + CT], CTR2
        vpshufb CTR2, CTR2, BSWAPMASK
        vmovdqu XMMWORD PTR[3*16 + CT], CTR3
        vpshufb CTR3, CTR3, BSWAPMASK
        vmovdqu XMMWORD PTR[4*16 + CT], CTR4
        vpshufb CTR4, CTR4, BSWAPMASK
        vmovdqu XMMWORD PTR[5*16 + CT], CTR5
        vpshufb CTR5, CTR5, BSWAPMASK
        vmovdqu XMMWORD PTR[6*16 + CT], CTR6
        vpshufb CTR6, CTR6, BSWAPMASK
        vmovdqu XMMWORD PTR[7*16 + CT], CTR7
        vpshufb TMP5, CTR7, BSWAPMASK

        vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
        vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
        vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
        vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
        vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
        vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
        vmovdqa XMMWORD PTR[7*16 + rsp], CTR0

        vpxor   T, T, TMP4

        lea     CT, [8*16 + CT]
        lea     PT, [8*16 + PT]
        jmp     LEncDataOctets

LEndEncOctets:

        vpshufd TMP4, TMP5, 78
        vpxor   TMP4, TMP4, TMP5
        vpclmulqdq      TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
        vmovdqu         TMP4, XMMWORD PTR[0*16 + Htbl]
        vpclmulqdq      TMP1, TMP5, TMP4, 011h
        vpclmulqdq      TMP2, TMP5, TMP4, 000h

        vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
        KARATSUBA 1
        vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
        KARATSUBA 2
        vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
        KARATSUBA 3
        vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
        KARATSUBA 4
        vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
        KARATSUBA 5
        vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
        KARATSUBA 6
        vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]
        KARATSUBA 7

        vpxor   TMP0, TMP0, TMP1
        vpxor   TMP0, TMP0, TMP2
        vpsrldq TMP3, TMP0, 8
        vpxor   TMP4, TMP1, TMP3
        vpslldq TMP3, TMP0, 8
        vpxor   T, TMP2, TMP3

        vpclmulqdq      TMP1, T, XMMWORD PTR[Lpoly], 010h
        vpalignr        T, T, T, 8
        vpxor           T, T, TMP1

        vpclmulqdq      TMP1, T, XMMWORD PTR[Lpoly], 010h
        vpalignr        T, T, T, 8
        vpxor           T, T, TMP1

        vpxor   T, T, TMP4

        sub     aluCTR, 7
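        ; The octet path advanced aluCTR once per prepared lane, so it
        ; ends up tracking the highest "top" counter (lane 7); pull it
        ; back to match the pending lane-0 counter before the
        ; single-block path continues.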

LEncDataSingles:

        cmp     len, 16
        jb      LEncDataTail
        sub     len, 16

        vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
        NEXTCTR 0

        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2
        vpxor   TMP1, TMP1, XMMWORD PTR[PT]
        vmovdqu XMMWORD PTR[CT], TMP1

        lea     PT, [16+PT]
        lea     CT, [16+CT]

        vpshufb TMP1, TMP1, BSWAPMASK
        vpxor   T, T, TMP1
        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4

        jmp     LEncDataSingles

LEncDataTail:

        test    len, len
        jz      LEncDataEnd

        vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]

        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2
        ; zero a temp location
        vpxor   TMP2, TMP2, TMP2
        vmovdqa XMMWORD PTR[rsp], TMP2
        ; copy as many bytes as needed
        xor     KS, KS
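        ; The key-schedule pointer is no longer needed, so KS is reused
        ; as the byte index for the partial-block copy below.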

@@:
        cmp     len, KS
        je      @f
        mov     al, [PT + KS]
        mov     [rsp + KS], al
        inc     KS
        jmp     @b
@@:
        vpxor   TMP1, TMP1, XMMWORD PTR[rsp]
        vmovdqa XMMWORD PTR[rsp], TMP1
        xor     KS, KS
@@:
        cmp     len, KS
        je      @f
        mov     al, [rsp + KS]
        mov     [CT + KS], al
        inc     KS
        jmp     @b
@@:
        cmp     KS, 16
        je      @f
        mov     BYTE PTR[rsp + KS], 0
        inc     KS
        jmp     @b
@@:
BAIL:
        vmovdqa TMP1, XMMWORD PTR[rsp]
        vpshufb TMP1, TMP1, BSWAPMASK
        vpxor   T, T, TMP1
        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4

LEncDataEnd:

        vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
        bswap   aluCTR
        mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

        mov     rsp, rbp

        vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
        vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
        vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
        vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
        vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
        vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
        vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
        vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
        vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
        vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]

        add     rsp, 10*16
        pop     rbp
        pop     r13
        pop     r12
        pop     r11

        vzeroupper

        ret
intel_aes_gcmENC ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Decrypt and Authenticate
; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmDEC PROC

NEXTCTR MACRO i
        add     aluCTR, 1
        mov     aluTMP, aluCTR
        xor     aluTMP, aluKSl
        bswap   aluTMP
        mov     [3*4 + i*16 + rsp], aluTMP
ENDM
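; NEXTCTR is redefined for the decrypt path: its counter blocks live at
; the bottom of the (smaller) 8*16-byte frame, so the 8*16 offset used
; on the encrypt side is dropped.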

PT      textequ <rdx>
CT      textequ <rcx>

        test    len, len
        jnz     LbeginDEC
        ret

LbeginDEC:
        vzeroupper
        push    r11
        push    r12
        push    r13
        push    rbp
        sub     rsp, 10*16
        vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
        vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
        vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
        vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
        vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
        vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
        vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
        vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
        vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
        vmovdqu XMMWORD PTR[rsp + 9*16], xmm15

        mov     rbp, rsp
        sub     rsp, 8*16
        and     rsp, -16

        vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
        vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
        vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
        mov     KS, [16*16 + 3*16 + Gctx]
        mov     NR, [4 + KS]
        lea     KS, [48 + KS]

        vpshufb CTR0, CTR0, BSWAPMASK

        mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
        mov     aluKSl, [3*4 + KS]
        bswap   aluCTR
        bswap   aluKSl

        vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
        vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
        vmovdqu XMMWORD PTR[0*16 + rsp], TMP0

        cmp     len, 128
        jb      LDecDataSingles
        ; Prepare the "top" counters
        vmovdqu XMMWORD PTR[1*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[2*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[3*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[4*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[5*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[6*16 + rsp], TMP0
        vmovdqu XMMWORD PTR[7*16 + rsp], TMP0

        NEXTCTR 1
        NEXTCTR 2
        NEXTCTR 3
        NEXTCTR 4
        NEXTCTR 5
        NEXTCTR 6
        NEXTCTR 7

LDecDataOctets:
        cmp     len, 128
        jb      LEndDecOctets
        sub     len, 128

        vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
        vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
        vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
        vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
        vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
        vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
        vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
        vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]

        vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        vpshufd TMP4, TMP5, 78
        vpxor   TMP4, TMP4, TMP5
        vpclmulqdq      TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
        vmovdqu         TMP4, XMMWORD PTR[0*16 + Htbl]
        vpclmulqdq      TMP1, TMP5, TMP4, 011h
        vpclmulqdq      TMP2, TMP5, TMP4, 000h

        vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 1
        NEXTCTR 0
        vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 2
        NEXTCTR 1
        vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 3
        NEXTCTR 2
        vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 4
        NEXTCTR 3
        vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 5
        NEXTCTR 4
        vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 6
        NEXTCTR 5
        vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        vpxor   TMP5, TMP5, T
        ROUNDMUL 7
        NEXTCTR 6

        ROUND   8
        NEXTCTR 7
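
        ; Decryption hashes the ciphertext it just read, so unlike the
        ; encrypt path there is nothing to defer: the same eight blocks
        ; are multiplied into T while their counters are being
        ; encrypted.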

        vpxor   TMP0, TMP0, TMP1
        vpxor   TMP0, TMP0, TMP2
        vpsrldq TMP3, TMP0, 8
        vpxor   TMP4, TMP1, TMP3
        vpslldq TMP3, TMP0, 8
        vpxor   T, TMP2, TMP3

        vpclmulqdq      TMP1, T, XMMWORD PTR[Lpoly], 010h
        vpalignr        T, T, T, 8
        vpxor           T, T, TMP1

        ROUND   9

        vpclmulqdq      TMP1, T, XMMWORD PTR[Lpoly], 010h
        vpalignr        T, T, T, 8
        vpxor           T, T, TMP1

        vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f

        ROUND   10
        ROUND   11
        vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f

        ROUND   12
        ROUND   13
        vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
        vpxor           TMP3, TMP5, XMMWORD PTR[0*16 + CT]
        vaesenclast     CTR0, CTR0, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[1*16 + CT]
        vaesenclast     CTR1, CTR1, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[2*16 + CT]
        vaesenclast     CTR2, CTR2, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[3*16 + CT]
        vaesenclast     CTR3, CTR3, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[4*16 + CT]
        vaesenclast     CTR4, CTR4, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[5*16 + CT]
        vaesenclast     CTR5, CTR5, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[6*16 + CT]
        vaesenclast     CTR6, CTR6, TMP3
        vpxor           TMP3, TMP5, XMMWORD PTR[7*16 + CT]
        vaesenclast     CTR7, CTR7, TMP3

        vmovdqu XMMWORD PTR[0*16 + PT], CTR0
        vmovdqu XMMWORD PTR[1*16 + PT], CTR1
        vmovdqu XMMWORD PTR[2*16 + PT], CTR2
        vmovdqu XMMWORD PTR[3*16 + PT], CTR3
        vmovdqu XMMWORD PTR[4*16 + PT], CTR4
        vmovdqu XMMWORD PTR[5*16 + PT], CTR5
        vmovdqu XMMWORD PTR[6*16 + PT], CTR6
        vmovdqu XMMWORD PTR[7*16 + PT], CTR7

        vpxor   T, T, TMP4

        lea     CT, [8*16 + CT]
        lea     PT, [8*16 + PT]
        jmp     LDecDataOctets

LEndDecOctets:

        sub     aluCTR, 7

LDecDataSingles:

        cmp     len, 16
        jb      LDecDataTail
        sub     len, 16

        vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
        NEXTCTR 0

        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2

        vmovdqu TMP2, XMMWORD PTR[CT]
        vpxor   TMP1, TMP1, TMP2
        vmovdqu XMMWORD PTR[PT], TMP1

        lea     PT, [16+PT]
        lea     CT, [16+CT]

        vpshufb TMP2, TMP2, BSWAPMASK
        vpxor   T, T, TMP2
        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4

        jmp     LDecDataSingles

LDecDataTail:

        test    len, len
        jz      LDecDataEnd

        vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
        inc     aluCTR
        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2
        ; copy as many bytes as needed
        xor     KS, KS
@@:
        cmp     len, KS
        je      @f
        mov     al, [CT + KS]
        mov     [rsp + KS], al
        inc     KS
        jmp     @b
@@:
        cmp     KS, 16
        je      @f
        mov     BYTE PTR[rsp + KS], 0
        inc     KS
        jmp     @b
@@:
        vmovdqa TMP2, XMMWORD PTR[rsp]
        vpshufb TMP2, TMP2, BSWAPMASK
        vpxor   T, T, TMP2
        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   T, T, TMP0, TMP5, TMP2, TMP3, TMP4

        vpxor   TMP1, TMP1, XMMWORD PTR[rsp]
        vmovdqa XMMWORD PTR[rsp], TMP1
        xor     KS, KS
@@:
        cmp     len, KS
        je      @f
        mov     al, [rsp + KS]
        mov     [PT + KS], al
        inc     KS
        jmp     @b
@@:

LDecDataEnd:

        vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
        bswap   aluCTR
        mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

        mov     rsp, rbp

        vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
        vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
        vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
        vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
        vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
        vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
        vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
        vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
        vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
        vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]

        add     rsp, 10*16
        pop     rbp
        pop     r13
        pop     r12
        pop     r11

        vzeroupper

        ret
intel_aes_gcmDEC ENDP

END