mirror of
https://github.com/rn10950/RetroZilla.git
synced 2024-11-15 04:00:12 +01:00
30d33aa8e8
9934c8faef29, 3c3b381c4865, 5a67f6beee9a, 1b1eb6d77728, a8b668fd72f7, bug962760, bug743700, bug857304, bug972653, bug972450, bug971358, bug903885, bug977073, bug976111, bug949939, bug947653, bug947572, bug903885, bug979106, bug966596, bug979004, bug979752, bug980848, bug938369, bug981170, bug668130, bug974693, bug975056, bug979132, bug370717, bug979070, bug985070, bug900067, bug977673, bug519255, bug989558, bug557299, bug987263, bug369802, a751a5146718, bug992343, bug952572, bug979703, bug994883, bug994869, bug993489, bug984608, bug977869, bug667371, bug672828, bug793347, bug977869
1210 lines
31 KiB
NASM
1210 lines
31 KiB
NASM
; LICENSE:
|
|
; This submission to NSS is to be made available under the terms of the
|
|
; Mozilla Public License, v. 2.0. You can obtain one at http:
|
|
; //mozilla.org/MPL/2.0/.
|
|
;###############################################################################
|
|
; Copyright(c) 2014, Intel Corp.
|
|
; Developers and authors:
|
|
; Shay Gueron and Vlad Krasnov
|
|
; Intel Corporation, Israel Development Centre, Haifa, Israel
|
|
; Please send feedback directly to crypto.feedback.alias@intel.com
|
|
|
|
|
|
; Flat 32-bit memory model, C language type for PROC symbols.
.MODEL FLAT, C
; Let the assembler accept XMM (SIMD) instructions.
.XMM

.DATA
ALIGN 16
; Increments applied with vpaddd to the byte-reversed counter block
; (only the lowest dword is non-zero).
Lone            dq 1, 0
Ltwo            dq 2, 0
; vpshufb control that reverses the order of the 16 bytes of a register.
Lbswap_mask     db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
; 0fh in every byte. Not referenced by any code visible in this file.
Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
; Constant used by the vpclmulqdq folding steps of the GHASH reduction,
; and (masked) by the H*2 computation in intel_aes_gcmINIT.
Lpoly           dq 01h, 0c200000000000000h

.CODE
|
|
|
|
|
|
;-------------------------------------------------------------------------------
; GFMUL DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
;
; DST = SRC1 * SRC2: a 128x128-bit carry-less multiplication done with three
; vpclmulqdq (Karatsuba), whose 256-bit result is folded twice against Lpoly
; and collapsed into 128 bits.
;
; DST is written by the very last instruction only, so it may be the same
; register as a source (every call site in this file passes DST = SRC1).
; The four temporaries are clobbered; they must be distinct registers and must
; not be the same register as SRC1 or SRC2.
;-------------------------------------------------------------------------------
GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
    ; Karatsuba operands: low qword of each = high half xor low half
    vpshufd     TMP2, SRC2, 78              ; 78 = 01001110b swaps the two qwords
    vpxor       TMP2, TMP2, SRC2
    vpshufd     TMP3, SRC1, 78
    vpxor       TMP3, TMP3, SRC1

    ; the three partial products
    vpclmulqdq  TMP4, SRC2, SRC1, 011h      ; high * high
    vpclmulqdq  TMP1, SRC2, SRC1, 0h        ; low  * low
    vpclmulqdq  TMP2, TMP2, TMP3, 0h        ; middle term before correction

    vpxor       TMP2, TMP2, TMP4
    vpxor       TMP2, TMP2, TMP1

    ; split the middle term over the two 128-bit halves of the product
    vpslldq     TMP3, TMP2, 8               ; must read the middle term before it is shifted
    vpsrldq     TMP2, TMP2, 8
    vpxor       TMP4, TMP4, TMP2            ; upper 128 bits of the product
    vpxor       TMP1, TMP1, TMP3            ; lower 128 bits of the product

    ; first fold of the lower half
    vpshufd     TMP3, TMP1, 78
    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpxor       TMP1, TMP3, TMP2

    ; second fold of the lower half
    vpshufd     TMP3, TMP1, 78
    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpxor       TMP1, TMP3, TMP2

    ; folded lower half combined with the upper half
    vpxor       DST, TMP4, TMP1
ENDM
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;
|
|
; Generates the final GCM tag
|
|
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
|
|
; unsigned char *Tp,
|
|
; unsigned int Mlen,
|
|
; unsigned int Alen,
|
|
; unsigned char* X0,
|
|
; unsigned char* TAG);
|
|
;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
ALIGN 16
|
|
;-------------------------------------------------------------------------------
; intel_aes_gcmTAG(Htbl, Tp, Mlen, Alen, X0, TAG)
;
; Reads the 16-byte hash state at Tp, xors in the two lengths converted from
; bytes to bits, multiplies by the first Htbl entry, byte-reverses the result,
; xors it with the 16 bytes at X0 and stores 16 bytes to TAG.
;
; Stack arguments as seen after ebx has been pushed:
;   [esp +  8] Htbl    [esp + 12] Tp     [esp + 16] Mlen
;   [esp + 20] Alen    [esp + 24] X0     [esp + 28] TAG
; ebx is saved and restored; eax, ecx, edx and xmm0-xmm5 are overwritten.
;-------------------------------------------------------------------------------
intel_aes_gcmTAG PROC

Htbl    textequ <eax>
Tp      textequ <ecx>
X0      textequ <edx>
TAG     textequ <ebx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    push    ebx

    vzeroupper

    ; Length block: Mlen in dword 0, Alen in dword 2, each qword then
    ; multiplied by 8 (bytes to bits).
    vpxor   TMP0, TMP0, TMP0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 16], 0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 20], 2
    vpsllq  TMP0, TMP0, 3

    ; Fold the length block into the running hash state.
    mov     Tp, [esp + 12]
    vmovdqu T, XMMWORD PTR[Tp]
    vpxor   T, TMP0, T

    ; One more multiplication by the first table entry.
    mov     Htbl, [esp + 8]
    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    ; Back to memory byte order, mask with X0, write the tag.
    vpshufb T, T, [Lbswap_mask]
    mov     X0, [esp + 24]
    vpxor   T, T, [X0]
    mov     TAG, [esp + 28]
    vmovdqu XMMWORD PTR[TAG], T
    vzeroupper

    pop     ebx
    ret

intel_aes_gcmTAG ENDP
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;
|
|
; Generates the H table
|
|
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
|
|
;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
ALIGN 16
|
|
;-------------------------------------------------------------------------------
; intel_aes_gcmINIT(Htbl, KS, NR)
;
; Encrypts an all-zero block with the NR-round key schedule at KS, doubles the
; byte-reversed result, and fills Htbl:
;   Htbl[0..7]   the doubled value and its next seven GFMUL powers
;   Htbl[8..15]  for each of those, (high qword xor low qword) in both halves,
;                the operand consumed by the Karatsuba middle product
;
; No register is pushed, so arguments sit at [esp + 4], [esp + 8], [esp + 12].
; eax, ecx, edx and xmm0-xmm5 are overwritten.
;-------------------------------------------------------------------------------
intel_aes_gcmINIT PROC

Htbl    textequ <eax>
KS      textequ <ecx>
NR      textequ <edx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    mov     NR,   [esp + 12]
    mov     KS,   [esp + 8]
    mov     Htbl, [esp + 4]

    vzeroupper

    ; AES-ENC(0): a zero block xored with round key 0 is round key 0 itself.
    vmovdqu T, XMMWORD PTR[KS]
    lea     KS, [KS + 16]
    dec     NR
Lenc_loop:
    vaesenc T, T, [KS]
    lea     KS, [KS + 16]
    dec     NR
    jnz     Lenc_loop

    vaesenclast T, T, [KS]
    vpshufb T, T, [Lbswap_mask]

    ; Calculate H` = GFMUL(H, 2):
    ; a 128-bit shift left by one, xored with Lpoly when the top bit was set.
    vpslld  xmm4, T, 1                  ; every dword shifted left by one
    vpsrad  xmm5, T, 31                 ; sign of every dword replicated over the dword
    vpshufd xmm5, xmm5, 0ffh            ; broadcast the top dword: mask for the top bit of T
    vpand   xmm5, xmm5, [Lpoly]
    vpsrld  xmm3, T, 31                 ; bit shifted out of every dword ...
    vpslldq xmm3, xmm3, 4               ; ... carried into the next higher dword
    vpxor   T, xmm3, xmm4
    vpxor   T, xmm5, T

    ; Entry 0 and its Karatsuba companion.
    vmovdqu XMMWORD PTR[Htbl + 0*16], T
    vmovdqu TMP0, T                     ; kept as the constant multiplier for the powers below
    vpshufd xmm2, T, 78
    vpxor   xmm2, T, xmm2
    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    ; Entries 1..7: multiply by entry 0 once more each time.
    hpow = 1
    REPEAT 7
        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
        vmovdqu XMMWORD PTR[Htbl + hpow*16], T
        vpshufd xmm2, T, 78
        vpxor   xmm2, T, xmm2
        vmovdqu XMMWORD PTR[Htbl + 8*16 + hpow*16], xmm2
        hpow = hpow + 1
    ENDM

    vzeroupper
    ret

intel_aes_gcmINIT ENDP
|
|
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;
|
|
; Authenticate only
|
|
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
|
|
;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
ALIGN 16
|
|
;-------------------------------------------------------------------------------
; intel_aes_gcmAAD(Htbl, AAD, Alen, Tp)
;
; Hashes Alen bytes at AAD into the 16-byte state at Tp, using the table built
; by intel_aes_gcmINIT. Returns immediately when Alen is zero.
;
; Alen must be a multiple of 16: the prefix counter is stepped by 16 and
; compared against zero, so any other value never terminates that loop.
;
; Blocks are processed eight at a time; when the block count is not a multiple
; of eight, the first (count mod 8) blocks are hashed first.
;
; Between batches the hash is carried as an unreduced pair (Xhi, T). The
; reduced value is  Xhi xor fold(fold(T)),  where fold is the
; vpclmulqdq/vpalignr/vpxor step against Lpoly.
;
; Fixes relative to the previous revision:
;  * The zero-length guard compared the dword at [esp + 11] (offset written as
;    1*3 + 2*4): one byte of the AAD pointer plus three bytes of Alen. It now
;    tests Alen itself at [esp + 12].
;  * The incoming state was loaded into T with Xhi = 0. When there is no prefix
;    (Alen a multiple of 128) that state was folded twice before being used,
;    which is only harmless for an all-zero state. The state is now carried in
;    Xhi with T = 0, whose reduction is the state itself.
;
; ebx and esi are saved and restored; eax, ecx, edx, xmm0-xmm7 are overwritten.
;-------------------------------------------------------------------------------
intel_aes_gcmAAD PROC

Htbl    textequ <eax>
inp     textequ <ecx>
len     textequ <edx>
Tp      textequ <ebx>
hlp0    textequ <esi>

DATA    textequ <xmm0>
T       textequ <xmm1>
TMP0    textequ <xmm2>
TMP1    textequ <xmm3>
TMP2    textequ <xmm4>
TMP3    textequ <xmm5>
TMP4    textequ <xmm6>
Xhi     textequ <xmm7>

; Accumulate DATA times table entry i into the three Karatsuba accumulators:
; TMP0 gets low*low, TMP1 gets high*high, TMP2 gets the middle product.
; TMP3 is scratch.
KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM

    ; Nothing pushed yet: return address at [esp], Alen is the third argument.
    cmp     DWORD PTR[esp + 1*4 + 2*4], 0
    jnz     LbeginAAD
    ret

LbeginAAD:
    push    ebx
    push    esi

    ; Two pushes plus the return address precede the arguments.
    mov     Htbl,   [esp + 4*3 + 0*4]
    mov     inp,    [esp + 4*3 + 1*4]
    mov     len,    [esp + 4*3 + 2*4]
    mov     Tp,     [esp + 4*3 + 3*4]

    vzeroupper

    ; Incoming state as an (Xhi, T) pair that reduces to the state itself.
    vmovdqu Xhi, XMMWORD PTR[Tp]
    vpxor   T, T, T

    ; we hash 8 blocks each iteration; if the total number of blocks is not a
    ; multiple of 8, we hash the first n%8 blocks first
    mov     hlp0, len
    and     hlp0, 128-1
    jz      Lmod_loop

    and     len, -128               ; bytes left for the 8-block loop
    sub     hlp0, 16                ; byte offset of the table entry for the first prefix block

    ; Prefix block: the only one that absorbs the incoming state
    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, Xhi

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h

    lea     inp, [inp+16]           ; lea leaves the flags alone
    test    hlp0, hlp0
    jnz     Lpre_loop
    jmp     Lred1

    ; hash remaining prefix blocks (up to 7 total prefix blocks)
Lpre_loop:

    sub     hlp0, 16

    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
    vpxor       TMP2, TMP2, TMP3

    test    hlp0, hlp0
    lea     inp, [inp+16]           ; flags from the test survive the lea
    jnz     Lpre_loop

Lred1:

    ; Karatsuba fixup: assemble the 256-bit product into (Xhi, T)
    vpxor   TMP2, TMP2, TMP0
    vpxor   TMP2, TMP2, TMP1
    vpsrldq TMP3, TMP2, 8
    vpslldq TMP2, TMP2, 8

    vpxor   Xhi, TMP1, TMP3
    vpxor   T, TMP0, TMP2

Lmod_loop:

    sub     len, 16*8
    jb      Ldone

    ; The reduction of the pair left by the previous batch (or by the prefix,
    ; or the initial state) is interleaved with the multiplications of blocks
    ; #0..#6; only block #7 (the first in memory) needs the reduced value.

    ; Block #0
    vmovdqu DATA, XMMWORD PTR[inp + 16*7]
    vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask]

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h

    ; Block #1
    vmovdqu DATA, XMMWORD PTR[inp + 16*6]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 1

    ; Block #2
    vmovdqu DATA, XMMWORD PTR[inp + 16*5]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h          ;reduction stage 1a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 2

    vpxor       T, T, TMP4                      ;reduction stage 1b

    ; Block #3
    vmovdqu DATA, XMMWORD PTR[inp + 16*4]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 3

    ; Block #4
    vmovdqu DATA, XMMWORD PTR[inp + 16*3]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h          ;reduction stage 2a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 4

    vpxor       T, T, TMP4                      ;reduction stage 2b

    ; Block #5
    vmovdqu DATA, XMMWORD PTR[inp + 16*2]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 5

    vpxor       T, T, Xhi                       ;reduction finalize

    ; Block #6
    vmovdqu DATA, XMMWORD PTR[inp + 16*1]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 6

    ; Block #7
    vmovdqu DATA, XMMWORD PTR[inp + 16*0]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T
    KARATSUBA_AAD 7

    ; Aggregated 8 blocks, now karatsuba fixup
    vpxor   TMP2, TMP2, TMP0
    vpxor   TMP2, TMP2, TMP1
    vpsrldq TMP3, TMP2, 8
    vpslldq TMP2, TMP2, 8

    vpxor   Xhi, TMP1, TMP3
    vpxor   T, TMP0, TMP2

    lea     inp, [inp + 16*8]
    jmp     Lmod_loop

Ldone:
    ; Reduce the final pair and write the state back.
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor       T, T, Xhi
    vmovdqu     XMMWORD PTR[Tp], T
    vzeroupper

    pop     esi
    pop     ebx
    ret

intel_aes_gcmAAD ENDP
|
|
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;
|
|
; Encrypt and Authenticate
|
|
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
|
|
;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
ALIGN 16
|
|
;-------------------------------------------------------------------------------
; intel_aes_gcmENC(PT, CT, Gctx, len)
;
; CTR-encrypts len bytes from PT to CT and hashes the produced ciphertext into
; the state kept in Gctx. Returns immediately when len is zero.
;
; Gctx fields as addressed by this code (byte offsets):
;   0            hash table (Htbl and Gctx are the same register)
;   16*16 + 16   hash state T (always accessed in memory)
;   16*16 + 32   counter block; its last dword is the big-endian block counter
;   16*16 + 48   pointer p; round keys are read from p + 44, the round count
;                from p + 4
;
; Processing: seven blocks at a time while at least 7*16 bytes remain (the hash
; of one batch is interleaved with the AES rounds of the next), then single
; blocks, then one final partial block that is hashed zero-padded to 16 bytes.
;
; len is the caller-pushed stack slot and is decremented in place.
;
; Stack scratch (16-byte aligned, 16*16 bytes):
;   [esp + 0*16]            byte staging buffer for the partial block
;   [esp + 1*16 .. 6*16]    byte-reversed ciphertext blocks awaiting hashing
;   [esp + 8*16 .. 14*16]   the next seven counter blocks, already xored with
;                           round key 0
;
; Fix relative to the previous revision: the counter written back to Gctx was
; incremented unconditionally at LEncDataEnd. That is right only when a partial
; tail block consumed a counter value; for lengths that are a multiple of 16 it
; skipped one value. The increment now happens in the tail path, matching
; intel_aes_gcmDEC.
;
; ebp, ebx, esi, edi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
; overwritten. The text equates and the ROUND / KARATSUBA macros defined here
; are also used by intel_aes_gcmDEC further down.
;-------------------------------------------------------------------------------
intel_aes_gcmENC PROC

PT      textequ <eax>
CT      textequ <ecx>
Htbl    textequ <edx>
Gctx    textequ <edx>
len     textequ <DWORD PTR[ebp + 5*4 + 3*4]>
KS      textequ <esi>
NR      textequ <DWORD PTR[-40 + KS]>

aluCTR  textequ <ebx>
aluTMP  textequ <edi>

T       textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]>
TMP0    textequ <xmm1>
TMP1    textequ <xmm2>
TMP2    textequ <xmm3>
TMP3    textequ <xmm4>
TMP4    textequ <xmm5>
TMP5    textequ <xmm6>

; The counter lanes share registers with TMP0-TMP5 (CTR1 is TMP0, and so on).
CTR0    textequ <xmm0>
CTR1    textequ <xmm1>
CTR2    textequ <xmm2>
CTR3    textequ <xmm3>
CTR4    textequ <xmm4>
CTR5    textequ <xmm5>
CTR6    textequ <xmm6>

; One AES round with round key i on all seven lanes. Clobbers xmm7.
ROUND MACRO i
    vmovdqu xmm7, XMMWORD PTR[i*16 + KS]
    vaesenc CTR0, CTR0, xmm7
    vaesenc CTR1, CTR1, xmm7
    vaesenc CTR2, CTR2, xmm7
    vaesenc CTR3, CTR3, xmm7
    vaesenc CTR4, CTR4, xmm7
    vaesenc CTR5, CTR5, xmm7
    vaesenc CTR6, CTR6, xmm7
ENDM

; Accumulate TMP5 times table entry i: TMP0 gets the middle product, TMP1 the
; high*high product, TMP2 the low*low product. TMP3 and TMP4 are scratch.
KARATSUBA MACRO i
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

; Advance the scalar counter and patch counter slot i on the stack with its
; big-endian form xored with the matching dword of round key 0.
NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [3*4 + KS]
    mov     [3*4 + 8*16 + i*16 + esp], aluTMP
ENDM

    ; Nothing pushed yet: len is the fourth argument above the return address.
    cmp     DWORD PTR[1*4 + 3*4 + esp], 0
    jne     LbeginENC
    ret

LbeginENC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov     ebp, esp
    sub     esp, 16*16
    and     esp, -16                ; vmovdqa is used on this scratch area

    ; Four pushes plus the return address precede the arguments.
    mov     PT, [ebp + 5*4 + 0*4]
    mov     CT, [ebp + 5*4 + 1*4]
    mov     Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]
    lea     KS, [44 + KS]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR                  ; scalar counter in host byte order

    ; Counter slot 0 = counter block xor round key 0
    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0

    cmp     len, 16*7
    jb      LEncDataSingles

; Prepare the "top" counters: slots 1..6 start as copies of slot 0, only their
; last dword is patched later by NEXTCTR
    vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0

    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]

; Encrypt the initial 7 blocks; their counters are built with vector adds on the
; byte-reversed counter block
    sub     len, 16*7
    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqu xmm7, XMMWORD PTR[0*16 + KS]
    vpxor   CTR0, CTR0, xmm7
    vpxor   CTR1, CTR1, xmm7
    vpxor   CTR2, CTR2, xmm7
    vpxor   CTR3, CTR3, xmm7
    vpxor   CTR4, CTR4, xmm7
    vpxor   CTR5, CTR5, xmm7
    vpxor   CTR6, CTR6, xmm7

    ROUND   1

    ; While these blocks go through the rounds, fill the counter slots for the
    ; following batch: slot 0 gets counter + 7, NEXTCTR 1..6 the six after it.
    add     aluCTR, 7
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [KS + 3*4]
    mov     [8*16 + 0*16 + 3*4 + esp], aluTMP

    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6
    ROUND   8
    ROUND   9

    ; Select the last round key according to the round count (10, 12, else 14).
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6

    ; Keep the byte-reversed ciphertext for hashing: the newest block stays in
    ; TMP5, the older ones are spilled, oldest in slot 6.
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa XMMWORD PTR[6*16 + esp], CTR0

    lea     CT, [7*16 + CT]
    lea     PT, [7*16 + PT]
    ; falls through into the steady-state loop

LEncData7:
    cmp     len, 16*7
    jb      LEndEnc7
    sub     len, 16*7

    ; Hash the seven ciphertext blocks of the previous batch. The newest one
    ; (in TMP5) pairs with table entry 0, the oldest with entry 6 after the
    ; running state T has been xored into it.
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
    vpxor   TMP5, TMP5, T
    KARATSUBA 6

    ; Karatsuba fixup: TMP4 = upper half, TMP5 = lower half of the product
    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    ; Two folds of the lower half, then combine with the upper half
    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpxor       TMP5, TMP5, TMP4
    vmovdqu     T, TMP5

    ; Encrypt the next seven blocks from the prepared counter slots.
    vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp]
    vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
    vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
    vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
    vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
    vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
    vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp]

    ROUND   1
    NEXTCTR 0
    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6

    ROUND   8
    ROUND   9

    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa XMMWORD PTR[6*16 + esp], CTR0

    lea     CT, [7*16 + CT]
    lea     PT, [7*16 + PT]
    jmp     LEncData7

LEndEnc7:

    ; Hash the last batch of seven ciphertext blocks (same sequence as at the
    ; top of the loop, with no further encryption behind it).
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
    vpxor   TMP5, TMP5, T
    KARATSUBA 6

    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpxor       TMP5, TMP5, TMP4
    vmovdqu     T, TMP5

    ; The scalar counter ran six slots ahead of slot 0 while the slots for a
    ; batch that was never encrypted were filled; bring it back so that it
    ; matches the counter stored in slot 0.
    sub     aluCTR, 6

LEncDataSingles:

    cmp     len, 16
    jb      LEncDataTail
    sub     len, 16

    ; Take the counter block from slot 0 and queue the next counter there.
    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
    NEXTCTR 0

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    vpxor   TMP1, TMP1, XMMWORD PTR[PT]
    vmovdqu XMMWORD PTR[CT], TMP1

    lea     PT, [16+PT]
    lea     CT, [16+CT]

    ; Hash the ciphertext block just written.
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    jmp     LEncDataSingles

LEncDataTail:

    cmp     len, 0
    je      LEncDataEnd

    ; Partial final block: it uses the counter waiting in slot 0, so account
    ; for it in the scalar counter that is written back at LEncDataEnd.
    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
    inc     aluCTR

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2

; zero a temp location
    vpxor   TMP2, TMP2, TMP2
    vmovdqa XMMWORD PTR[esp], TMP2

; copy as many bytes as needed
; (KS is reused as the byte index from here on; dl is the byte shuttle, so edx,
;  which holds Gctx/Htbl, is parked in aluTMP meanwhile)
    xor     KS, KS
    mov     aluTMP, edx
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[PT + KS]
    mov     BYTE PTR[esp + KS], dl
    inc     KS
    jmp     @b
@@:
    ; Encrypt the staged bytes and copy exactly len of them out.
    vpxor   TMP1, TMP1, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], TMP1
    xor     KS, KS
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[esp + KS]
    mov     BYTE PTR[CT + KS], dl
    inc     KS
    jmp     @b
@@:
    ; Zero the bytes past len so the hashed block is the zero-padded ciphertext.
    cmp     KS, 16
    je      @f
    mov     BYTE PTR[esp + KS], 0
    inc     KS
    jmp     @b
@@:
    mov     edx, aluTMP
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

LEncDataEnd:
    ; aluCTR is the first counter value not used yet; store it back big-endian.
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov     esp, ebp
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp

    vzeroupper

    ret
intel_aes_gcmENC ENDP
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;
|
|
; Decrypt and Authenticate
|
|
; void intel_aes_gcmDEC(uint8_t* CT, uint8_t* PT, void *Gctx, unsigned int len);
|
|
;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
; NEXTCTR i  (redefinition used by intel_aes_gcmDEC)
; Advances the scalar counter by one and patches the last dword of counter
; slot i with its byte-swapped value xored with the last dword of round key 0.
; Unlike the earlier definition, the counter slots start at [esp + 0].
; Clobbers aluTMP and the flags.
NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [KS + 3*4]
    mov     [esp + i*16 + 3*4], aluTMP
ENDM
|
|
|
|
;-------------------------------------------------------------------------------
; intel_aes_gcmDEC: the first stack argument is loaded as CT (read), the
; second as PT (written), the third as Gctx, the fourth is len.
;
; Hashes len bytes of ciphertext into the state kept in Gctx and CTR-decrypts
; them to PT. Returns immediately when len is zero. Every ciphertext block is
; read for hashing before the matching plaintext is stored.
;
; Relies on the text equates (PT, CT, Gctx/Htbl, len, KS, NR, aluCTR, aluTMP,
; T, TMPn, CTRn) and on the ROUND and KARATSUBA macros defined in
; intel_aes_gcmENC, and on the NEXTCTR variant defined just above.
;
; Stack scratch (16-byte aligned, 8*16 bytes):
;   [esp + 0*16 .. 6*16]  seven counter blocks already xored with round key 0;
;                         slot 0 doubles as the staging buffer of the tail
;
; ebp, ebx, esi, edi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
; overwritten.
;-------------------------------------------------------------------------------
intel_aes_gcmDEC PROC

    ; Nothing pushed yet: len is the fourth argument above the return address.
    cmp     DWORD PTR[1*4 + 3*4 + esp], 0
    jne     LbeginDEC
    ret

LbeginDEC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov     ebp, esp
    sub     esp, 8*16
    and     esp, -16                ; vmovdqa is used on this scratch area

    mov     Gctx, [ebp + 5*4 + 2*4]
    mov     PT, [ebp + 5*4 + 1*4]
    mov     CT, [ebp + 5*4 + 0*4]

    mov     KS, [16*16 + 3*16 + Gctx]
    lea     KS, [44 + KS]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR                  ; scalar counter in host byte order

    ; Counter slot 0 = counter block xor round key 0
    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[0*16 + esp], TMP0

    cmp     len, 16*7
    jb      LDecDataSingles

    ; Slots 1..6 start as copies of slot 0; NEXTCTR patches their last dword.
    vmovdqu XMMWORD PTR[1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[6*16 + esp], TMP0
    ; Step back by one so that the first NEXTCTR 0 rewrites slot 0 with the
    ; current counter.
    dec     aluCTR

LDecData7:
    cmp     len, 16*7
    jb      LDecData7End
    sub     len, 16*7

    ; Hash seven ciphertext blocks; the first one absorbs the running state and
    ; pairs with table entry 6, the last one pairs with entry 0. The counter
    ; slots for this batch are filled in between.
    vmovdqu     TMP5, XMMWORD PTR[0*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    vpxor       TMP5, TMP5, T
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP5, TMP4
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[6*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    NEXTCTR 0
    vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 5

    NEXTCTR 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 4

    NEXTCTR 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 3

    NEXTCTR 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 2

    NEXTCTR 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 1

    NEXTCTR 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 0

    NEXTCTR 6

    ; Karatsuba fixup: TMP4 = upper half, TMP5 = lower half of the product
    vpxor   TMP0, TMP1, TMP0
    vpxor   TMP0, TMP2, TMP0
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP3, TMP1
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP3, TMP2

    ; Two folds of the lower half, then combine with the upper half
    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP1, TMP5

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP1, TMP5

    vpxor       TMP5, TMP4, TMP5
    vmovdqu     T, TMP5

    ; Run the seven counter blocks through AES.
    vmovdqa CTR0, XMMWORD PTR[0*16 + esp]
    vmovdqa CTR1, XMMWORD PTR[1*16 + esp]
    vmovdqa CTR2, XMMWORD PTR[2*16 + esp]
    vmovdqa CTR3, XMMWORD PTR[3*16 + esp]
    vmovdqa CTR4, XMMWORD PTR[4*16 + esp]
    vmovdqa CTR5, XMMWORD PTR[5*16 + esp]
    vmovdqa CTR6, XMMWORD PTR[6*16 + esp]

    ROUND   1
    ROUND   2
    ROUND   3
    ROUND   4
    ROUND   5
    ROUND   6
    ROUND   7
    ROUND   8
    ROUND   9

    ; Select the last round key according to the round count (10, 12, else 14).
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    ; keystream xor ciphertext = plaintext
    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + CT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + CT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + CT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + CT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + CT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + CT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + CT]

    vmovdqu XMMWORD PTR[0*16 + PT], CTR0
    vmovdqu XMMWORD PTR[1*16 + PT], CTR1
    vmovdqu XMMWORD PTR[2*16 + PT], CTR2
    vmovdqu XMMWORD PTR[3*16 + PT], CTR3
    vmovdqu XMMWORD PTR[4*16 + PT], CTR4
    vmovdqu XMMWORD PTR[5*16 + PT], CTR5
    vmovdqu XMMWORD PTR[6*16 + PT], CTR6

    lea     PT, [7*16 + PT]
    lea     CT, [7*16 + CT]
    jmp     LDecData7

LDecData7End:

    ; Put the next unused counter into slot 0 for the single-block code.
    NEXTCTR 0

LDecDataSingles:

    cmp     len, 16
    jb      LDecDataTail
    sub     len, 16

    ; Hash the ciphertext block first ...
    vmovdqu TMP1, XMMWORD PTR[CT]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    ; ... then decrypt it with the counter waiting in slot 0 and queue the next.
    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
    NEXTCTR 0

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    vpxor   TMP1, TMP1, XMMWORD PTR[CT]
    vmovdqu XMMWORD PTR[PT], TMP1

    lea     CT, [16+CT]
    lea     PT, [16+PT]
    jmp     LDecDataSingles

LDecDataTail:

    cmp     len, 0
    je      LDecDataEnd

    ; Partial final block: uses the counter in slot 0 and consumes it.
    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
    inc     aluCTR
    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    ; The keystream block is parked in xmm7, which GFMUL below does not touch.
    vaesenclast xmm7, TMP1, TMP2

; copy as many bytes as needed
; (KS is reused as the byte index from here on; dl is the byte shuttle, so edx,
;  which holds Gctx/Htbl, is parked in aluTMP meanwhile)
    xor     KS, KS
    mov     aluTMP, edx
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[CT + KS]
    mov     BYTE PTR[esp + KS], dl
    inc     KS
    jmp     @b
@@:
    ; Zero the bytes past len so the hashed block is the zero-padded ciphertext.
    cmp     KS, 16
    je      @f
    mov     BYTE PTR[esp + KS], 0
    inc     KS
    jmp     @b
@@:
    mov     edx, aluTMP
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    ; Decrypt the staged bytes in place and copy exactly len of them out.
    vpxor   xmm7, xmm7, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], xmm7
    xor     KS, KS
    mov     aluTMP, edx
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[esp + KS]
    mov     BYTE PTR[PT + KS], dl
    inc     KS
    jmp     @b
@@:
    mov     edx, aluTMP

LDecDataEnd:

    ; Store the scalar counter back into the counter block, big-endian.
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov     esp, ebp
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp

    vzeroupper

    ret
intel_aes_gcmDEC ENDP
|
|
|
|
|
|
END
|