mirror of
https://github.com/rn10950/RetroZilla.git
synced 2024-11-13 11:10:13 +01:00
1210 lines
31 KiB
NASM
1210 lines
31 KiB
NASM
|
; LICENSE:
|
||
|
; This submission to NSS is to be made available under the terms of the
|
||
|
; Mozilla Public License, v. 2.0. You can obtain one at http:
|
||
|
; //mozilla.org/MPL/2.0/.
|
||
|
;###############################################################################
|
||
|
; Copyright(c) 2014, Intel Corp.
|
||
|
; Developers and authors:
|
||
|
; Shay Gueron and Vlad Krasnov
|
||
|
; Intel Corporation, Israel Development Centre, Haifa, Israel
|
||
|
; Please send feedback directly to crypto.feedback.alias@intel.com
|
||
|
|
||
|
|
||
|
; Assembler configuration: 32-bit flat memory model with the C language type,
; XMM register support enabled.
.MODEL FLAT, C
.XMM

.DATA
ALIGN 16
; Per-dword addends used with vpaddd on a byte-reversed counter block:
; +1 / +2 in the lowest dword, zero everywhere else.
Lone            dq 1,0
Ltwo            dq 2,0
; vpshufb control that reverses the byte order of a 16-byte block.
Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
; Low-nibble mask for every byte.  No routine visible in this file uses it.
Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
; Reduction constant.  Each reduction fold multiplies by its high quadword
; (vpclmulqdq ..., [Lpoly], 010h); intel_aes_gcmINIT additionally uses the whole
; value as a conditional XOR mask.
Lpoly           dq 01h, 0c200000000000000h

.CODE
|
||
|
|
||
|
|
||
|
GFMUL MACRO RES, OPA, OPB, ZLO, ZMID, ZAUX, ZHI
    ; RES = OPA * OPB: a 128x128 carry-less multiply done with three
    ; vpclmulqdq (Karatsuba), then folded back to 128 bits by two
    ; multiplications with the high quadword of Lpoly.
    ;   RES              destination xmm register; written only by the last
    ;                    instruction, so it may be the same register as OPA
    ;   OPA, OPB         source xmm registers, never written by the macro
    ;   ZLO,ZMID,ZAUX,ZHI four scratch xmm registers, all overwritten

    ; low*low and high*high partial products
    vpclmulqdq  ZLO,  OPB, OPA, 0h
    vpclmulqdq  ZHI,  OPB, OPA, 011h

    ; (high ^ low) of each operand; shuffle control 78 swaps the quadwords
    vpshufd     ZMID, OPB, 78
    vpshufd     ZAUX, OPA, 78
    vpxor       ZMID, ZMID, OPB
    vpxor       ZAUX, ZAUX, OPA

    ; middle Karatsuba term
    vpclmulqdq  ZMID, ZMID, ZAUX, 0h
    vpxor       ZMID, ZMID, ZLO
    vpxor       ZMID, ZMID, ZHI

    ; spread the middle term over the 256-bit product ZHI:ZLO
    vpslldq     ZAUX, ZMID, 8
    vpsrldq     ZMID, ZMID, 8

    vpxor       ZLO,  ZLO, ZAUX
    vpxor       ZHI,  ZHI, ZMID

    ; first fold of the low half
    vpclmulqdq  ZMID, ZLO, [Lpoly], 010h
    vpshufd     ZAUX, ZLO, 78
    vpxor       ZLO,  ZMID, ZAUX

    ; second fold of the low half
    vpclmulqdq  ZMID, ZLO, [Lpoly], 010h
    vpshufd     ZAUX, ZLO, 78
    vpxor       ZLO,  ZMID, ZAUX

    ; combine with the high half
    vpxor       RES,  ZLO, ZHI

ENDM
|
||
|
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
;
|
||
|
; Generates the final GCM tag
|
||
|
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
|
||
|
; unsigned char *Tp,
|
||
|
; unsigned int Mlen,
|
||
|
; unsigned int Alen,
|
||
|
; unsigned char* X0,
|
||
|
; unsigned char* TAG);
|
||
|
;
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
|
||
|
ALIGN 16
|
||
|
intel_aes_gcmTAG PROC
;
; Stack arguments (cdecl), as named by the prototype above:
;   0 Htbl, 1 Tp, 2 Mlen, 3 Alen, 4 X0, 5 TAG
; Computes
;   TAG = byteswap( GFMUL(T ^ L, Htbl[0]) ) ^ X0
; where T is the 16 bytes at Tp and L holds Mlen*8 in its low quadword and
; Alen*8 in its high quadword.  The memory at Tp is not modified.
; Uses eax, ecx, edx, xmm0-xmm5; ebx is saved and restored.
;

Htbl    textequ <eax>
Tp      textequ <ecx>
X0      textequ <edx>
TAG     textequ <ebx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    push    ebx

    ; one saved register + return address = 2*4 bytes in front of the arguments
    mov     TAG,    [esp + 2*4 + 5*4]
    mov     X0,     [esp + 2*4 + 4*4]
    mov     Tp,     [esp + 2*4 + 1*4]
    mov     Htbl,   [esp + 2*4 + 0*4]

    vzeroupper

    ; length block: byte counts inserted as dwords 0 and 2, then converted to
    ; bit counts with a per-quadword shift by 3
    vpxor   TMP0, TMP0, TMP0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 2*4], 0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2
    vpsllq  TMP0, TMP0, 3

    vmovdqu T, XMMWORD PTR[Tp]
    vpxor   T, T, TMP0

    ; one last multiplication by the first table entry
    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    ; back to memory byte order, mask with the block at X0, write the tag
    vpshufb T, T, [Lbswap_mask]
    vpxor   T, T, [X0]
    vmovdqu XMMWORD PTR[TAG], T
    vzeroupper

    pop     ebx

    ret

intel_aes_gcmTAG ENDP
|
||
|
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
;
|
||
|
; Generates the H table
|
||
|
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
|
||
|
;
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
|
||
|
ALIGN 16
|
||
|
intel_aes_gcmINIT PROC
;
; Stack arguments (cdecl): 0 Htbl, 1 KS (round keys, 16 bytes each), 2 NR.
; Encrypts the all-zero block with the NR-round key schedule, byte-reverses and
; doubles the result, then writes 16 table entries:
;   Htbl[i]     (i = 0..7)  successive GFMUL powers of the doubled value
;   Htbl[8 + i]             (high quadword ^ low quadword) of Htbl[i], the
;                           operand the Karatsuba middle product needs
; Uses eax, ecx, edx and xmm0-xmm5; no callee-saved register is touched.
;

Htbl    textequ <eax>
KS      textequ <ecx>
NR      textequ <edx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    mov     NR,     [esp + 4*1 + 2*4]
    mov     KS,     [esp + 4*1 + 1*4]
    mov     Htbl,   [esp + 4*1 + 0*4]

    vzeroupper

    ; AES-ENC(0): whitening a zero block leaves just round key 0
    vmovdqu T, XMMWORD PTR[KS]
    lea     KS, [16 + KS]
    dec     NR                      ; NR-1 middle rounds remain
Linit_round:
    vaesenc T, T, [KS]
    lea     KS, [16 + KS]
    dec     NR
    jnz     Linit_round

    vaesenclast T, T, [KS]
    vpshufb T, T, [Lbswap_mask]

    ;Calculate H` = GFMUL(H, 2)
    ; xmm5 = Lpoly if the top bit of T is set, else zero
    vpsrad  xmm5, T, 31
    vpshufd xmm5, xmm5, 0ffh
    vpand   xmm5, xmm5, [Lpoly]
    ; 128-bit left shift by one: per-dword shift plus carries moved up a dword
    vpslld  xmm4, T, 1
    vpsrld  xmm3, T, 31
    vpslldq xmm3, xmm3, 4
    vpxor   T, xmm4, xmm3
    vpxor   T, T, xmm5

    ; TMP0 keeps the first power as the constant multiplier
    vmovdqu TMP0, T
    vmovdqu XMMWORD PTR[Htbl + 0*16], T

    vpshufd xmm2, T, 78
    vpxor   xmm2, xmm2, T
    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    ; entries 1..7: multiply by the first power again each time
    i = 1
    REPEAT 7
    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
    vmovdqu XMMWORD PTR[Htbl + i*16], T
    vpshufd xmm2, T, 78
    vpxor   xmm2, xmm2, T
    vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
    i = i+1
    ENDM

    vzeroupper
    ret
intel_aes_gcmINIT ENDP
|
||
|
|
||
|
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
;
|
||
|
; Authenticate only
|
||
|
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
|
||
|
;
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
|
||
|
ALIGN 16
|
||
|
intel_aes_gcmAAD PROC
;
; Stack arguments (cdecl): 0 Htbl, 1 AAD (input), 2 Alen (bytes), 3 Tp.
; Folds Alen bytes of input into the 16-byte running hash at Tp (read on entry,
; rewritten on exit).  Returns immediately when Alen is zero.
;
; The prefix loop counts down in steps of 16 and stops only on exactly zero,
; so Alen has to be a multiple of 16.
;
; Between groups the hash is kept unreduced as the 256-bit pair (Xhi:T); its
; reduced value is  fold(fold(T)) ^ Xhi.  Inside the 8-block loop that
; reduction is interleaved with the multiplications of the next group.
;
; Changes relative to the previous revision:
;   * the zero-length guard now reads Alen at esp+12; it used to read a
;     misaligned dword at esp+11 that mixed in the top byte of the AAD pointer
;   * when there is no prefix group the incoming (already reduced) hash is
;     seeded as (Xhi:T) = (hash:0); seeding it as (0:hash) ran a reduced
;     value through the folds again and corrupted any non-zero hash
;
; Uses eax, ecx, edx, xmm0-xmm7; ebx and esi are saved and restored.
;

Htbl    textequ <eax>
inp     textequ <ecx>
len     textequ <edx>
Tp      textequ <ebx>
hlp0    textequ <esi>

DATA    textequ <xmm0>
T       textequ <xmm1>
TMP0    textequ <xmm2>
TMP1    textequ <xmm3>
TMP2    textequ <xmm4>
TMP3    textequ <xmm5>
TMP4    textequ <xmm6>
Xhi     textequ <xmm7>

; Accumulate DATA * Htbl[i]: TMP0 ^= low product, TMP1 ^= high product,
; TMP2 ^= Karatsuba middle product.  Clobbers TMP3.
KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM

    ; nothing pushed yet: return address (4 bytes) + two arguments precede Alen
    cmp     DWORD PTR[esp + 1*4 + 2*4], 0
    jnz     LbeginAAD
    ret

LbeginAAD:
    push    ebx
    push    esi

    mov     Htbl,   [esp + 4*3 + 0*4]
    mov     inp,    [esp + 4*3 + 1*4]
    mov     len,    [esp + 4*3 + 2*4]
    mov     Tp,     [esp + 4*3 + 3*4]

    vzeroupper

    vmovdqu T, XMMWORD PTR[Tp]

    ; we hash 8 blocks each iteration; if the total number of blocks is not a
    ; multiple of 8, we hash the first n%8 blocks first
    mov     hlp0, len
    and     hlp0, 128-1
    jnz     LprefixAAD

    ; No prefix group.  Present the reduced hash so that the lazy reduction
    ; (fold(fold(T)) ^ Xhi) hands it back unchanged.
    vmovdqa Xhi, T
    vpxor   T, T, T
    jmp     Lmod_loop

LprefixAAD:
    and     len, -128               ; bytes left for the 8-block loop
    sub     hlp0, 16                ; table offset for the first prefix block

    ; Prefix block: the running hash is folded into the first block
    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h

    lea     inp, [inp+16]
    test    hlp0, hlp0
    jz      Lred1

    ; hash remaining prefix blocks (up to 7 total prefix blocks)
Lpre_loop:

    sub     hlp0, 16

    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
    vpxor       TMP2, TMP2, TMP3

    test    hlp0, hlp0
    lea     inp, [inp+16]           ; lea keeps the flags from test intact
    jnz     Lpre_loop

Lred1:

    ; Karatsuba fixup: turn the three accumulators into the pair (Xhi:T)
    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2

Lmod_loop:

    sub     len, 16*8
    jb      Ldone

    ; Block #0 (last block of the group, lowest table entry)
    vmovdqu DATA, XMMWORD PTR[inp + 16*7]
    vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask]

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h

    ; Block #1
    vmovdqu DATA, XMMWORD PTR[inp + 16*6]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 1

    ; Block #2
    vmovdqu DATA, XMMWORD PTR[inp + 16*5]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h          ;reduction stage 1a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 2

    vpxor       T, T, TMP4                      ;reduction stage 1b

    ; Block #3
    vmovdqu DATA, XMMWORD PTR[inp + 16*4]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 3

    ; Block #4
    vmovdqu DATA, XMMWORD PTR[inp + 16*3]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h          ;reduction stage 2a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 4

    vpxor       T, T, TMP4                      ;reduction stage 2b

    ; Block #5
    vmovdqu DATA, XMMWORD PTR[inp + 16*2]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 5

    vpxor       T, T, Xhi                       ;reduction finalize

    ; Block #6
    vmovdqu DATA, XMMWORD PTR[inp + 16*1]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 6

    ; Block #7 (first block of the group): gets the reduced previous hash
    vmovdqu DATA, XMMWORD PTR[inp + 16*0]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T
    KARATSUBA_AAD 7

    ; Aggregated 8 blocks, now karatsuba fixup
    vpxor   TMP2, TMP2, TMP0
    vpxor   TMP2, TMP2, TMP1
    vpsrldq TMP3, TMP2, 8
    vpslldq TMP2, TMP2, 8

    vpxor   Xhi, TMP1, TMP3
    vpxor   T, TMP0, TMP2

    lea     inp, [inp + 16*8]
    jmp     Lmod_loop

Ldone:
    ; final reduction of (Xhi:T) and write-back
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor       T, T, Xhi
    vmovdqu     XMMWORD PTR[Tp], T
    vzeroupper

    pop     esi
    pop     ebx
    ret

intel_aes_gcmAAD ENDP
|
||
|
|
||
|
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
;
|
||
|
; Encrypt and Authenticate
|
||
|
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
|
||
|
;
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
|
||
|
ALIGN 16
|
||
|
intel_aes_gcmENC PROC
;
; Stack arguments (cdecl): 0 PT (input), 1 CT (output), 2 Gctx, 3 len (bytes).
; Encrypts len bytes in counter mode and folds every produced ciphertext block
; into the running hash T.  Returns immediately when len is zero.
;
; Context fields as addressed here (byte offsets from Gctx):
;   0              Htbl: 8 hash-key powers, then their Karatsuba halves
;   16*16 + 1*16   T, the running hash (read and rewritten in memory)
;   16*16 + 2*16   counter block; its last dword is the big-endian counter
;   16*16 + 3*16   pointer P; round key 0 is at P+44 and the round count
;                  (10, 12 or anything else meaning 14) at P+4, i.e. KS-40
;
; Stack frame (esp is 16-byte aligned, 16*16 bytes reserved):
;   [0*16]           scratch for the partial tail block
;   [1*16 .. 6*16]   byte-reversed ciphertext of blocks 5..0 awaiting GHASH
;                    (the 7th block stays in TMP5)
;   [8*16 .. 14*16]  the next seven counter blocks, already XORed with
;                    round key 0
;
; Main flow: encrypt 7 blocks, then loop { hash the previous 7, encrypt the
; next 7 }, hash the last 7, then single blocks, then the partial tail.
;
; Change relative to the previous revision: the counter is advanced for the
; partial tail block inside the tail path.  It used to be incremented
; unconditionally at LEncDataEnd, so a length that was a multiple of 16 left
; the stored counter one past the next unused value, unlike intel_aes_gcmDEC.
;
; The text macros and the ROUND / KARATSUBA macros defined here are reused by
; intel_aes_gcmDEC further down.
;

PT      textequ <eax>
CT      textequ <ecx>
Htbl    textequ <edx>
Gctx    textequ <edx>
len     textequ <DWORD PTR[ebp + 5*4 + 3*4]>
KS      textequ <esi>
NR      textequ <DWORD PTR[-40 + KS]>

aluCTR  textequ <ebx>
aluTMP  textequ <edi>

T       textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]>
TMP0    textequ <xmm1>
TMP1    textequ <xmm2>
TMP2    textequ <xmm3>
TMP3    textequ <xmm4>
TMP4    textequ <xmm5>
TMP5    textequ <xmm6>

CTR0    textequ <xmm0>
CTR1    textequ <xmm1>
CTR2    textequ <xmm2>
CTR3    textequ <xmm3>
CTR4    textequ <xmm4>
CTR5    textequ <xmm5>
CTR6    textequ <xmm6>

; One AES round with round key i on all seven counter blocks.  Clobbers xmm7.
ROUND MACRO i
    vmovdqu xmm7, XMMWORD PTR[i*16 + KS]
    vaesenc CTR0, CTR0, xmm7
    vaesenc CTR1, CTR1, xmm7
    vaesenc CTR2, CTR2, xmm7
    vaesenc CTR3, CTR3, xmm7
    vaesenc CTR4, CTR4, xmm7
    vaesenc CTR5, CTR5, xmm7
    vaesenc CTR6, CTR6, xmm7
ENDM

; Accumulate TMP5 * Htbl[i]: TMP0 ^= middle product, TMP1 ^= high product,
; TMP2 ^= low product.  Clobbers TMP3 and TMP4.
KARATSUBA MACRO i
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

; Advance the counter and patch the last dword of counter slot i with
; (big-endian counter) XOR (last dword of round key 0).
NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [3*4 + KS]
    mov     [3*4 + 8*16 + i*16 + esp], aluTMP
ENDM

; Hash the seven pending ciphertext blocks (TMP5 plus stack slots 1..6) into T.
; T is XORed into the oldest block, which is multiplied by Htbl[6].
ENC_GHASH7 MACRO
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu     TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA   1
    vmovdqu     TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA   2
    vmovdqu     TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA   3
    vmovdqu     TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA   4
    vmovdqu     TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA   5
    vmovdqu     TMP5, XMMWORD PTR[6*16 + esp]
    vpxor       TMP5, TMP5, T
    KARATSUBA   6

    ; Karatsuba fixup: TMP4 = high half, TMP5 = low half of the product
    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3
    vpslldq     TMP3, TMP0, 8
    vpxor       TMP5, TMP2, TMP3

    ; two reduction folds of the low half, then combine with the high half
    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpxor       TMP5, TMP5, TMP4
    vmovdqu     T, TMP5
ENDM

; After ROUND 9: run the remaining rounds for the configured key size, XOR the
; keystream with 7 input blocks, store them, keep byte-reversed copies for the
; next GHASH pass (CTR6 in TMP5, the rest on the stack) and advance PT / CT.
ENC_FINISH7 MACRO
    LOCAL   last_key
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      last_key

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      last_key

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
last_key:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa XMMWORD PTR[6*16 + esp], CTR0

    lea     CT, [7*16 + CT]
    lea     PT, [7*16 + PT]
ENDM

; TMP1 holds a whitened counter block on entry and one keystream block on
; exit.  Clobbers TMP2.
ENC_KEYSTREAM1 MACRO
    LOCAL   last_key
    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      last_key
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      last_key
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
last_key:
    vaesenclast TMP1, TMP1, TMP2
ENDM

    ; nothing pushed yet: return address + three arguments precede len
    cmp     DWORD PTR[1*4 + 3*4 + esp], 0
    jne     LbeginENC
    ret

LbeginENC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov     ebp, esp
    sub     esp, 16*16
    and     esp, -16

    ; four saved registers + return address = 5*4 bytes before the arguments
    mov     PT, [ebp + 5*4 + 0*4]
    mov     CT, [ebp + 5*4 + 1*4]
    mov     Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]
    lea     KS, [44 + KS]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR

    ; counter slot 0 = counter block XOR round key 0
    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0

    cmp     len, 16*7
    jb      LEncDataSingles

    ; Prepare the "top" counters: slots 1..6 start as copies of slot 0, only
    ; their last dword is patched later
    i = 1
    WHILE i LT 7
    vmovdqu XMMWORD PTR[8*16 + i*16 + esp], TMP0
    i = i+1
    ENDM

    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]

    ; Encrypt the initial 7 blocks
    sub     len, 16*7
    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqu xmm7, XMMWORD PTR[0*16 + KS]
    vpxor   CTR0, CTR0, xmm7
    vpxor   CTR1, CTR1, xmm7
    vpxor   CTR2, CTR2, xmm7
    vpxor   CTR3, CTR3, xmm7
    vpxor   CTR4, CTR4, xmm7
    vpxor   CTR5, CTR5, xmm7
    vpxor   CTR6, CTR6, xmm7

    ROUND   1

    ; slot 0 gets counter+7, the first block of the next group
    add     aluCTR, 7
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [KS + 3*4]
    mov     [8*16 + 0*16 + 3*4 + esp], aluTMP

    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6
    ROUND   8
    ROUND   9

    ENC_FINISH7
    ; falls through into the steady-state loop

LEncData7:
    cmp     len, 16*7
    jb      LEndEnc7
    sub     len, 16*7

    ; hash the 7 blocks produced by the previous pass
    ENC_GHASH7

    vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp]
    vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
    vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
    vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
    vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
    vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
    vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp]

    ; the counters for the following group are prepared between the rounds
    ROUND   1
    NEXTCTR 0
    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6

    ROUND   8
    ROUND   9

    ENC_FINISH7
    jmp     LEncData7

LEndEnc7:

    ; hash the last group of 7
    ENC_GHASH7

    ; slots 1..6 were prepared but never used: back to the value in slot 0
    sub     aluCTR, 6

LEncDataSingles:

    cmp     len, 16
    jb      LEncDataTail
    sub     len, 16

    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
    NEXTCTR 0

    ENC_KEYSTREAM1
    vpxor   TMP1, TMP1, XMMWORD PTR[PT]
    vmovdqu XMMWORD PTR[CT], TMP1

    lea     PT, [16+PT]
    lea     CT, [16+CT]

    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    jmp     LEncDataSingles

LEncDataTail:

    cmp     len, 0
    je      LEncDataEnd

    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
    inc     aluCTR                  ; the partial block consumes one counter

    ENC_KEYSTREAM1

    ; zero a temp location
    vpxor   TMP2, TMP2, TMP2
    vmovdqa XMMWORD PTR[esp], TMP2

    ; copy as many bytes as needed; KS becomes the byte index and dl the
    ; transfer byte, so Gctx (edx) is parked in aluTMP meanwhile
    xor     KS, KS
    mov     aluTMP, edx
LEncTailLoad:
    cmp     len, KS
    je      LEncTailXor
    mov     dl, BYTE PTR[PT + KS]
    mov     BYTE PTR[esp + KS], dl
    inc     KS
    jmp     LEncTailLoad
LEncTailXor:
    vpxor   TMP1, TMP1, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], TMP1
    xor     KS, KS
LEncTailStore:
    cmp     len, KS
    je      LEncTailPad
    mov     dl, BYTE PTR[esp + KS]
    mov     BYTE PTR[CT + KS], dl
    inc     KS
    jmp     LEncTailStore
LEncTailPad:
    ; clear the keystream bytes beyond len so the hashed block is zero padded
    cmp     KS, 16
    je      LEncTailHash
    mov     BYTE PTR[esp + KS], 0
    inc     KS
    jmp     LEncTailPad
LEncTailHash:
    mov     edx, aluTMP
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

LEncDataEnd:
    ; write the next unused counter value back, big-endian
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov     esp, ebp
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp

    vzeroupper

    ret
intel_aes_gcmENC ENDP
|
||
|
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
;
|
||
|
; Decrypt and Authenticate
|
||
|
; void intel_aes_gcmDEC(uint8_t* CT, uint8_t* PT, void *Gctx, unsigned int len);
|
||
|
;
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
|
||
|
|
||
|
; Redefinition for the decrypt routine, whose counter slots start at [esp]
; instead of [esp + 8*16]: advance the counter and patch the last dword of
; counter slot `slot` with (big-endian counter) XOR (last dword of round key 0).
; Clobbers aluTMP and the flags.
NEXTCTR MACRO slot
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [3*4 + KS]
    mov     [3*4 + slot*16 + esp], aluTMP
ENDM
|
||
|
|
||
|
intel_aes_gcmDEC PROC
;
; Stack arguments (cdecl): argument 0 is loaded into CT and read as the input,
; argument 1 is loaded into PT and written as the output, 2 Gctx, 3 len.
; Hashes each input block into the running hash T and decrypts it in counter
; mode.  Returns immediately when len is zero.
;
; Relies on the text macros (PT, CT, Gctx, Htbl, len, KS, NR, aluCTR, aluTMP,
; T, TMP0-TMP5, CTR0-CTR6) and the ROUND / KARATSUBA macros set up by
; intel_aes_gcmENC, and on the NEXTCTR redefinition just above.
;
; Stack frame (esp is 16-byte aligned, 8*16 bytes reserved):
;   [0*16 .. 6*16]  the next seven counter blocks, already XORed with round
;                   key 0; slot 0 doubles as scratch for the partial tail
;

; TMP1 holds a whitened counter block on entry; one keystream block is
; written to OUTREG.  Clobbers TMP1 and TMP2.
DEC_KEYSTREAM1 MACRO OUTREG
    LOCAL   last_key
    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      last_key
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      last_key
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
last_key:
    vaesenclast OUTREG, TMP1, TMP2
ENDM

    ; nothing pushed yet: return address + three arguments precede len
    cmp     DWORD PTR[1*4 + 3*4 + esp], 0
    jne     LbeginDEC
    ret

LbeginDEC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov     ebp, esp
    sub     esp, 8*16
    and     esp, -16

    mov     CT, [ebp + 5*4 + 0*4]
    mov     PT, [ebp + 5*4 + 1*4]
    mov     Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]
    lea     KS, [44 + KS]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR

    ; counter slot 0 = counter block XOR round key 0
    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[0*16 + esp], TMP0

    cmp     len, 16*7
    jb      LDecDataSingles

    ; slots 1..6 start as copies of slot 0; NEXTCTR patches their last dword
    i = 1
    WHILE i LT 7
    vmovdqu XMMWORD PTR[i*16 + esp], TMP0
    i = i+1
    ENDM
    dec     aluCTR                  ; NEXTCTR 0 below re-creates the current value

LDecData7:
    cmp     len, 16*7
    jb      LDecData7End
    sub     len, 16*7

    ; GHASH over the 7 input blocks; the first one absorbs T and is
    ; multiplied by Htbl[6].  Counter slots are refreshed in between.
    vmovdqu     TMP5, XMMWORD PTR[0*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    vpxor       TMP5, TMP5, T
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[6*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    NEXTCTR     0
    vmovdqu     TMP5, XMMWORD PTR[1*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   5
    NEXTCTR     1
    vmovdqu     TMP5, XMMWORD PTR[2*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   4
    NEXTCTR     2
    vmovdqu     TMP5, XMMWORD PTR[3*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   3
    NEXTCTR     3
    vmovdqu     TMP5, XMMWORD PTR[4*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   2
    NEXTCTR     4
    vmovdqu     TMP5, XMMWORD PTR[5*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   1
    NEXTCTR     5
    vmovdqu     TMP5, XMMWORD PTR[6*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   0
    NEXTCTR     6

    ; Karatsuba fixup: TMP4 = high half, TMP5 = low half of the product
    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3
    vpslldq     TMP3, TMP0, 8
    vpxor       TMP5, TMP2, TMP3

    ; two reduction folds of the low half, then combine with the high half
    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpxor       TMP5, TMP5, TMP4
    vmovdqu     T, TMP5

    ; keystream for the same 7 blocks
    vmovdqa CTR0, XMMWORD PTR[0*16 + esp]
    vmovdqa CTR1, XMMWORD PTR[1*16 + esp]
    vmovdqa CTR2, XMMWORD PTR[2*16 + esp]
    vmovdqa CTR3, XMMWORD PTR[3*16 + esp]
    vmovdqa CTR4, XMMWORD PTR[4*16 + esp]
    vmovdqa CTR5, XMMWORD PTR[5*16 + esp]
    vmovdqa CTR6, XMMWORD PTR[6*16 + esp]

    ROUND   1
    ROUND   2
    ROUND   3
    ROUND   4
    ROUND   5
    ROUND   6
    ROUND   7
    ROUND   8
    ROUND   9
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      LDec7LastKey

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      LDec7LastKey

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
LDec7LastKey:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + CT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + CT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + CT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + CT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + CT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + CT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + CT]

    vmovdqu XMMWORD PTR[0*16 + PT], CTR0
    vmovdqu XMMWORD PTR[1*16 + PT], CTR1
    vmovdqu XMMWORD PTR[2*16 + PT], CTR2
    vmovdqu XMMWORD PTR[3*16 + PT], CTR3
    vmovdqu XMMWORD PTR[4*16 + PT], CTR4
    vmovdqu XMMWORD PTR[5*16 + PT], CTR5
    vmovdqu XMMWORD PTR[6*16 + PT], CTR6

    lea     CT, [7*16 + CT]
    lea     PT, [7*16 + PT]
    jmp     LDecData7

LDecData7End:

    ; bring slot 0 up to the first counter value not yet used
    NEXTCTR 0

LDecDataSingles:

    cmp     len, 16
    jb      LDecDataTail
    sub     len, 16

    ; hash the input block first ...
    vmovdqu TMP1, XMMWORD PTR[CT]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    ; ... then decrypt it
    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
    NEXTCTR 0

    DEC_KEYSTREAM1 TMP1
    vpxor   TMP1, TMP1, XMMWORD PTR[CT]
    vmovdqu XMMWORD PTR[PT], TMP1

    lea     PT, [16+PT]
    lea     CT, [16+CT]
    jmp     LDecDataSingles

LDecDataTail:

    cmp     len, 0
    je      LDecDataEnd

    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
    inc     aluCTR                  ; the partial block consumes one counter
    DEC_KEYSTREAM1 xmm7             ; keystream stays in xmm7 across the GHASH

    ; copy as many bytes as needed; KS becomes the byte index and dl the
    ; transfer byte, so Gctx (edx) is parked in aluTMP meanwhile
    xor     KS, KS
    mov     aluTMP, edx
LDecTailLoad:
    cmp     len, KS
    je      LDecTailPad
    mov     dl, BYTE PTR[CT + KS]
    mov     BYTE PTR[esp + KS], dl
    inc     KS
    jmp     LDecTailLoad
LDecTailPad:
    ; zero-fill the rest of the scratch block
    cmp     KS, 16
    je      LDecTailHash
    mov     BYTE PTR[esp + KS], 0
    inc     KS
    jmp     LDecTailPad
LDecTailHash:
    mov     edx, aluTMP
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    ; decrypt the padded block in place on the stack, then copy len bytes out
    vpxor   xmm7, xmm7, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], xmm7
    xor     KS, KS
    mov     aluTMP, edx
LDecTailStore:
    cmp     len, KS
    je      LDecTailDone
    mov     dl, BYTE PTR[esp + KS]
    mov     BYTE PTR[PT + KS], dl
    inc     KS
    jmp     LDecTailStore
LDecTailDone:
    mov     edx, aluTMP

LDecDataEnd:

    ; write the counter back, big-endian
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov     esp, ebp
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp

    vzeroupper

    ret
intel_aes_gcmDEC ENDP
|
||
|
|
||
|
|
||
|
END
|