; Mirror of https://github.com/rn10950/RetroZilla.git
; Synced 2024-11-14 03:30:17 +01:00, commit 30d33aa8e8 (branch head 9934c8faef29;
; related heads/tags: 3c3b381c4865, 5a67f6beee9a, 1b1eb6d77728, a8b668fd72f7,
; bug962760, bug743700, bug857304, bug972653, bug972450, bug971358, bug903885,
; bug977073, bug976111, bug949939, bug947653, bug947572, bug979106, bug966596,
; bug979004, bug979752, bug980848, bug938369, bug981170, bug668130, bug974693,
; bug975056, bug979132, bug370717, bug979070, bug985070, bug900067, bug977673,
; bug519255, bug989558, bug557299, bug987263, bug369802, a751a5146718, bug992343,
; bug952572, bug979703, bug994883, bug994869, bug993489, bug984608, bug977869,
; bug667371, bug672828, bug793347)
; 972 lines, 21 KiB. Upstream labels this file "NASM", but the syntax below
; (.DATA/.CODE, PROC/ENDP, MACRO/ENDM, textequ) is MASM (ml64).
; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com
.DATA
|
|
ALIGN 16
|
|
Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
|
|
Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
|
|
Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
|
|
Lcon1 dd 1,1,1,1
|
|
Lcon2 dd 1bh,1bh,1bh,1bh
|
|
|
|
.CODE
|
|
|
|
ctx textequ <rcx>
|
|
output textequ <rdx>
|
|
input textequ <r8>
|
|
inputLen textequ <r9d>
|
|
|
|
|
|
aes_rnd MACRO i
|
|
movdqu xmm8, [i*16 + ctx]
|
|
aesenc xmm0, xmm8
|
|
aesenc xmm1, xmm8
|
|
aesenc xmm2, xmm8
|
|
aesenc xmm3, xmm8
|
|
aesenc xmm4, xmm8
|
|
aesenc xmm5, xmm8
|
|
aesenc xmm6, xmm8
|
|
aesenc xmm7, xmm8
|
|
ENDM
|
|
|
|
aes_last_rnd MACRO i
|
|
movdqu xmm8, [i*16 + ctx]
|
|
aesenclast xmm0, xmm8
|
|
aesenclast xmm1, xmm8
|
|
aesenclast xmm2, xmm8
|
|
aesenclast xmm3, xmm8
|
|
aesenclast xmm4, xmm8
|
|
aesenclast xmm5, xmm8
|
|
aesenclast xmm6, xmm8
|
|
aesenclast xmm7, xmm8
|
|
ENDM
|
|
|
|
aes_dec_rnd MACRO i
|
|
movdqu xmm8, [i*16 + ctx]
|
|
aesdec xmm0, xmm8
|
|
aesdec xmm1, xmm8
|
|
aesdec xmm2, xmm8
|
|
aesdec xmm3, xmm8
|
|
aesdec xmm4, xmm8
|
|
aesdec xmm5, xmm8
|
|
aesdec xmm6, xmm8
|
|
aesdec xmm7, xmm8
|
|
ENDM
|
|
|
|
aes_dec_last_rnd MACRO i
|
|
movdqu xmm8, [i*16 + ctx]
|
|
aesdeclast xmm0, xmm8
|
|
aesdeclast xmm1, xmm8
|
|
aesdeclast xmm2, xmm8
|
|
aesdeclast xmm3, xmm8
|
|
aesdeclast xmm4, xmm8
|
|
aesdeclast xmm5, xmm8
|
|
aesdeclast xmm6, xmm8
|
|
aesdeclast xmm7, xmm8
|
|
ENDM
|
|
|
|
|
|
gen_aes_ecb_func MACRO enc, rnds
|
|
|
|
LOCAL loop8
|
|
LOCAL loop1
|
|
LOCAL bail
|
|
|
|
xor inputLen, inputLen
|
|
mov input, [rsp + 1*8 + 8*4]
|
|
mov inputLen, [rsp + 1*8 + 8*5]
|
|
|
|
sub rsp, 3*16
|
|
|
|
movdqu [rsp + 0*16], xmm6
|
|
movdqu [rsp + 1*16], xmm7
|
|
movdqu [rsp + 2*16], xmm8
|
|
|
|
lea ctx, [48+ctx]
|
|
|
|
loop8:
|
|
cmp inputLen, 8*16
|
|
jb loop1
|
|
|
|
movdqu xmm0, [0*16 + input]
|
|
movdqu xmm1, [1*16 + input]
|
|
movdqu xmm2, [2*16 + input]
|
|
movdqu xmm3, [3*16 + input]
|
|
movdqu xmm4, [4*16 + input]
|
|
movdqu xmm5, [5*16 + input]
|
|
movdqu xmm6, [6*16 + input]
|
|
movdqu xmm7, [7*16 + input]
|
|
|
|
movdqu xmm8, [0*16 + ctx]
|
|
pxor xmm0, xmm8
|
|
pxor xmm1, xmm8
|
|
pxor xmm2, xmm8
|
|
pxor xmm3, xmm8
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm8
|
|
pxor xmm6, xmm8
|
|
pxor xmm7, xmm8
|
|
|
|
IF enc eq 1
|
|
rnd textequ <aes_rnd>
|
|
lastrnd textequ <aes_last_rnd>
|
|
aesinst textequ <aesenc>
|
|
aeslastinst textequ <aesenclast>
|
|
ELSE
|
|
rnd textequ <aes_dec_rnd>
|
|
lastrnd textequ <aes_dec_last_rnd>
|
|
aesinst textequ <aesdec>
|
|
aeslastinst textequ <aesdeclast>
|
|
ENDIF
|
|
|
|
i = 1
|
|
WHILE i LT rnds
|
|
rnd i
|
|
i = i+1
|
|
ENDM
|
|
lastrnd rnds
|
|
|
|
movdqu [0*16 + output], xmm0
|
|
movdqu [1*16 + output], xmm1
|
|
movdqu [2*16 + output], xmm2
|
|
movdqu [3*16 + output], xmm3
|
|
movdqu [4*16 + output], xmm4
|
|
movdqu [5*16 + output], xmm5
|
|
movdqu [6*16 + output], xmm6
|
|
movdqu [7*16 + output], xmm7
|
|
|
|
lea input, [8*16 + input]
|
|
lea output, [8*16 + output]
|
|
sub inputLen, 8*16
|
|
jmp loop8
|
|
|
|
loop1:
|
|
cmp inputLen, 1*16
|
|
jb bail
|
|
|
|
movdqu xmm0, [input]
|
|
movdqu xmm7, [0*16 + ctx]
|
|
pxor xmm0, xmm7
|
|
|
|
i = 1
|
|
WHILE i LT rnds
|
|
movdqu xmm7, [i*16 + ctx]
|
|
aesinst xmm0, xmm7
|
|
i = i+1
|
|
ENDM
|
|
movdqu xmm7, [rnds*16 + ctx]
|
|
aeslastinst xmm0, xmm7
|
|
|
|
movdqu [output], xmm0
|
|
|
|
lea input, [1*16 + input]
|
|
lea output, [1*16 + output]
|
|
sub inputLen, 1*16
|
|
jmp loop1
|
|
|
|
bail:
|
|
xor rax, rax
|
|
|
|
movdqu xmm6, [rsp + 0*16]
|
|
movdqu xmm7, [rsp + 1*16]
|
|
movdqu xmm8, [rsp + 2*16]
|
|
add rsp, 3*16
|
|
ret
|
|
ENDM
|
|
|
|
intel_aes_encrypt_ecb_128 PROC
|
|
gen_aes_ecb_func 1, 10
|
|
intel_aes_encrypt_ecb_128 ENDP
|
|
|
|
intel_aes_encrypt_ecb_192 PROC
|
|
gen_aes_ecb_func 1, 12
|
|
intel_aes_encrypt_ecb_192 ENDP
|
|
|
|
intel_aes_encrypt_ecb_256 PROC
|
|
gen_aes_ecb_func 1, 14
|
|
intel_aes_encrypt_ecb_256 ENDP
|
|
|
|
intel_aes_decrypt_ecb_128 PROC
|
|
gen_aes_ecb_func 0, 10
|
|
intel_aes_decrypt_ecb_128 ENDP
|
|
|
|
intel_aes_decrypt_ecb_192 PROC
|
|
gen_aes_ecb_func 0, 12
|
|
intel_aes_decrypt_ecb_192 ENDP
|
|
|
|
intel_aes_decrypt_ecb_256 PROC
|
|
gen_aes_ecb_func 0, 14
|
|
intel_aes_decrypt_ecb_256 ENDP
|
|
|
|
|
|
KEY textequ <rcx>
|
|
KS textequ <rdx>
|
|
ITR textequ <r8>
|
|
|
|
intel_aes_encrypt_init_128 PROC
|
|
|
|
movdqu xmm1, [KEY]
|
|
movdqu [KS], xmm1
|
|
movdqa xmm2, xmm1
|
|
|
|
lea ITR, Lcon1
|
|
movdqa xmm0, [ITR]
|
|
lea ITR, Lmask
|
|
movdqa xmm4, [ITR]
|
|
|
|
mov ITR, 8
|
|
|
|
Lenc_128_ks_loop:
|
|
lea KS, [16 + KS]
|
|
dec ITR
|
|
|
|
pshufb xmm2, xmm4
|
|
aesenclast xmm2, xmm0
|
|
pslld xmm0, 1
|
|
movdqa xmm3, xmm1
|
|
pslldq xmm3, 4
|
|
pxor xmm1, xmm3
|
|
pslldq xmm3, 4
|
|
pxor xmm1, xmm3
|
|
pslldq xmm3, 4
|
|
pxor xmm1, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqu [KS], xmm1
|
|
movdqa xmm2, xmm1
|
|
|
|
jne Lenc_128_ks_loop
|
|
|
|
lea ITR, Lcon2
|
|
movdqa xmm0, [ITR]
|
|
|
|
pshufb xmm2, xmm4
|
|
aesenclast xmm2, xmm0
|
|
pslld xmm0, 1
|
|
movdqa xmm3, xmm1
|
|
pslldq xmm3, 4
|
|
pxor xmm1, xmm3
|
|
pslldq xmm3, 4
|
|
pxor xmm1, xmm3
|
|
pslldq xmm3, 4
|
|
pxor xmm1, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqu [16 + KS], xmm1
|
|
movdqa xmm2, xmm1
|
|
|
|
pshufb xmm2, xmm4
|
|
aesenclast xmm2, xmm0
|
|
movdqa xmm3, xmm1
|
|
pslldq xmm3, 4
|
|
pxor xmm1, xmm3
|
|
pslldq xmm3, 4
|
|
pxor xmm1, xmm3
|
|
pslldq xmm3, 4
|
|
pxor xmm1, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqu [32 + KS], xmm1
|
|
movdqa xmm2, xmm1
|
|
|
|
ret
|
|
intel_aes_encrypt_init_128 ENDP
|
|
|
|
|
|
intel_aes_decrypt_init_128 PROC
|
|
|
|
push KS
|
|
push KEY
|
|
|
|
call intel_aes_encrypt_init_128
|
|
|
|
pop KEY
|
|
pop KS
|
|
|
|
movdqu xmm0, [0*16 + KS]
|
|
movdqu xmm1, [10*16 + KS]
|
|
movdqu [10*16 + KS], xmm0
|
|
movdqu [0*16 + KS], xmm1
|
|
|
|
i = 1
|
|
WHILE i LT 5
|
|
movdqu xmm0, [i*16 + KS]
|
|
movdqu xmm1, [(10-i)*16 + KS]
|
|
|
|
aesimc xmm0, xmm0
|
|
aesimc xmm1, xmm1
|
|
|
|
movdqu [(10-i)*16 + KS], xmm0
|
|
movdqu [i*16 + KS], xmm1
|
|
|
|
i = i+1
|
|
ENDM
|
|
|
|
movdqu xmm0, [5*16 + KS]
|
|
aesimc xmm0, xmm0
|
|
movdqu [5*16 + KS], xmm0
|
|
ret
|
|
intel_aes_decrypt_init_128 ENDP
|
|
|
|
|
|
intel_aes_encrypt_init_192 PROC
|
|
|
|
sub rsp, 16*2
|
|
movdqu [16*0 + rsp], xmm6
|
|
movdqu [16*1 + rsp], xmm7
|
|
|
|
movdqu xmm1, [KEY]
|
|
mov ITR, [16 + KEY]
|
|
movd xmm3, ITR
|
|
|
|
movdqu [KS], xmm1
|
|
movdqa xmm5, xmm3
|
|
|
|
lea ITR, Lcon1
|
|
movdqu xmm0, [ITR]
|
|
lea ITR, Lmask192
|
|
movdqu xmm4, [ITR]
|
|
|
|
mov ITR, 4
|
|
|
|
Lenc_192_ks_loop:
|
|
movdqa xmm2, xmm3
|
|
pshufb xmm2, xmm4
|
|
aesenclast xmm2, xmm0
|
|
pslld xmm0, 1
|
|
|
|
movdqa xmm6, xmm1
|
|
movdqa xmm7, xmm3
|
|
pslldq xmm6, 4
|
|
pslldq xmm7, 4
|
|
pxor xmm1, xmm6
|
|
pxor xmm3, xmm7
|
|
pslldq xmm6, 4
|
|
pxor xmm1, xmm6
|
|
pslldq xmm6, 4
|
|
pxor xmm1, xmm6
|
|
pxor xmm1, xmm2
|
|
pshufd xmm2, xmm1, 0ffh
|
|
pxor xmm3, xmm2
|
|
|
|
movdqa xmm6, xmm1
|
|
shufpd xmm5, xmm1, 00h
|
|
shufpd xmm6, xmm3, 01h
|
|
|
|
movdqu [16 + KS], xmm5
|
|
movdqu [32 + KS], xmm6
|
|
|
|
movdqa xmm2, xmm3
|
|
pshufb xmm2, xmm4
|
|
aesenclast xmm2, xmm0
|
|
pslld xmm0, 1
|
|
|
|
movdqa xmm6, xmm1
|
|
movdqa xmm7, xmm3
|
|
pslldq xmm6, 4
|
|
pslldq xmm7, 4
|
|
pxor xmm1, xmm6
|
|
pxor xmm3, xmm7
|
|
pslldq xmm6, 4
|
|
pxor xmm1, xmm6
|
|
pslldq xmm6, 4
|
|
pxor xmm1, xmm6
|
|
pxor xmm1, xmm2
|
|
pshufd xmm2, xmm1, 0ffh
|
|
pxor xmm3, xmm2
|
|
|
|
movdqu [48 + KS], xmm1
|
|
movdqa xmm5, xmm3
|
|
|
|
lea KS, [48 + KS]
|
|
|
|
dec ITR
|
|
jnz Lenc_192_ks_loop
|
|
|
|
movdqu [16 + KS], xmm5
|
|
|
|
movdqu xmm7, [16*1 + rsp]
|
|
movdqu xmm6, [16*0 + rsp]
|
|
add rsp, 16*2
|
|
ret
|
|
intel_aes_encrypt_init_192 ENDP
|
|
|
|
intel_aes_decrypt_init_192 PROC
|
|
push KS
|
|
push KEY
|
|
|
|
call intel_aes_encrypt_init_192
|
|
|
|
pop KEY
|
|
pop KS
|
|
|
|
movdqu xmm0, [0*16 + KS]
|
|
movdqu xmm1, [12*16 + KS]
|
|
movdqu [12*16 + KS], xmm0
|
|
movdqu [0*16 + KS], xmm1
|
|
|
|
i = 1
|
|
WHILE i LT 6
|
|
movdqu xmm0, [i*16 + KS]
|
|
movdqu xmm1, [(12-i)*16 + KS]
|
|
|
|
aesimc xmm0, xmm0
|
|
aesimc xmm1, xmm1
|
|
|
|
movdqu [(12-i)*16 + KS], xmm0
|
|
movdqu [i*16 + KS], xmm1
|
|
|
|
i = i+1
|
|
ENDM
|
|
|
|
movdqu xmm0, [6*16 + KS]
|
|
aesimc xmm0, xmm0
|
|
movdqu [6*16 + KS], xmm0
|
|
ret
|
|
intel_aes_decrypt_init_192 ENDP
|
|
|
|
|
|
intel_aes_encrypt_init_256 PROC
|
|
sub rsp, 16*2
|
|
movdqu [16*0 + rsp], xmm6
|
|
movdqu [16*1 + rsp], xmm7
|
|
|
|
movdqu xmm1, [16*0 + KEY]
|
|
movdqu xmm3, [16*1 + KEY]
|
|
|
|
movdqu [16*0 + KS], xmm1
|
|
movdqu [16*1 + KS], xmm3
|
|
|
|
lea ITR, Lcon1
|
|
movdqu xmm0, [ITR]
|
|
lea ITR, Lmask256
|
|
movdqu xmm5, [ITR]
|
|
|
|
pxor xmm6, xmm6
|
|
|
|
mov ITR, 6
|
|
|
|
Lenc_256_ks_loop:
|
|
|
|
movdqa xmm2, xmm3
|
|
pshufb xmm2, xmm5
|
|
aesenclast xmm2, xmm0
|
|
pslld xmm0, 1
|
|
movdqa xmm4, xmm1
|
|
pslldq xmm4, 4
|
|
pxor xmm1, xmm4
|
|
pslldq xmm4, 4
|
|
pxor xmm1, xmm4
|
|
pslldq xmm4, 4
|
|
pxor xmm1, xmm4
|
|
pxor xmm1, xmm2
|
|
movdqu [16*2 + KS], xmm1
|
|
|
|
pshufd xmm2, xmm1, 0ffh
|
|
aesenclast xmm2, xmm6
|
|
movdqa xmm4, xmm3
|
|
pslldq xmm4, 4
|
|
pxor xmm3, xmm4
|
|
pslldq xmm4, 4
|
|
pxor xmm3, xmm4
|
|
pslldq xmm4, 4
|
|
pxor xmm3, xmm4
|
|
pxor xmm3, xmm2
|
|
movdqu [16*3 + KS], xmm3
|
|
|
|
lea KS, [32 + KS]
|
|
dec ITR
|
|
jnz Lenc_256_ks_loop
|
|
|
|
movdqa xmm2, xmm3
|
|
pshufb xmm2, xmm5
|
|
aesenclast xmm2, xmm0
|
|
movdqa xmm4, xmm1
|
|
pslldq xmm4, 4
|
|
pxor xmm1, xmm4
|
|
pslldq xmm4, 4
|
|
pxor xmm1, xmm4
|
|
pslldq xmm4, 4
|
|
pxor xmm1, xmm4
|
|
pxor xmm1, xmm2
|
|
movdqu [16*2 + KS], xmm1
|
|
|
|
movdqu xmm7, [16*1 + rsp]
|
|
movdqu xmm6, [16*0 + rsp]
|
|
add rsp, 16*2
|
|
ret
|
|
|
|
intel_aes_encrypt_init_256 ENDP
|
|
|
|
|
|
intel_aes_decrypt_init_256 PROC
|
|
push KS
|
|
push KEY
|
|
|
|
call intel_aes_encrypt_init_256
|
|
|
|
pop KEY
|
|
pop KS
|
|
|
|
movdqu xmm0, [0*16 + KS]
|
|
movdqu xmm1, [14*16 + KS]
|
|
movdqu [14*16 + KS], xmm0
|
|
movdqu [0*16 + KS], xmm1
|
|
|
|
i = 1
|
|
WHILE i LT 7
|
|
movdqu xmm0, [i*16 + KS]
|
|
movdqu xmm1, [(14-i)*16 + KS]
|
|
|
|
aesimc xmm0, xmm0
|
|
aesimc xmm1, xmm1
|
|
|
|
movdqu [(14-i)*16 + KS], xmm0
|
|
movdqu [i*16 + KS], xmm1
|
|
|
|
i = i+1
|
|
ENDM
|
|
|
|
movdqu xmm0, [7*16 + KS]
|
|
aesimc xmm0, xmm0
|
|
movdqu [7*16 + KS], xmm0
|
|
ret
|
|
intel_aes_decrypt_init_256 ENDP
|
|
|
|
|
|
|
|
gen_aes_cbc_enc_func MACRO rnds
|
|
|
|
LOCAL loop1
|
|
LOCAL bail
|
|
|
|
mov input, [rsp + 1*8 + 8*4]
|
|
mov inputLen, [rsp + 1*8 + 8*5]
|
|
|
|
sub rsp, 3*16
|
|
|
|
movdqu [rsp + 0*16], xmm6
|
|
movdqu [rsp + 1*16], xmm7
|
|
movdqu [rsp + 2*16], xmm8
|
|
|
|
lea ctx, [48+ctx]
|
|
|
|
movdqu xmm0, [-32+ctx]
|
|
|
|
movdqu xmm2, [0*16 + ctx]
|
|
movdqu xmm3, [1*16 + ctx]
|
|
movdqu xmm4, [2*16 + ctx]
|
|
movdqu xmm5, [3*16 + ctx]
|
|
movdqu xmm6, [4*16 + ctx]
|
|
movdqu xmm7, [5*16 + ctx]
|
|
|
|
loop1:
|
|
cmp inputLen, 1*16
|
|
jb bail
|
|
|
|
movdqu xmm1, [input]
|
|
pxor xmm1, xmm2
|
|
pxor xmm0, xmm1
|
|
|
|
aesenc xmm0, xmm3
|
|
aesenc xmm0, xmm4
|
|
aesenc xmm0, xmm5
|
|
aesenc xmm0, xmm6
|
|
aesenc xmm0, xmm7
|
|
|
|
i = 6
|
|
WHILE i LT rnds
|
|
movdqu xmm8, [i*16 + ctx]
|
|
aesenc xmm0, xmm8
|
|
i = i+1
|
|
ENDM
|
|
movdqu xmm8, [rnds*16 + ctx]
|
|
aesenclast xmm0, xmm8
|
|
|
|
movdqu [output], xmm0
|
|
|
|
lea input, [1*16 + input]
|
|
lea output, [1*16 + output]
|
|
sub inputLen, 1*16
|
|
jmp loop1
|
|
|
|
bail:
|
|
movdqu [-32+ctx], xmm0
|
|
|
|
xor rax, rax
|
|
|
|
movdqu xmm6, [rsp + 0*16]
|
|
movdqu xmm7, [rsp + 1*16]
|
|
movdqu xmm8, [rsp + 2*16]
|
|
add rsp, 3*16
|
|
ret
|
|
|
|
ENDM
|
|
|
|
gen_aes_cbc_dec_func MACRO rnds
|
|
|
|
LOCAL loop8
|
|
LOCAL loop1
|
|
LOCAL dec1
|
|
LOCAL bail
|
|
|
|
mov input, [rsp + 1*8 + 8*4]
|
|
mov inputLen, [rsp + 1*8 + 8*5]
|
|
|
|
sub rsp, 3*16
|
|
|
|
movdqu [rsp + 0*16], xmm6
|
|
movdqu [rsp + 1*16], xmm7
|
|
movdqu [rsp + 2*16], xmm8
|
|
|
|
lea ctx, [48+ctx]
|
|
|
|
loop8:
|
|
cmp inputLen, 8*16
|
|
jb dec1
|
|
|
|
movdqu xmm0, [0*16 + input]
|
|
movdqu xmm1, [1*16 + input]
|
|
movdqu xmm2, [2*16 + input]
|
|
movdqu xmm3, [3*16 + input]
|
|
movdqu xmm4, [4*16 + input]
|
|
movdqu xmm5, [5*16 + input]
|
|
movdqu xmm6, [6*16 + input]
|
|
movdqu xmm7, [7*16 + input]
|
|
|
|
movdqu xmm8, [0*16 + ctx]
|
|
pxor xmm0, xmm8
|
|
pxor xmm1, xmm8
|
|
pxor xmm2, xmm8
|
|
pxor xmm3, xmm8
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm8
|
|
pxor xmm6, xmm8
|
|
pxor xmm7, xmm8
|
|
|
|
i = 1
|
|
WHILE i LT rnds
|
|
aes_dec_rnd i
|
|
i = i+1
|
|
ENDM
|
|
aes_dec_last_rnd rnds
|
|
|
|
movdqu xmm8, [-32 + ctx]
|
|
pxor xmm0, xmm8
|
|
movdqu xmm8, [0*16 + input]
|
|
pxor xmm1, xmm8
|
|
movdqu xmm8, [1*16 + input]
|
|
pxor xmm2, xmm8
|
|
movdqu xmm8, [2*16 + input]
|
|
pxor xmm3, xmm8
|
|
movdqu xmm8, [3*16 + input]
|
|
pxor xmm4, xmm8
|
|
movdqu xmm8, [4*16 + input]
|
|
pxor xmm5, xmm8
|
|
movdqu xmm8, [5*16 + input]
|
|
pxor xmm6, xmm8
|
|
movdqu xmm8, [6*16 + input]
|
|
pxor xmm7, xmm8
|
|
movdqu xmm8, [7*16 + input]
|
|
|
|
movdqu [0*16 + output], xmm0
|
|
movdqu [1*16 + output], xmm1
|
|
movdqu [2*16 + output], xmm2
|
|
movdqu [3*16 + output], xmm3
|
|
movdqu [4*16 + output], xmm4
|
|
movdqu [5*16 + output], xmm5
|
|
movdqu [6*16 + output], xmm6
|
|
movdqu [7*16 + output], xmm7
|
|
movdqu [-32 + ctx], xmm8
|
|
|
|
lea input, [8*16 + input]
|
|
lea output, [8*16 + output]
|
|
sub inputLen, 8*16
|
|
jmp loop8
|
|
dec1:
|
|
|
|
movdqu xmm3, [-32 + ctx]
|
|
|
|
loop1:
|
|
cmp inputLen, 1*16
|
|
jb bail
|
|
|
|
movdqu xmm0, [input]
|
|
movdqa xmm4, xmm0
|
|
movdqu xmm7, [0*16 + ctx]
|
|
pxor xmm0, xmm7
|
|
|
|
i = 1
|
|
WHILE i LT rnds
|
|
movdqu xmm7, [i*16 + ctx]
|
|
aesdec xmm0, xmm7
|
|
i = i+1
|
|
ENDM
|
|
movdqu xmm7, [rnds*16 + ctx]
|
|
aesdeclast xmm0, xmm7
|
|
pxor xmm3, xmm0
|
|
|
|
movdqu [output], xmm3
|
|
movdqa xmm3, xmm4
|
|
|
|
lea input, [1*16 + input]
|
|
lea output, [1*16 + output]
|
|
sub inputLen, 1*16
|
|
jmp loop1
|
|
|
|
bail:
|
|
movdqu [-32 + ctx], xmm3
|
|
xor rax, rax
|
|
|
|
movdqu xmm6, [rsp + 0*16]
|
|
movdqu xmm7, [rsp + 1*16]
|
|
movdqu xmm8, [rsp + 2*16]
|
|
add rsp, 3*16
|
|
ret
|
|
ENDM
|
|
|
|
intel_aes_encrypt_cbc_128 PROC
|
|
gen_aes_cbc_enc_func 10
|
|
intel_aes_encrypt_cbc_128 ENDP
|
|
|
|
intel_aes_encrypt_cbc_192 PROC
|
|
gen_aes_cbc_enc_func 12
|
|
intel_aes_encrypt_cbc_192 ENDP
|
|
|
|
intel_aes_encrypt_cbc_256 PROC
|
|
gen_aes_cbc_enc_func 14
|
|
intel_aes_encrypt_cbc_256 ENDP
|
|
|
|
intel_aes_decrypt_cbc_128 PROC
|
|
gen_aes_cbc_dec_func 10
|
|
intel_aes_decrypt_cbc_128 ENDP
|
|
|
|
intel_aes_decrypt_cbc_192 PROC
|
|
gen_aes_cbc_dec_func 12
|
|
intel_aes_decrypt_cbc_192 ENDP
|
|
|
|
intel_aes_decrypt_cbc_256 PROC
|
|
gen_aes_cbc_dec_func 14
|
|
intel_aes_decrypt_cbc_256 ENDP
|
|
|
|
|
|
|
|
ctrCtx textequ <r10>
|
|
CTR textequ <r11d>
|
|
CTRSave textequ <eax>
|
|
|
|
gen_aes_ctr_func MACRO rnds
|
|
|
|
LOCAL loop8
|
|
LOCAL loop1
|
|
LOCAL enc1
|
|
LOCAL bail
|
|
|
|
mov input, [rsp + 8*1 + 4*8]
|
|
mov inputLen, [rsp + 8*1 + 5*8]
|
|
|
|
mov ctrCtx, ctx
|
|
mov ctx, [8+ctrCtx]
|
|
lea ctx, [48+ctx]
|
|
|
|
sub rsp, 3*16
|
|
movdqu [rsp + 0*16], xmm6
|
|
movdqu [rsp + 1*16], xmm7
|
|
movdqu [rsp + 2*16], xmm8
|
|
|
|
|
|
push rbp
|
|
mov rbp, rsp
|
|
sub rsp, 8*16
|
|
and rsp, -16
|
|
|
|
|
|
movdqu xmm0, [16+ctrCtx]
|
|
mov CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
|
|
bswap CTRSave
|
|
movdqu xmm1, [ctx + 0*16]
|
|
|
|
pxor xmm0, xmm1
|
|
|
|
movdqa [rsp + 0*16], xmm0
|
|
movdqa [rsp + 1*16], xmm0
|
|
movdqa [rsp + 2*16], xmm0
|
|
movdqa [rsp + 3*16], xmm0
|
|
movdqa [rsp + 4*16], xmm0
|
|
movdqa [rsp + 5*16], xmm0
|
|
movdqa [rsp + 6*16], xmm0
|
|
movdqa [rsp + 7*16], xmm0
|
|
|
|
inc CTRSave
|
|
mov CTR, CTRSave
|
|
bswap CTR
|
|
xor CTR, DWORD PTR [ctx + 3*4]
|
|
mov DWORD PTR [rsp + 1*16 + 3*4], CTR
|
|
|
|
inc CTRSave
|
|
mov CTR, CTRSave
|
|
bswap CTR
|
|
xor CTR, DWORD PTR [ctx + 3*4]
|
|
mov DWORD PTR [rsp + 2*16 + 3*4], CTR
|
|
|
|
inc CTRSave
|
|
mov CTR, CTRSave
|
|
bswap CTR
|
|
xor CTR, DWORD PTR [ctx + 3*4]
|
|
mov DWORD PTR [rsp + 3*16 + 3*4], CTR
|
|
|
|
inc CTRSave
|
|
mov CTR, CTRSave
|
|
bswap CTR
|
|
xor CTR, DWORD PTR [ctx + 3*4]
|
|
mov DWORD PTR [rsp + 4*16 + 3*4], CTR
|
|
|
|
inc CTRSave
|
|
mov CTR, CTRSave
|
|
bswap CTR
|
|
xor CTR, DWORD PTR [ctx + 3*4]
|
|
mov DWORD PTR [rsp + 5*16 + 3*4], CTR
|
|
|
|
inc CTRSave
|
|
mov CTR, CTRSave
|
|
bswap CTR
|
|
xor CTR, DWORD PTR [ctx + 3*4]
|
|
mov DWORD PTR [rsp + 6*16 + 3*4], CTR
|
|
|
|
inc CTRSave
|
|
mov CTR, CTRSave
|
|
bswap CTR
|
|
xor CTR, DWORD PTR [ctx + 3*4]
|
|
mov DWORD PTR [rsp + 7*16 + 3*4], CTR
|
|
|
|
|
|
loop8:
|
|
cmp inputLen, 8*16
|
|
jb loop1
|
|
|
|
movdqu xmm0, [0*16 + rsp]
|
|
movdqu xmm1, [1*16 + rsp]
|
|
movdqu xmm2, [2*16 + rsp]
|
|
movdqu xmm3, [3*16 + rsp]
|
|
movdqu xmm4, [4*16 + rsp]
|
|
movdqu xmm5, [5*16 + rsp]
|
|
movdqu xmm6, [6*16 + rsp]
|
|
movdqu xmm7, [7*16 + rsp]
|
|
|
|
i = 1
|
|
WHILE i LE 8
|
|
aes_rnd i
|
|
|
|
inc CTRSave
|
|
mov CTR, CTRSave
|
|
bswap CTR
|
|
xor CTR, DWORD PTR [ctx + 3*4]
|
|
mov DWORD PTR [rsp + (i-1)*16 + 3*4], CTR
|
|
|
|
i = i+1
|
|
ENDM
|
|
WHILE i LT rnds
|
|
aes_rnd i
|
|
i = i+1
|
|
ENDM
|
|
aes_last_rnd rnds
|
|
|
|
movdqu xmm8, [0*16 + input]
|
|
pxor xmm0, xmm8
|
|
movdqu xmm8, [1*16 + input]
|
|
pxor xmm1, xmm8
|
|
movdqu xmm8, [2*16 + input]
|
|
pxor xmm2, xmm8
|
|
movdqu xmm8, [3*16 + input]
|
|
pxor xmm3, xmm8
|
|
movdqu xmm8, [4*16 + input]
|
|
pxor xmm4, xmm8
|
|
movdqu xmm8, [5*16 + input]
|
|
pxor xmm5, xmm8
|
|
movdqu xmm8, [6*16 + input]
|
|
pxor xmm6, xmm8
|
|
movdqu xmm8, [7*16 + input]
|
|
pxor xmm7, xmm8
|
|
|
|
movdqu [0*16 + output], xmm0
|
|
movdqu [1*16 + output], xmm1
|
|
movdqu [2*16 + output], xmm2
|
|
movdqu [3*16 + output], xmm3
|
|
movdqu [4*16 + output], xmm4
|
|
movdqu [5*16 + output], xmm5
|
|
movdqu [6*16 + output], xmm6
|
|
movdqu [7*16 + output], xmm7
|
|
|
|
lea input, [8*16 + input]
|
|
lea output, [8*16 + output]
|
|
sub inputLen, 8*16
|
|
jmp loop8
|
|
|
|
|
|
loop1:
|
|
cmp inputLen, 1*16
|
|
jb bail
|
|
|
|
movdqu xmm0, [rsp]
|
|
add rsp, 16
|
|
|
|
i = 1
|
|
WHILE i LT rnds
|
|
movdqu xmm7, [i*16 + ctx]
|
|
aesenc xmm0, xmm7
|
|
i = i+1
|
|
ENDM
|
|
movdqu xmm7, [rnds*16 + ctx]
|
|
aesenclast xmm0, xmm7
|
|
|
|
movdqu xmm7, [input]
|
|
pxor xmm0, xmm7
|
|
movdqu [output], xmm0
|
|
|
|
lea input, [1*16 + input]
|
|
lea output, [1*16 + output]
|
|
sub inputLen, 1*16
|
|
jmp loop1
|
|
|
|
bail:
|
|
|
|
movdqu xmm0, [rsp]
|
|
movdqu xmm1, [ctx + 0*16]
|
|
pxor xmm0, xmm1
|
|
movdqu [16+ctrCtx], xmm0
|
|
|
|
|
|
xor rax, rax
|
|
mov rsp, rbp
|
|
pop rbp
|
|
|
|
movdqu xmm6, [rsp + 0*16]
|
|
movdqu xmm7, [rsp + 1*16]
|
|
movdqu xmm8, [rsp + 2*16]
|
|
add rsp, 3*16
|
|
|
|
ret
|
|
ENDM
|
|
|
|
|
|
intel_aes_encrypt_ctr_128 PROC
|
|
gen_aes_ctr_func 10
|
|
intel_aes_encrypt_ctr_128 ENDP
|
|
|
|
intel_aes_encrypt_ctr_192 PROC
|
|
gen_aes_ctr_func 12
|
|
intel_aes_encrypt_ctr_192 ENDP
|
|
|
|
intel_aes_encrypt_ctr_256 PROC
|
|
gen_aes_ctr_func 14
|
|
intel_aes_encrypt_ctr_256 ENDP
|
|
|
|
|
|
END
|