; RetroZilla/security/nss/lib/freebl/intel-aes-x64-masm.asm

; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com
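;
; Constants used by the key-expansion routines below: Lmask, Lmask192 and
; Lmask256 are pshufb byte-selection masks that replicate the key word to be
; expanded (already rotated as RotWord requires) into all four dword lanes,
; so that a single aesenclast performs SubWord on it; Lcon1 and Lcon2 hold
; the AES round constants 01h and 1bh broadcast across the register.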
.DATA
ALIGN 16
Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
Lcon1 dd 1,1,1,1
Lcon2 dd 1bh,1bh,1bh,1bh
.CODE
ctx textequ <rcx>
output textequ <rdx>
input textequ <r8>
inputLen textequ <r9d>
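;
; Parameter aliases: per the Microsoft x64 calling convention the first four
; arguments arrive in rcx, rdx, r8 and r9; the fifth and sixth are read from
; the stack ([rsp + 28h] and [rsp + 30h] at function entry).
;
; The aes_*rnd macros below apply one AES round, with the round key taken
; from [i*16 + ctx], to eight independent blocks held in xmm0-xmm7, keeping
; the AES-NI pipeline busy.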
aes_rnd MACRO i
movdqu xmm8, [i*16 + ctx]
aesenc xmm0, xmm8
aesenc xmm1, xmm8
aesenc xmm2, xmm8
aesenc xmm3, xmm8
aesenc xmm4, xmm8
aesenc xmm5, xmm8
aesenc xmm6, xmm8
aesenc xmm7, xmm8
ENDM
aes_last_rnd MACRO i
movdqu xmm8, [i*16 + ctx]
aesenclast xmm0, xmm8
aesenclast xmm1, xmm8
aesenclast xmm2, xmm8
aesenclast xmm3, xmm8
aesenclast xmm4, xmm8
aesenclast xmm5, xmm8
aesenclast xmm6, xmm8
aesenclast xmm7, xmm8
ENDM
aes_dec_rnd MACRO i
movdqu xmm8, [i*16 + ctx]
aesdec xmm0, xmm8
aesdec xmm1, xmm8
aesdec xmm2, xmm8
aesdec xmm3, xmm8
aesdec xmm4, xmm8
aesdec xmm5, xmm8
aesdec xmm6, xmm8
aesdec xmm7, xmm8
ENDM
aes_dec_last_rnd MACRO i
movdqu xmm8, [i*16 + ctx]
aesdeclast xmm0, xmm8
aesdeclast xmm1, xmm8
aesdeclast xmm2, xmm8
aesdeclast xmm3, xmm8
aesdeclast xmm4, xmm8
aesdeclast xmm5, xmm8
aesdeclast xmm6, xmm8
aesdeclast xmm7, xmm8
ENDM
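;
; gen_aes_ecb_func expands into one ECB encrypt/decrypt routine. It loads the
; input pointer and byte count from the stack arguments, preserves the
; non-volatile registers xmm6-xmm8, and advances ctx by 48 bytes to where the
; expanded round keys are stored. loop8 handles eight 16-byte blocks per
; iteration; loop1 finishes any remaining whole blocks one at a time. rax is
; cleared so the routine returns 0 (SECSuccess).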
gen_aes_ecb_func MACRO enc, rnds
LOCAL loop8
LOCAL loop1
LOCAL bail
xor inputLen, inputLen
mov input, [rsp + 1*8 + 8*4]
mov inputLen, [rsp + 1*8 + 8*5]
sub rsp, 3*16
movdqu [rsp + 0*16], xmm6
movdqu [rsp + 1*16], xmm7
movdqu [rsp + 2*16], xmm8
lea ctx, [48+ctx]
loop8:
cmp inputLen, 8*16
jb loop1
movdqu xmm0, [0*16 + input]
movdqu xmm1, [1*16 + input]
movdqu xmm2, [2*16 + input]
movdqu xmm3, [3*16 + input]
movdqu xmm4, [4*16 + input]
movdqu xmm5, [5*16 + input]
movdqu xmm6, [6*16 + input]
movdqu xmm7, [7*16 + input]
movdqu xmm8, [0*16 + ctx]
pxor xmm0, xmm8
pxor xmm1, xmm8
pxor xmm2, xmm8
pxor xmm3, xmm8
pxor xmm4, xmm8
pxor xmm5, xmm8
pxor xmm6, xmm8
pxor xmm7, xmm8
IF enc eq 1
rnd textequ <aes_rnd>
lastrnd textequ <aes_last_rnd>
aesinst textequ <aesenc>
aeslastinst textequ <aesenclast>
ELSE
rnd textequ <aes_dec_rnd>
lastrnd textequ <aes_dec_last_rnd>
aesinst textequ <aesdec>
aeslastinst textequ <aesdeclast>
ENDIF
i = 1
WHILE i LT rnds
rnd i
i = i+1
ENDM
lastrnd rnds
movdqu [0*16 + output], xmm0
movdqu [1*16 + output], xmm1
movdqu [2*16 + output], xmm2
movdqu [3*16 + output], xmm3
movdqu [4*16 + output], xmm4
movdqu [5*16 + output], xmm5
movdqu [6*16 + output], xmm6
movdqu [7*16 + output], xmm7
lea input, [8*16 + input]
lea output, [8*16 + output]
sub inputLen, 8*16
jmp loop8
loop1:
cmp inputLen, 1*16
jb bail
movdqu xmm0, [input]
movdqu xmm7, [0*16 + ctx]
pxor xmm0, xmm7
i = 1
WHILE i LT rnds
movdqu xmm7, [i*16 + ctx]
aesinst xmm0, xmm7
i = i+1
ENDM
movdqu xmm7, [rnds*16 + ctx]
aeslastinst xmm0, xmm7
movdqu [output], xmm0
lea input, [1*16 + input]
lea output, [1*16 + output]
sub inputLen, 1*16
jmp loop1
bail:
xor rax, rax
movdqu xmm6, [rsp + 0*16]
movdqu xmm7, [rsp + 1*16]
movdqu xmm8, [rsp + 2*16]
add rsp, 3*16
ret
ENDM
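;
; Each PROC below instantiates the macro with the round count for its key
; size: 10 rounds for AES-128, 12 for AES-192, 14 for AES-256.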
intel_aes_encrypt_ecb_128 PROC
gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP
intel_aes_encrypt_ecb_192 PROC
gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP
intel_aes_encrypt_ecb_256 PROC
gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP
intel_aes_decrypt_ecb_128 PROC
gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP
intel_aes_decrypt_ecb_192 PROC
gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP
intel_aes_decrypt_ecb_256 PROC
gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP
KEY textequ <rcx>
KS textequ <rdx>
ITR textequ <r8>
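;
; Key-schedule setup. KEY points at the user key, KS at the buffer receiving
; the expanded round keys. Each expansion step broadcasts the relevant key
; word with pshufb and runs aesenclast against the round constant, yielding
; SubWord(RotWord(w)) xor Rcon in every lane (ShiftRows has no effect on a
; value replicated across all four columns); the pslldq/pxor ladder then
; folds the result into the previous round key. Note that in the 128-bit
; loop the dec ITR sits well before the jne: none of the intervening
; SSE/AES-NI instructions modify EFLAGS, so the branch still tests the
; decremented counter.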
intel_aes_encrypt_init_128 PROC
movdqu xmm1, [KEY]
movdqu [KS], xmm1
movdqa xmm2, xmm1
lea ITR, Lcon1
movdqa xmm0, [ITR]
lea ITR, Lmask
movdqa xmm4, [ITR]
mov ITR, 8
Lenc_128_ks_loop:
lea KS, [16 + KS]
dec ITR
pshufb xmm2, xmm4
aesenclast xmm2, xmm0
pslld xmm0, 1
movdqa xmm3, xmm1
pslldq xmm3, 4
pxor xmm1, xmm3
pslldq xmm3, 4
pxor xmm1, xmm3
pslldq xmm3, 4
pxor xmm1, xmm3
pxor xmm1, xmm2
movdqu [KS], xmm1
movdqa xmm2, xmm1
jne Lenc_128_ks_loop
lea ITR, Lcon2
movdqa xmm0, [ITR]
pshufb xmm2, xmm4
aesenclast xmm2, xmm0
pslld xmm0, 1
movdqa xmm3, xmm1
pslldq xmm3, 4
pxor xmm1, xmm3
pslldq xmm3, 4
pxor xmm1, xmm3
pslldq xmm3, 4
pxor xmm1, xmm3
pxor xmm1, xmm2
movdqu [16 + KS], xmm1
movdqa xmm2, xmm1
pshufb xmm2, xmm4
aesenclast xmm2, xmm0
movdqa xmm3, xmm1
pslldq xmm3, 4
pxor xmm1, xmm3
pslldq xmm3, 4
pxor xmm1, xmm3
pslldq xmm3, 4
pxor xmm1, xmm3
pxor xmm1, xmm2
movdqu [32 + KS], xmm1
movdqa xmm2, xmm1
ret
intel_aes_encrypt_init_128 ENDP
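;
; The decryption schedules are derived from the encryption schedule: the
; first and last round keys are swapped and the inner round keys are run
; through aesimc (InvMixColumns), producing the equivalent inverse key
; schedule expected by aesdec/aesdeclast.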
intel_aes_decrypt_init_128 PROC
push KS
push KEY
call intel_aes_encrypt_init_128
pop KEY
pop KS
movdqu xmm0, [0*16 + KS]
movdqu xmm1, [10*16 + KS]
movdqu [10*16 + KS], xmm0
movdqu [0*16 + KS], xmm1
i = 1
WHILE i LT 5
movdqu xmm0, [i*16 + KS]
movdqu xmm1, [(10-i)*16 + KS]
aesimc xmm0, xmm0
aesimc xmm1, xmm1
movdqu [(10-i)*16 + KS], xmm0
movdqu [i*16 + KS], xmm1
i = i+1
ENDM
movdqu xmm0, [5*16 + KS]
aesimc xmm0, xmm0
movdqu [5*16 + KS], xmm0
ret
intel_aes_decrypt_init_128 ENDP
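;
; AES-192 expansion: the low 128 bits of the user key stay in xmm1 and the
; upper 64 bits in xmm3; each loop iteration runs two expansion steps and
; stores three more 16-byte round keys, packing the 192-bit intermediate
; results into 128-bit blocks with shufpd.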
intel_aes_encrypt_init_192 PROC
sub rsp, 16*2
movdqu [16*0 + rsp], xmm6
movdqu [16*1 + rsp], xmm7
movdqu xmm1, [KEY]
mov ITR, [16 + KEY]
movd xmm3, ITR
movdqu [KS], xmm1
movdqa xmm5, xmm3
lea ITR, Lcon1
movdqu xmm0, [ITR]
lea ITR, Lmask192
movdqu xmm4, [ITR]
mov ITR, 4
Lenc_192_ks_loop:
movdqa xmm2, xmm3
pshufb xmm2, xmm4
aesenclast xmm2, xmm0
pslld xmm0, 1
movdqa xmm6, xmm1
movdqa xmm7, xmm3
pslldq xmm6, 4
pslldq xmm7, 4
pxor xmm1, xmm6
pxor xmm3, xmm7
pslldq xmm6, 4
pxor xmm1, xmm6
pslldq xmm6, 4
pxor xmm1, xmm6
pxor xmm1, xmm2
pshufd xmm2, xmm1, 0ffh
pxor xmm3, xmm2
movdqa xmm6, xmm1
shufpd xmm5, xmm1, 00h
shufpd xmm6, xmm3, 01h
movdqu [16 + KS], xmm5
movdqu [32 + KS], xmm6
movdqa xmm2, xmm3
pshufb xmm2, xmm4
aesenclast xmm2, xmm0
pslld xmm0, 1
movdqa xmm6, xmm1
movdqa xmm7, xmm3
pslldq xmm6, 4
pslldq xmm7, 4
pxor xmm1, xmm6
pxor xmm3, xmm7
pslldq xmm6, 4
pxor xmm1, xmm6
pslldq xmm6, 4
pxor xmm1, xmm6
pxor xmm1, xmm2
pshufd xmm2, xmm1, 0ffh
pxor xmm3, xmm2
movdqu [48 + KS], xmm1
movdqa xmm5, xmm3
lea KS, [48 + KS]
dec ITR
jnz Lenc_192_ks_loop
movdqu [16 + KS], xmm5
movdqu xmm7, [16*1 + rsp]
movdqu xmm6, [16*0 + rsp]
add rsp, 16*2
ret
intel_aes_encrypt_init_192 ENDP
intel_aes_decrypt_init_192 PROC
push KS
push KEY
call intel_aes_encrypt_init_192
pop KEY
pop KS
movdqu xmm0, [0*16 + KS]
movdqu xmm1, [12*16 + KS]
movdqu [12*16 + KS], xmm0
movdqu [0*16 + KS], xmm1
i = 1
WHILE i LT 6
movdqu xmm0, [i*16 + KS]
movdqu xmm1, [(12-i)*16 + KS]
aesimc xmm0, xmm0
aesimc xmm1, xmm1
movdqu [(12-i)*16 + KS], xmm0
movdqu [i*16 + KS], xmm1
i = i+1
ENDM
movdqu xmm0, [6*16 + KS]
aesimc xmm0, xmm0
movdqu [6*16 + KS], xmm0
ret
intel_aes_decrypt_init_192 ENDP
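;
; AES-256 expansion: each iteration derives two round keys. The even key uses
; RotWord+SubWord+Rcon (pshufb with Lmask256, then aesenclast against xmm0);
; the odd key needs SubWord only, done by broadcasting the last word with
; pshufd and running aesenclast against the zeroed xmm6.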
intel_aes_encrypt_init_256 PROC
sub rsp, 16*2
movdqu [16*0 + rsp], xmm6
movdqu [16*1 + rsp], xmm7
movdqu xmm1, [16*0 + KEY]
movdqu xmm3, [16*1 + KEY]
movdqu [16*0 + KS], xmm1
movdqu [16*1 + KS], xmm3
lea ITR, Lcon1
movdqu xmm0, [ITR]
lea ITR, Lmask256
movdqu xmm5, [ITR]
pxor xmm6, xmm6
mov ITR, 6
Lenc_256_ks_loop:
movdqa xmm2, xmm3
pshufb xmm2, xmm5
aesenclast xmm2, xmm0
pslld xmm0, 1
movdqa xmm4, xmm1
pslldq xmm4, 4
pxor xmm1, xmm4
pslldq xmm4, 4
pxor xmm1, xmm4
pslldq xmm4, 4
pxor xmm1, xmm4
pxor xmm1, xmm2
movdqu [16*2 + KS], xmm1
pshufd xmm2, xmm1, 0ffh
aesenclast xmm2, xmm6
movdqa xmm4, xmm3
pslldq xmm4, 4
pxor xmm3, xmm4
pslldq xmm4, 4
pxor xmm3, xmm4
pslldq xmm4, 4
pxor xmm3, xmm4
pxor xmm3, xmm2
movdqu [16*3 + KS], xmm3
lea KS, [32 + KS]
dec ITR
jnz Lenc_256_ks_loop
movdqa xmm2, xmm3
pshufb xmm2, xmm5
aesenclast xmm2, xmm0
movdqa xmm4, xmm1
pslldq xmm4, 4
pxor xmm1, xmm4
pslldq xmm4, 4
pxor xmm1, xmm4
pslldq xmm4, 4
pxor xmm1, xmm4
pxor xmm1, xmm2
movdqu [16*2 + KS], xmm1
movdqu xmm7, [16*1 + rsp]
movdqu xmm6, [16*0 + rsp]
add rsp, 16*2
ret
intel_aes_encrypt_init_256 ENDP
intel_aes_decrypt_init_256 PROC
push KS
push KEY
call intel_aes_encrypt_init_256
pop KEY
pop KS
movdqu xmm0, [0*16 + KS]
movdqu xmm1, [14*16 + KS]
movdqu [14*16 + KS], xmm0
movdqu [0*16 + KS], xmm1
i = 1
WHILE i LT 7
movdqu xmm0, [i*16 + KS]
movdqu xmm1, [(14-i)*16 + KS]
aesimc xmm0, xmm0
aesimc xmm1, xmm1
movdqu [(14-i)*16 + KS], xmm0
movdqu [i*16 + KS], xmm1
i = i+1
ENDM
movdqu xmm0, [7*16 + KS]
aesimc xmm0, xmm0
movdqu [7*16 + KS], xmm0
ret
intel_aes_decrypt_init_256 ENDP
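;
; CBC encryption is inherently serial (each block depends on the previous
; ciphertext), so this routine works one block at a time: the running IV /
; ciphertext is kept in xmm0 and written back 32 bytes before the key
; schedule on exit, round key 0 is cached in xmm2 and round keys 1-5 in
; xmm3-xmm7, and the remaining round keys are loaded from memory inside the
; loop.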
gen_aes_cbc_enc_func MACRO rnds
LOCAL loop1
LOCAL bail
mov input, [rsp + 1*8 + 8*4]
mov inputLen, [rsp + 1*8 + 8*5]
sub rsp, 3*16
movdqu [rsp + 0*16], xmm6
movdqu [rsp + 1*16], xmm7
movdqu [rsp + 2*16], xmm8
lea ctx, [48+ctx]
movdqu xmm0, [-32+ctx]
movdqu xmm2, [0*16 + ctx]
movdqu xmm3, [1*16 + ctx]
movdqu xmm4, [2*16 + ctx]
movdqu xmm5, [3*16 + ctx]
movdqu xmm6, [4*16 + ctx]
movdqu xmm7, [5*16 + ctx]
loop1:
cmp inputLen, 1*16
jb bail
movdqu xmm1, [input]
pxor xmm1, xmm2
pxor xmm0, xmm1
aesenc xmm0, xmm3
aesenc xmm0, xmm4
aesenc xmm0, xmm5
aesenc xmm0, xmm6
aesenc xmm0, xmm7
i = 6
WHILE i LT rnds
movdqu xmm8, [i*16 + ctx]
aesenc xmm0, xmm8
i = i+1
ENDM
movdqu xmm8, [rnds*16 + ctx]
aesenclast xmm0, xmm8
movdqu [output], xmm0
lea input, [1*16 + input]
lea output, [1*16 + output]
sub inputLen, 1*16
jmp loop1
bail:
movdqu [-32+ctx], xmm0
xor rax, rax
movdqu xmm6, [rsp + 0*16]
movdqu xmm7, [rsp + 1*16]
movdqu xmm8, [rsp + 2*16]
add rsp, 3*16
ret
ENDM
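;
; CBC decryption has no such dependency, so the main loop decrypts eight
; blocks in parallel and then XORs each result with the preceding ciphertext
; block (the saved IV for the first one); the last ciphertext block processed
; is stored back as the IV for the next call.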
gen_aes_cbc_dec_func MACRO rnds
LOCAL loop8
LOCAL loop1
LOCAL dec1
LOCAL bail
mov input, [rsp + 1*8 + 8*4]
mov inputLen, [rsp + 1*8 + 8*5]
sub rsp, 3*16
movdqu [rsp + 0*16], xmm6
movdqu [rsp + 1*16], xmm7
movdqu [rsp + 2*16], xmm8
lea ctx, [48+ctx]
loop8:
cmp inputLen, 8*16
jb dec1
movdqu xmm0, [0*16 + input]
movdqu xmm1, [1*16 + input]
movdqu xmm2, [2*16 + input]
movdqu xmm3, [3*16 + input]
movdqu xmm4, [4*16 + input]
movdqu xmm5, [5*16 + input]
movdqu xmm6, [6*16 + input]
movdqu xmm7, [7*16 + input]
movdqu xmm8, [0*16 + ctx]
pxor xmm0, xmm8
pxor xmm1, xmm8
pxor xmm2, xmm8
pxor xmm3, xmm8
pxor xmm4, xmm8
pxor xmm5, xmm8
pxor xmm6, xmm8
pxor xmm7, xmm8
i = 1
WHILE i LT rnds
aes_dec_rnd i
i = i+1
ENDM
aes_dec_last_rnd rnds
movdqu xmm8, [-32 + ctx]
pxor xmm0, xmm8
movdqu xmm8, [0*16 + input]
pxor xmm1, xmm8
movdqu xmm8, [1*16 + input]
pxor xmm2, xmm8
movdqu xmm8, [2*16 + input]
pxor xmm3, xmm8
movdqu xmm8, [3*16 + input]
pxor xmm4, xmm8
movdqu xmm8, [4*16 + input]
pxor xmm5, xmm8
movdqu xmm8, [5*16 + input]
pxor xmm6, xmm8
movdqu xmm8, [6*16 + input]
pxor xmm7, xmm8
movdqu xmm8, [7*16 + input]
movdqu [0*16 + output], xmm0
movdqu [1*16 + output], xmm1
movdqu [2*16 + output], xmm2
movdqu [3*16 + output], xmm3
movdqu [4*16 + output], xmm4
movdqu [5*16 + output], xmm5
movdqu [6*16 + output], xmm6
movdqu [7*16 + output], xmm7
movdqu [-32 + ctx], xmm8
lea input, [8*16 + input]
lea output, [8*16 + output]
sub inputLen, 8*16
jmp loop8
dec1:
movdqu xmm3, [-32 + ctx]
loop1:
cmp inputLen, 1*16
jb bail
movdqu xmm0, [input]
movdqa xmm4, xmm0
movdqu xmm7, [0*16 + ctx]
pxor xmm0, xmm7
i = 1
WHILE i LT rnds
movdqu xmm7, [i*16 + ctx]
aesdec xmm0, xmm7
i = i+1
ENDM
movdqu xmm7, [rnds*16 + ctx]
aesdeclast xmm0, xmm7
pxor xmm3, xmm0
movdqu [output], xmm3
movdqa xmm3, xmm4
lea input, [1*16 + input]
lea output, [1*16 + output]
sub inputLen, 1*16
jmp loop1
bail:
movdqu [-32 + ctx], xmm3
xor rax, rax
movdqu xmm6, [rsp + 0*16]
movdqu xmm7, [rsp + 1*16]
movdqu xmm8, [rsp + 2*16]
add rsp, 3*16
ret
ENDM
intel_aes_encrypt_cbc_128 PROC
gen_aes_cbc_enc_func 10
intel_aes_encrypt_cbc_128 ENDP
intel_aes_encrypt_cbc_192 PROC
gen_aes_cbc_enc_func 12
intel_aes_encrypt_cbc_192 ENDP
intel_aes_encrypt_cbc_256 PROC
gen_aes_cbc_enc_func 14
intel_aes_encrypt_cbc_256 ENDP
intel_aes_decrypt_cbc_128 PROC
gen_aes_cbc_dec_func 10
intel_aes_decrypt_cbc_128 ENDP
intel_aes_decrypt_cbc_192 PROC
gen_aes_cbc_dec_func 12
intel_aes_decrypt_cbc_192 ENDP
intel_aes_decrypt_cbc_256 PROC
gen_aes_cbc_dec_func 14
intel_aes_decrypt_cbc_256 ENDP
ctrCtx textequ <r10>
CTR textequ <r11d>
CTRSave textequ <eax>
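;
; CTR mode. rcx carries a counter-mode context: the pointer to the AES key
; context appears to sit 8 bytes in, and the 16-byte counter block 16 bytes
; in, with the 32-bit counter big-endian in its last word. Eight copies of
; the counter block, pre-XORed with round key 0, are kept on the stack; for
; each successive block only the final dword is patched with the byte-swapped
; counter XORed with the matching dword of round key 0, so the first
; AddRoundKey is already folded in and the round loop starts at round 1. The
; counter updates for the next batch are interleaved with the first eight
; rounds of the current one. On exit the next unused counter block is
; un-masked (XORed with round key 0 again) and written back to the context.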
gen_aes_ctr_func MACRO rnds
LOCAL loop8
LOCAL loop1
LOCAL enc1
LOCAL bail
mov input, [rsp + 8*1 + 4*8]
mov inputLen, [rsp + 8*1 + 5*8]
mov ctrCtx, ctx
mov ctx, [8+ctrCtx]
lea ctx, [48+ctx]
sub rsp, 3*16
movdqu [rsp + 0*16], xmm6
movdqu [rsp + 1*16], xmm7
movdqu [rsp + 2*16], xmm8
push rbp
mov rbp, rsp
sub rsp, 8*16
and rsp, -16
movdqu xmm0, [16+ctrCtx]
mov CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
bswap CTRSave
movdqu xmm1, [ctx + 0*16]
pxor xmm0, xmm1
movdqa [rsp + 0*16], xmm0
movdqa [rsp + 1*16], xmm0
movdqa [rsp + 2*16], xmm0
movdqa [rsp + 3*16], xmm0
movdqa [rsp + 4*16], xmm0
movdqa [rsp + 5*16], xmm0
movdqa [rsp + 6*16], xmm0
movdqa [rsp + 7*16], xmm0
inc CTRSave
mov CTR, CTRSave
bswap CTR
xor CTR, DWORD PTR [ctx + 3*4]
mov DWORD PTR [rsp + 1*16 + 3*4], CTR
inc CTRSave
mov CTR, CTRSave
bswap CTR
xor CTR, DWORD PTR [ctx + 3*4]
mov DWORD PTR [rsp + 2*16 + 3*4], CTR
inc CTRSave
mov CTR, CTRSave
bswap CTR
xor CTR, DWORD PTR [ctx + 3*4]
mov DWORD PTR [rsp + 3*16 + 3*4], CTR
inc CTRSave
mov CTR, CTRSave
bswap CTR
xor CTR, DWORD PTR [ctx + 3*4]
mov DWORD PTR [rsp + 4*16 + 3*4], CTR
inc CTRSave
mov CTR, CTRSave
bswap CTR
xor CTR, DWORD PTR [ctx + 3*4]
mov DWORD PTR [rsp + 5*16 + 3*4], CTR
inc CTRSave
mov CTR, CTRSave
bswap CTR
xor CTR, DWORD PTR [ctx + 3*4]
mov DWORD PTR [rsp + 6*16 + 3*4], CTR
inc CTRSave
mov CTR, CTRSave
bswap CTR
xor CTR, DWORD PTR [ctx + 3*4]
mov DWORD PTR [rsp + 7*16 + 3*4], CTR
loop8:
cmp inputLen, 8*16
jb loop1
movdqu xmm0, [0*16 + rsp]
movdqu xmm1, [1*16 + rsp]
movdqu xmm2, [2*16 + rsp]
movdqu xmm3, [3*16 + rsp]
movdqu xmm4, [4*16 + rsp]
movdqu xmm5, [5*16 + rsp]
movdqu xmm6, [6*16 + rsp]
movdqu xmm7, [7*16 + rsp]
i = 1
WHILE i LE 8
aes_rnd i
inc CTRSave
mov CTR, CTRSave
bswap CTR
xor CTR, DWORD PTR [ctx + 3*4]
mov DWORD PTR [rsp + (i-1)*16 + 3*4], CTR
i = i+1
ENDM
WHILE i LT rnds
aes_rnd i
i = i+1
ENDM
aes_last_rnd rnds
movdqu xmm8, [0*16 + input]
pxor xmm0, xmm8
movdqu xmm8, [1*16 + input]
pxor xmm1, xmm8
movdqu xmm8, [2*16 + input]
pxor xmm2, xmm8
movdqu xmm8, [3*16 + input]
pxor xmm3, xmm8
movdqu xmm8, [4*16 + input]
pxor xmm4, xmm8
movdqu xmm8, [5*16 + input]
pxor xmm5, xmm8
movdqu xmm8, [6*16 + input]
pxor xmm6, xmm8
movdqu xmm8, [7*16 + input]
pxor xmm7, xmm8
movdqu [0*16 + output], xmm0
movdqu [1*16 + output], xmm1
movdqu [2*16 + output], xmm2
movdqu [3*16 + output], xmm3
movdqu [4*16 + output], xmm4
movdqu [5*16 + output], xmm5
movdqu [6*16 + output], xmm6
movdqu [7*16 + output], xmm7
lea input, [8*16 + input]
lea output, [8*16 + output]
sub inputLen, 8*16
jmp loop8
loop1:
cmp inputLen, 1*16
jb bail
movdqu xmm0, [rsp]
add rsp, 16
i = 1
WHILE i LT rnds
movdqu xmm7, [i*16 + ctx]
aesenc xmm0, xmm7
i = i+1
ENDM
movdqu xmm7, [rnds*16 + ctx]
aesenclast xmm0, xmm7
movdqu xmm7, [input]
pxor xmm0, xmm7
movdqu [output], xmm0
lea input, [1*16 + input]
lea output, [1*16 + output]
sub inputLen, 1*16
jmp loop1
bail:
movdqu xmm0, [rsp]
movdqu xmm1, [ctx + 0*16]
pxor xmm0, xmm1
movdqu [16+ctrCtx], xmm0
xor rax, rax
mov rsp, rbp
pop rbp
movdqu xmm6, [rsp + 0*16]
movdqu xmm7, [rsp + 1*16]
movdqu xmm8, [rsp + 2*16]
add rsp, 3*16
ret
ENDM
intel_aes_encrypt_ctr_128 PROC
gen_aes_ctr_func 10
intel_aes_encrypt_ctr_128 ENDP
intel_aes_encrypt_ctr_192 PROC
gen_aes_ctr_func 12
intel_aes_encrypt_ctr_192 ENDP
intel_aes_encrypt_ctr_256 PROC
gen_aes_ctr_func 14
intel_aes_encrypt_ctr_256 ENDP
END