; Mirror of https://github.com/rn10950/RetroZilla.git (synced 2024-11-11).
; MASM (Intel syntax) x86-32 source: AES-NI ECB/CBC/CTR and key-expansion
; routines contributed to NSS by Intel.
; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com
; Assembler setup: 32-bit flat memory model, C calling convention,
; SSE/XMM instructions enabled.
.MODEL FLAT, C
.XMM

.DATA
ALIGN 16

; pshufb masks used by the key-expansion routines to rotate/broadcast the
; relevant word of the previous round key, and the round-constant vectors.
Lmask    dd 0c0f0e0dh, 0c0f0e0dh, 0c0f0e0dh, 0c0f0e0dh
Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
Lcon1    dd 1, 1, 1, 1              ; initial AES rcon (doubled each round)
Lcon2    dd 1bh, 1bh, 1bh, 1bh      ; rcon after wrap (x^8 reduced mod the AES poly)

.CODE

; Register roles shared by the bulk ECB/CBC/CTR functions below.
ctx      textequ <ecx>              ; -> expanded key schedule (context + 44)
output   textequ <edx>              ; -> destination buffer
input    textequ <eax>              ; -> source buffer
inputLen textequ <edi>              ; remaining byte count (callee-saved; pushed first)
; aes_rnd i -- one AES encryption round with round key i applied to the
; seven parallel block registers xmm0-xmm6. xmm7 is scratch (round key).
aes_rnd MACRO i
        movdqu  xmm7, [i*16 + ctx]      ; load round key i (unaligned-safe)
        aesenc  xmm0, xmm7
        aesenc  xmm1, xmm7
        aesenc  xmm2, xmm7
        aesenc  xmm3, xmm7
        aesenc  xmm4, xmm7
        aesenc  xmm5, xmm7
        aesenc  xmm6, xmm7
        ENDM
; aes_last_rnd i -- final AES encryption round (no MixColumns) with round
; key i applied to xmm0-xmm6. xmm7 is scratch (round key).
aes_last_rnd MACRO i
        movdqu  xmm7, [i*16 + ctx]      ; load final round key
        aesenclast xmm0, xmm7
        aesenclast xmm1, xmm7
        aesenclast xmm2, xmm7
        aesenclast xmm3, xmm7
        aesenclast xmm4, xmm7
        aesenclast xmm5, xmm7
        aesenclast xmm6, xmm7
        ENDM
; aes_dec_rnd i -- one AES decryption round with (InvMixColumns-transformed)
; round key i applied to xmm0-xmm6. xmm7 is scratch (round key).
aes_dec_rnd MACRO i
        movdqu  xmm7, [i*16 + ctx]      ; load decryption round key i
        aesdec  xmm0, xmm7
        aesdec  xmm1, xmm7
        aesdec  xmm2, xmm7
        aesdec  xmm3, xmm7
        aesdec  xmm4, xmm7
        aesdec  xmm5, xmm7
        aesdec  xmm6, xmm7
        ENDM
; aes_dec_last_rnd i -- final AES decryption round (no InvMixColumns) with
; round key i applied to xmm0-xmm6. xmm7 is scratch (round key).
aes_dec_last_rnd MACRO i
        movdqu  xmm7, [i*16 + ctx]      ; load final round key
        aesdeclast xmm0, xmm7
        aesdeclast xmm1, xmm7
        aesdeclast xmm2, xmm7
        aesdeclast xmm3, xmm7
        aesdeclast xmm4, xmm7
        aesdeclast xmm5, xmm7
        aesdeclast xmm6, xmm7
        ENDM
; gen_aes_ecb_func enc, rnds -- emit the body of an ECB bulk function.
;   enc  = 1 for encryption, 0 for decryption (selects aesenc vs aesdec)
;   rnds = number of AES rounds (10/12/14)
; Processes 7 blocks per iteration (pipelines the AES-NI units), then a
; 1-block tail loop. Stack args (after the edi push, so base esp+8):
;   +0: cx (AESContext*), +4: output, +8: outputLen*, +12: maxOutputLen,
;   +16: input, +20: inputLen  -- TODO confirm against the NSS C prototype.
; Returns 0 in eax. Clobbers xmm0-xmm7.
gen_aes_ecb_func MACRO enc, rnds

        LOCAL   loop7
        LOCAL   loop1
        LOCAL   bail

        push    inputLen                ; edi is callee-saved in cdecl

        mov     ctx,      [esp + 2*4 + 0*4]
        mov     output,   [esp + 2*4 + 1*4]
        mov     input,    [esp + 2*4 + 4*4]
        mov     inputLen, [esp + 2*4 + 5*4]

        lea     ctx, [44+ctx]           ; skip context header to the key schedule

loop7:
        cmp     inputLen, 7*16
        jb      loop1

        ; load 7 plaintext/ciphertext blocks
        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]

        ; whitening: xor round key 0 into all blocks
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7
        pxor    xmm1, xmm7
        pxor    xmm2, xmm7
        pxor    xmm3, xmm7
        pxor    xmm4, xmm7
        pxor    xmm5, xmm7
        pxor    xmm6, xmm7

        ; bind the direction-specific macros/instructions at expansion time
        IF enc eq 1
        rnd         textequ <aes_rnd>
        lastrnd     textequ <aes_last_rnd>
        aesinst     textequ <aesenc>
        aeslastinst textequ <aesenclast>
        ELSE
        rnd         textequ <aes_dec_rnd>
        lastrnd     textequ <aes_dec_last_rnd>
        aesinst     textequ <aesdec>
        aeslastinst textequ <aesdeclast>
        ENDIF

        ; rounds 1 .. rnds-1, then the final round
        i = 1
        WHILE i LT rnds
        rnd i
        i = i+1
        ENDM
        lastrnd rnds

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6

        lea     input,  [7*16 + input]
        lea     output, [7*16 + output]
        sub     inputLen, 7*16
        jmp     loop7

loop1:                                  ; 1-block tail loop
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
        WHILE i LT rnds
        movdqu  xmm7, [i*16 + ctx]
        aesinst xmm0, xmm7
        i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aeslastinst xmm0, xmm7

        movdqu  [output], xmm0

        lea     input,  [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:
        xor     eax, eax                ; return SECSuccess (0)
        pop     inputLen
        ret

        ENDM
; ECB entry points: one PROC per key size and direction; each simply
; expands gen_aes_ecb_func with the matching round count.
ALIGN 16
intel_aes_encrypt_ecb_128 PROC
        gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

ALIGN 16
intel_aes_encrypt_ecb_192 PROC
        gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

ALIGN 16
intel_aes_encrypt_ecb_256 PROC
        gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

ALIGN 16
intel_aes_decrypt_ecb_128 PROC
        gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

ALIGN 16
intel_aes_decrypt_ecb_192 PROC
        gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

ALIGN 16
intel_aes_decrypt_ecb_256 PROC
        gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP
; Register roles for the key-expansion (init) routines.
KEY textequ <ecx>                   ; -> raw cipher key
KS  textequ <edx>                   ; -> key-schedule output buffer
ITR textequ <eax>                   ; scratch: table pointer, then loop counter
ALIGN 16
;-----------------------------------------------------------------------
; intel_aes_encrypt_init_128(key, keySchedule)
; Expands a 128-bit AES key into the 11 round keys (rounds 0..10) at KS.
; In:  [esp+4] = key, [esp+8] = KS.  Clobbers: eax, xmm0-xmm4, flags.
; Uses aesenclast with the rcon vector to perform SubWord, and the
; Lmask pshufb to rotate/broadcast the last dword of the previous key.
;-----------------------------------------------------------------------
intel_aes_encrypt_init_128 PROC

        mov     KEY, [esp + 1*4 + 0*4]
        mov     KS,  [esp + 1*4 + 1*4]

        movdqu  xmm1, [KEY]             ; round key 0 = raw key
        movdqu  [KS], xmm1
        movdqa  xmm2, xmm1

        lea     ITR, Lcon1
        movdqa  xmm0, [ITR]             ; rcon vector, doubled every round
        lea     ITR, Lmask
        movdqa  xmm4, [ITR]             ; RotWord/broadcast shuffle mask

        mov     ITR, 8                  ; rounds 1..8 (rcon 01h..80h)

Lenc_128_ks_loop:
        lea     KS, [16 + KS]
        dec     ITR                     ; sets ZF for the jne below; the SSE
                                        ; instructions in between do not touch EFLAGS

        pshufb  xmm2, xmm4              ; rotate/broadcast last word of prev key
        aesenclast xmm2, xmm0           ; SubWord, then xor rcon
        pslld   xmm0, 1                 ; rcon *= 2
        movdqa  xmm3, xmm1              ; fold previous key words left-to-right
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2              ; new round key complete
        movdqu  [KS], xmm1
        movdqa  xmm2, xmm1

        jne     Lenc_128_ks_loop

        ; rounds 9 and 10 use rcon 1bh and 36h (Lcon2, then doubled)
        lea     ITR, Lcon2
        movdqa  xmm0, [ITR]

        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1
        movdqa  xmm3, xmm1
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [16 + KS], xmm1         ; round key 9
        movdqa  xmm2, xmm1

        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        movdqa  xmm3, xmm1
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [32 + KS], xmm1         ; round key 10
        movdqa  xmm2, xmm1

        ret
intel_aes_encrypt_init_128 ENDP
ALIGN 16
;-----------------------------------------------------------------------
; intel_aes_decrypt_init_128(key, keySchedule)
; Builds the AES-128 decryption key schedule: runs the encryption
; expansion, then reverses the 11 round keys and applies aesimc
; (InvMixColumns) to the 9 inner keys, as required by aesdec.
; In:  [esp+4] = key, [esp+8] = KS.
;-----------------------------------------------------------------------
intel_aes_decrypt_init_128 PROC

        mov     KEY, [esp + 1*4 + 0*4]
        mov     KS,  [esp + 1*4 + 1*4]

        push    KS                      ; re-push args for the nested cdecl call
        push    KEY

        call    intel_aes_encrypt_init_128

        pop     KEY
        pop     KS

        ; swap round keys 0 and 10 (outer keys are not aesimc-transformed)
        movdqu  xmm0, [0*16 + KS]
        movdqu  xmm1, [10*16 + KS]
        movdqu  [10*16 + KS], xmm0
        movdqu  [0*16 + KS], xmm1

        ; swap keys i and 10-i for i = 1..4, applying aesimc to both
        i = 1
        WHILE i LT 5
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(10-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(10-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
        ENDM

        ; middle key (5) stays in place but still needs aesimc
        movdqu  xmm0, [5*16 + KS]
        aesimc  xmm0, xmm0
        movdqu  [5*16 + KS], xmm0
        ret
intel_aes_decrypt_init_128 ENDP
ALIGN 16
;-----------------------------------------------------------------------
; intel_aes_encrypt_init_192(key, keySchedule)
; Expands a 192-bit AES key into the 13 round keys (rounds 0..12) at KS.
; The 192-bit key is held as xmm1 (low 128 bits) + xmm3 (high 64 bits);
; each loop iteration performs two expansion steps and emits 3 round keys.
; In:  [esp+4] = key, [esp+8] = KS.  Clobbers: eax, xmm0-xmm7, flags.
;-----------------------------------------------------------------------
intel_aes_encrypt_init_192 PROC

        mov     KEY, [esp + 1*4 + 0*4]
        mov     KS,  [esp + 1*4 + 1*4]

        pxor    xmm3, xmm3              ; xmm3 = key words 4,5 in low qword
        movdqu  xmm1, [KEY]
        pinsrd  xmm3, DWORD PTR [16 + KEY], 0
        pinsrd  xmm3, DWORD PTR [20 + KEY], 1

        movdqu  [KS], xmm1              ; round key 0
        movdqa  xmm5, xmm3

        lea     ITR, Lcon1
        movdqu  xmm0, [ITR]             ; rcon, doubled after each step
        lea     ITR, Lmask192
        movdqu  xmm4, [ITR]             ; shuffle mask for the 192-bit schedule

        mov     ITR, 4                  ; 4 iterations x 2 steps = 8 expansion steps

Lenc_192_ks_loop:
        ; --- expansion step A ---
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4              ; rotate/broadcast key word 5
        aesenclast xmm2, xmm0           ; SubWord + rcon
        pslld   xmm0, 1

        movdqa  xmm6, xmm1              ; fold xmm1 (words 0-3) and xmm3 (words 4-5)
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh        ; broadcast new word 3
        pxor    xmm3, xmm2

        ; repack: emit two round keys built from the 192-bit state
        movdqa  xmm6, xmm1
        shufpd  xmm5, xmm1, 00h         ; prev high qword | new low qword
        shufpd  xmm6, xmm3, 01h         ; new high qword  | new words 4,5

        movdqu  [16 + KS], xmm5
        movdqu  [32 + KS], xmm6

        ; --- expansion step B ---
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        movdqu  [48 + KS], xmm1         ; third round key of this iteration
        movdqa  xmm5, xmm3              ; carry words 4,5 into the next iteration

        lea     KS, [48 + KS]

        dec     ITR
        jnz     Lenc_192_ks_loop

        movdqu  [16 + KS], xmm5         ; final (13th) round key
        ret
intel_aes_encrypt_init_192 ENDP
ALIGN 16
;-----------------------------------------------------------------------
; intel_aes_decrypt_init_192(key, keySchedule)
; Builds the AES-192 decryption key schedule: encryption expansion,
; then reverse the 13 round keys and aesimc the 11 inner ones.
; In:  [esp+4] = key, [esp+8] = KS.
;-----------------------------------------------------------------------
intel_aes_decrypt_init_192 PROC
        mov     KEY, [esp + 1*4 + 0*4]
        mov     KS,  [esp + 1*4 + 1*4]

        push    KS                      ; re-push args for the nested cdecl call
        push    KEY

        call    intel_aes_encrypt_init_192

        pop     KEY
        pop     KS

        ; swap round keys 0 and 12 (outer keys stay untransformed)
        movdqu  xmm0, [0*16 + KS]
        movdqu  xmm1, [12*16 + KS]
        movdqu  [12*16 + KS], xmm0
        movdqu  [0*16 + KS], xmm1

        ; swap keys i and 12-i for i = 1..5, applying aesimc to both
        i = 1
        WHILE i LT 6
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(12-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(12-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
        ENDM

        ; middle key (6) stays in place but still needs aesimc
        movdqu  xmm0, [6*16 + KS]
        aesimc  xmm0, xmm0
        movdqu  [6*16 + KS], xmm0
        ret
intel_aes_decrypt_init_192 ENDP
ALIGN 16
;-----------------------------------------------------------------------
; intel_aes_encrypt_init_256(key, keySchedule)
; Expands a 256-bit AES key into the 15 round keys (rounds 0..14) at KS.
; The key is held in xmm1/xmm3; each loop iteration emits two round keys:
; one via SubWord+RotWord+rcon, one via SubWord only (aesenclast with a
; zero "round key" in xmm6 performs the SubWord).
; In:  [esp+4] = key, [esp+8] = KS.  Clobbers: eax, xmm0-xmm6, flags.
;-----------------------------------------------------------------------
intel_aes_encrypt_init_256 PROC

        mov     KEY, [esp + 1*4 + 0*4]
        mov     KS,  [esp + 1*4 + 1*4]
        movdqu  xmm1, [16*0 + KEY]      ; round key 0 = key low half
        movdqu  xmm3, [16*1 + KEY]      ; round key 1 = key high half

        movdqu  [16*0 + KS], xmm1
        movdqu  [16*1 + KS], xmm3

        lea     ITR, Lcon1
        movdqu  xmm0, [ITR]             ; rcon, doubled each iteration
        lea     ITR, Lmask256
        movdqu  xmm5, [ITR]             ; RotWord/broadcast shuffle mask

        pxor    xmm6, xmm6              ; zero key for SubWord-only aesenclast

        mov     ITR, 6                  ; 6 iterations x 2 keys, + 1 final key below

Lenc_256_ks_loop:

        ; even round key: RotWord+SubWord+rcon on last word of xmm3
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast xmm2, xmm0
        pslld   xmm0, 1                 ; rcon *= 2
        movdqa  xmm4, xmm1              ; fold previous even key
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        ; odd round key: SubWord only (no rotate, no rcon)
        pshufd  xmm2, xmm1, 0ffh        ; broadcast last word of new even key
        aesenclast xmm2, xmm6           ; SubWord via zero round key
        movdqa  xmm4, xmm3              ; fold previous odd key
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2
        movdqu  [16*3 + KS], xmm3

        lea     KS, [32 + KS]
        dec     ITR
        jnz     Lenc_256_ks_loop

        ; final (15th) round key: one more even-style step
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast xmm2, xmm0
        movdqa  xmm4, xmm1
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        ret
intel_aes_encrypt_init_256 ENDP
ALIGN 16
;-----------------------------------------------------------------------
; intel_aes_decrypt_init_256(key, keySchedule)
; Builds the AES-256 decryption key schedule: encryption expansion,
; then reverse the 15 round keys and aesimc the 13 inner ones.
; In:  [esp+4] = key, [esp+8] = KS.
;-----------------------------------------------------------------------
intel_aes_decrypt_init_256 PROC
        mov     KEY, [esp + 1*4 + 0*4]
        mov     KS,  [esp + 1*4 + 1*4]

        push    KS                      ; re-push args for the nested cdecl call
        push    KEY

        call    intel_aes_encrypt_init_256

        pop     KEY
        pop     KS

        ; swap round keys 0 and 14 (outer keys stay untransformed)
        movdqu  xmm0, [0*16 + KS]
        movdqu  xmm1, [14*16 + KS]
        movdqu  [14*16 + KS], xmm0
        movdqu  [0*16 + KS], xmm1

        ; swap keys i and 14-i for i = 1..6, applying aesimc to both
        i = 1
        WHILE i LT 7
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(14-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(14-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
        ENDM

        ; middle key (7) stays in place but still needs aesimc
        movdqu  xmm0, [7*16 + KS]
        aesimc  xmm0, xmm0
        movdqu  [7*16 + KS], xmm0
        ret
intel_aes_decrypt_init_256 ENDP
; gen_aes_cbc_enc_func rnds -- emit the body of a CBC encryption function.
; CBC encryption is inherently serial (each block depends on the previous
; ciphertext), so this processes one block at a time; round keys 1-4 are
; cached in xmm3-xmm6 to cut memory traffic on the critical path.
; The chaining value (IV) is read from and written back to [ctx-32]
; (offset 12 in the context, given the +44 adjustment below).
; Stack args match gen_aes_ecb_func. Returns 0 in eax. Clobbers xmm0-xmm7.
gen_aes_cbc_enc_func MACRO rnds

        LOCAL   loop1
        LOCAL   bail

        push    inputLen                ; edi is callee-saved in cdecl

        mov     ctx,      [esp + 2*4 + 0*4]
        mov     output,   [esp + 2*4 + 1*4]
        mov     input,    [esp + 2*4 + 4*4]
        mov     inputLen, [esp + 2*4 + 5*4]

        lea     ctx, [44+ctx]           ; skip context header to the key schedule

        movdqu  xmm0, [-32+ctx]         ; xmm0 = chaining value (IV)

        ; cache round keys 0..4
        movdqu  xmm2, [0*16 + ctx]
        movdqu  xmm3, [1*16 + ctx]
        movdqu  xmm4, [2*16 + ctx]
        movdqu  xmm5, [3*16 + ctx]
        movdqu  xmm6, [4*16 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm1, [input]
        pxor    xmm1, xmm2              ; plaintext xor round key 0
        pxor    xmm0, xmm1              ; xor chaining value (CBC)

        aesenc  xmm0, xmm3              ; rounds 1..4 from cached keys
        aesenc  xmm0, xmm4
        aesenc  xmm0, xmm5
        aesenc  xmm0, xmm6

        ; rounds 5 .. rnds-1 from memory, then the final round
        i = 5
        WHILE i LT rnds
        movdqu  xmm7, [i*16 + ctx]
        aesenc  xmm0, xmm7
        i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  [output], xmm0          ; ciphertext is also the next chaining value

        lea     input,  [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:
        movdqu  [-32+ctx], xmm0         ; persist chaining value for the next call

        xor     eax, eax                ; return SECSuccess (0)
        pop     inputLen
        ret

        ENDM
; gen_aes_cbc_dec_func rnds -- emit the body of a CBC decryption function.
; CBC decryption parallelizes (each plaintext = D(ciphertext_i) xor
; ciphertext_{i-1}), so 7 blocks are decrypted per iteration, then a
; serial 1-block tail loop. The chaining value lives at [ctx-32] as in
; the encryption path. Stack args match gen_aes_ecb_func.
; Returns 0 in eax. Clobbers xmm0-xmm7.
gen_aes_cbc_dec_func MACRO rnds

        LOCAL   loop7
        LOCAL   loop1
        LOCAL   dec1
        LOCAL   bail

        push    inputLen                ; edi is callee-saved in cdecl

        mov     ctx,      [esp + 2*4 + 0*4]
        mov     output,   [esp + 2*4 + 1*4]
        mov     input,    [esp + 2*4 + 4*4]
        mov     inputLen, [esp + 2*4 + 5*4]

        lea     ctx, [44+ctx]           ; skip context header to the key schedule

loop7:
        cmp     inputLen, 7*16
        jb      dec1

        ; load 7 ciphertext blocks
        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]

        ; whitening: xor round key 0
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7
        pxor    xmm1, xmm7
        pxor    xmm2, xmm7
        pxor    xmm3, xmm7
        pxor    xmm4, xmm7
        pxor    xmm5, xmm7
        pxor    xmm6, xmm7

        ; rounds 1 .. rnds-1, then the final round
        i = 1
        WHILE i LT rnds
        aes_dec_rnd i
        i = i+1
        ENDM
        aes_dec_last_rnd rnds

        ; CBC unchain: xor each decrypted block with the previous ciphertext
        movdqu  xmm7, [-32 + ctx]       ; chaining value from the last call
        pxor    xmm0, xmm7
        movdqu  xmm7, [0*16 + input]
        pxor    xmm1, xmm7
        movdqu  xmm7, [1*16 + input]
        pxor    xmm2, xmm7
        movdqu  xmm7, [2*16 + input]
        pxor    xmm3, xmm7
        movdqu  xmm7, [3*16 + input]
        pxor    xmm4, xmm7
        movdqu  xmm7, [4*16 + input]
        pxor    xmm5, xmm7
        movdqu  xmm7, [5*16 + input]
        pxor    xmm6, xmm7
        movdqu  xmm7, [6*16 + input]    ; last ciphertext = next chaining value

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [-32 + ctx], xmm7       ; persist chaining value

        lea     input,  [7*16 + input]
        lea     output, [7*16 + output]
        sub     inputLen, 7*16
        jmp     loop7

dec1:
        movdqu  xmm3, [-32 + ctx]       ; xmm3 = chaining value for the tail loop

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqa  xmm4, xmm0              ; keep ciphertext: next chaining value
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
        WHILE i LT rnds
        movdqu  xmm7, [i*16 + ctx]
        aesdec  xmm0, xmm7
        i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesdeclast xmm0, xmm7
        pxor    xmm3, xmm0              ; plaintext = D(c_i) xor c_{i-1}

        movdqu  [output], xmm3
        movdqa  xmm3, xmm4              ; advance chaining value

        lea     input,  [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:
        movdqu  [-32 + ctx], xmm3       ; persist chaining value
        xor     eax, eax                ; return SECSuccess (0)
        pop     inputLen
        ret
        ENDM
; CBC entry points: one PROC per key size and direction; each expands the
; matching CBC macro with the correct round count.
ALIGN 16
intel_aes_encrypt_cbc_128 PROC
        gen_aes_cbc_enc_func 10
intel_aes_encrypt_cbc_128 ENDP

ALIGN 16
intel_aes_encrypt_cbc_192 PROC
        gen_aes_cbc_enc_func 12
intel_aes_encrypt_cbc_192 ENDP

ALIGN 16
intel_aes_encrypt_cbc_256 PROC
        gen_aes_cbc_enc_func 14
intel_aes_encrypt_cbc_256 ENDP

ALIGN 16
intel_aes_decrypt_cbc_128 PROC
        gen_aes_cbc_dec_func 10
intel_aes_decrypt_cbc_128 ENDP

ALIGN 16
intel_aes_decrypt_cbc_192 PROC
        gen_aes_cbc_dec_func 12
intel_aes_decrypt_cbc_192 ENDP

ALIGN 16
intel_aes_decrypt_cbc_256 PROC
        gen_aes_cbc_dec_func 14
intel_aes_decrypt_cbc_256 ENDP
; Additional register roles for the CTR-mode function (both callee-saved).
ctrCtx textequ <esi>                ; -> CTRContext, later the running counter
CTR    textequ <ebx>                ; scratch for the byte-swapped counter word
; gen_aes_ctr_func rnds -- emit the body of a CTR-mode encryption function.
; Keeps 7 counter blocks, pre-xored with round key 0, in a 16-byte-aligned
; stack buffer; only the low (big-endian) counter word differs between them.
; Counter increments are interleaved with the first 7 AES rounds to hide
; their latency. Layout assumptions (NOTE(review): confirm against the NSS
; CTRContext definition): [ctrCtx+4] = AESContext*, [ctrCtx+8] = 16-byte
; counter block, whose last dword is the big-endian block counter.
; Returns 0 in eax. Clobbers xmm0-xmm7.
gen_aes_ctr_func MACRO rnds

        LOCAL   loop7
        LOCAL   loop1
        LOCAL   enc1
        LOCAL   bail

        push    inputLen                ; callee-saved registers
        push    ctrCtx
        push    CTR
        push    ebp

        mov     ctrCtx,   [esp + 4*5 + 0*4]
        mov     output,   [esp + 4*5 + 1*4]
        mov     input,    [esp + 4*5 + 4*4]
        mov     inputLen, [esp + 4*5 + 5*4]

        mov     ctx, [4+ctrCtx]         ; AESContext pointer
        lea     ctx, [44+ctx]           ; skip header to the key schedule

        mov     ebp, esp                ; frame pointer: esp is about to be realigned
        sub     esp, 7*16
        and     esp, -16                ; 16-byte align for movdqa below

        movdqu  xmm0, [8+ctrCtx]        ; current counter block
        mov     ctrCtx, [ctrCtx + 8 + 3*4]
        bswap   ctrCtx                  ; ctrCtx now holds the counter, host-endian
        movdqu  xmm1, [ctx + 0*16]

        pxor    xmm0, xmm1              ; pre-xor round key 0 into the counter block

        ; replicate into 7 stack slots; slots 1..6 get bumped counters below
        movdqa  [esp + 0*16], xmm0
        movdqa  [esp + 1*16], xmm0
        movdqa  [esp + 2*16], xmm0
        movdqa  [esp + 3*16], xmm0
        movdqa  [esp + 4*16], xmm0
        movdqa  [esp + 5*16], xmm0
        movdqa  [esp + 6*16], xmm0

        ; patch the counter dword of slots 1..6: counter+k, byte-swapped,
        ; re-xored with the matching dword of round key 0
        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 1*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 2*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 3*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 4*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 5*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 6*16 + 3*4], CTR

loop7:
        cmp     inputLen, 7*16
        jb      loop1

        ; load the 7 pre-whitened counter blocks
        movdqu  xmm0, [0*16 + esp]
        movdqu  xmm1, [1*16 + esp]
        movdqu  xmm2, [2*16 + esp]
        movdqu  xmm3, [3*16 + esp]
        movdqu  xmm4, [4*16 + esp]
        movdqu  xmm5, [5*16 + esp]
        movdqu  xmm6, [6*16 + esp]

        ; rounds 1..7, each interleaved with one counter update for the
        ; NEXT iteration's stack slots (hides the inc/bswap/xor latency)
        i = 1
        WHILE i LE 7
        aes_rnd i

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + (i-1)*16 + 3*4], CTR

        i = i+1
        ENDM
        ; remaining rounds 8 .. rnds-1, then the final round
        WHILE i LT rnds
        aes_rnd i
        i = i+1
        ENDM
        aes_last_rnd rnds

        ; xor keystream with input
        movdqu  xmm7, [0*16 + input]
        pxor    xmm0, xmm7
        movdqu  xmm7, [1*16 + input]
        pxor    xmm1, xmm7
        movdqu  xmm7, [2*16 + input]
        pxor    xmm2, xmm7
        movdqu  xmm7, [3*16 + input]
        pxor    xmm3, xmm7
        movdqu  xmm7, [4*16 + input]
        pxor    xmm4, xmm7
        movdqu  xmm7, [5*16 + input]
        pxor    xmm5, xmm7
        movdqu  xmm7, [6*16 + input]
        pxor    xmm6, xmm7

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6

        lea     input,  [7*16 + input]
        lea     output, [7*16 + output]
        sub     inputLen, 7*16
        jmp     loop7

loop1:                                  ; 1-block tail loop
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [esp]             ; next pre-whitened counter block
        add     esp, 16                 ; consume the slot

        i = 1
        WHILE i LT rnds
        movdqu  xmm7, [i*16 + ctx]
        aesenc  xmm0, xmm7
        i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  xmm7, [input]
        pxor    xmm0, xmm7
        movdqu  [output], xmm0

        lea     input,  [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:
        ; write the next unused counter block back to the context
        ; (undo the round-key-0 pre-xor first)
        mov     ctrCtx, [ebp + 4*5 + 0*4]
        movdqu  xmm0, [esp]
        movdqu  xmm1, [ctx + 0*16]
        pxor    xmm0, xmm1
        movdqu  [8+ctrCtx], xmm0

        xor     eax, eax                ; return SECSuccess (0)
        mov     esp, ebp                ; release the aligned stack buffer
        pop     ebp
        pop     CTR
        pop     ctrCtx
        pop     inputLen
        ret
        ENDM
; CTR entry points (encryption and decryption are identical in CTR mode).
ALIGN 16
intel_aes_encrypt_ctr_128 PROC
        gen_aes_ctr_func 10
intel_aes_encrypt_ctr_128 ENDP

ALIGN 16
intel_aes_encrypt_ctr_192 PROC
        gen_aes_ctr_func 12
intel_aes_encrypt_ctr_192 ENDP

ALIGN 16
intel_aes_encrypt_ctr_256 PROC
        gen_aes_ctr_func 14
intel_aes_encrypt_ctr_256 ENDP

END