; RetroZilla/security/nss/lib/freebl/intel-aes-x86-masm.asm

; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com


.MODEL FLAT, C
.XMM

.DATA
ALIGN 16
Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
Lcon1 dd 1,1,1,1
Lcon2 dd 1bh,1bh,1bh,1bh

.CODE

ctx     textequ <ecx>
output  textequ <edx>
input   textequ <eax>
inputLen textequ <edi>


aes_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesenc  xmm0, xmm7
    aesenc  xmm1, xmm7
    aesenc  xmm2, xmm7
    aesenc  xmm3, xmm7
    aesenc  xmm4, xmm7
    aesenc  xmm5, xmm7
    aesenc  xmm6, xmm7
    ENDM

aes_last_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesenclast  xmm0, xmm7
    aesenclast  xmm1, xmm7
    aesenclast  xmm2, xmm7
    aesenclast  xmm3, xmm7
    aesenclast  xmm4, xmm7
    aesenclast  xmm5, xmm7
    aesenclast  xmm6, xmm7
    ENDM

aes_dec_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesdec  xmm0, xmm7
    aesdec  xmm1, xmm7
    aesdec  xmm2, xmm7
    aesdec  xmm3, xmm7
    aesdec  xmm4, xmm7
    aesdec  xmm5, xmm7
    aesdec  xmm6, xmm7
    ENDM

aes_dec_last_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesdeclast  xmm0, xmm7
    aesdeclast  xmm1, xmm7
    aesdeclast  xmm2, xmm7
    aesdeclast  xmm3, xmm7
    aesdeclast  xmm4, xmm7
    aesdeclast  xmm5, xmm7
    aesdeclast  xmm6, xmm7
    ENDM


gen_aes_ecb_func MACRO enc, rnds

LOCAL   loop7
LOCAL   loop1
LOCAL   bail

        push    inputLen

        mov     ctx,    [esp + 2*4 + 0*4]
        mov     output,     [esp + 2*4 + 1*4]
        mov     input,      [esp + 2*4 + 4*4]
        mov     inputLen,   [esp + 2*4 + 5*4]

        lea     ctx, [44+ctx]

loop7:
        cmp     inputLen, 7*16
        jb      loop1

        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]

        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7
        pxor    xmm1, xmm7
        pxor    xmm2, xmm7
        pxor    xmm3, xmm7
        pxor    xmm4, xmm7
        pxor    xmm5, xmm7
        pxor    xmm6, xmm7

IF enc eq 1
        rnd textequ <aes_rnd>
        lastrnd textequ <aes_last_rnd>
        aesinst textequ <aesenc>
        aeslastinst textequ <aesenclast>
ELSE
        rnd textequ <aes_dec_rnd>
        lastrnd textequ <aes_dec_last_rnd>
        aesinst textequ <aesdec>
        aeslastinst textequ <aesdeclast>
ENDIF

        i = 1
        WHILE i LT rnds
            rnd i
            i = i+1
            ENDM
        lastrnd rnds

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6

        lea input, [7*16 + input]
        lea output, [7*16 + output]
        sub inputLen, 7*16
        jmp loop7

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesinst  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aeslastinst xmm0, xmm7

        movdqu  [output], xmm0

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        xor eax, eax
        pop     inputLen
        ret

ENDM

ALIGN 16
intel_aes_encrypt_ecb_128 PROC
gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

ALIGN 16
intel_aes_encrypt_ecb_192 PROC
gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

ALIGN 16
intel_aes_encrypt_ecb_256 PROC
gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

ALIGN 16
intel_aes_decrypt_ecb_128 PROC
gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

ALIGN 16
intel_aes_decrypt_ecb_192 PROC
gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

ALIGN 16
intel_aes_decrypt_ecb_256 PROC
gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP


KEY textequ <ecx>
KS  textequ <edx>
ITR textequ <eax>

ALIGN 16
intel_aes_encrypt_init_128  PROC

    mov     KEY,        [esp + 1*4 + 0*4]
    mov     KS,         [esp + 1*4 + 1*4]


    movdqu  xmm1, [KEY]
    movdqu  [KS], xmm1
    movdqa  xmm2, xmm1

    lea ITR, Lcon1
    movdqa  xmm0, [ITR]
    lea ITR, Lmask
    movdqa  xmm4, [ITR]

    mov ITR, 8

Lenc_128_ks_loop:
        lea KS, [16 + KS]
        dec ITR

        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1
        movdqa  xmm3, xmm1
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [KS], xmm1
        movdqa  xmm2, xmm1

        jne Lenc_128_ks_loop

    lea ITR, Lcon2
    movdqa  xmm0, [ITR]

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    pslld   xmm0, 1
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    movdqu  [16 + KS], xmm1
    movdqa  xmm2, xmm1

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    movdqu  [32 + KS], xmm1
    movdqa  xmm2, xmm1

    ret
intel_aes_encrypt_init_128  ENDP


ALIGN 16
intel_aes_decrypt_init_128  PROC

    mov     KEY,        [esp + 1*4 + 0*4]
    mov     KS,         [esp + 1*4 + 1*4]

    push    KS
    push    KEY

    call    intel_aes_encrypt_init_128

    pop     KEY
    pop     KS

    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [10*16 + KS]
    movdqu  [10*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    i = 1
    WHILE i LT 5
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(10-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(10-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu  xmm0, [5*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [5*16 + KS], xmm0
    ret
intel_aes_decrypt_init_128  ENDP


ALIGN 16
intel_aes_encrypt_init_192  PROC

    mov     KEY, [esp + 1*4 + 0*4]
    mov     KS,  [esp + 1*4 + 1*4]

    pxor    xmm3, xmm3
    movdqu  xmm1, [KEY]
    pinsrd  xmm3, DWORD PTR [16 + KEY], 0
    pinsrd  xmm3, DWORD PTR [20 + KEY], 1

    movdqu  [KS], xmm1
    movdqa  xmm5, xmm3

    lea ITR, Lcon1
    movdqu  xmm0, [ITR]
    lea ITR, Lmask192
    movdqu  xmm4, [ITR]

    mov ITR, 4

Lenc_192_ks_loop:
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        movdqa  xmm6, xmm1
        shufpd  xmm5, xmm1, 00h
        shufpd  xmm6, xmm3, 01h

        movdqu  [16 + KS], xmm5
        movdqu  [32 + KS], xmm6

        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        movdqu  [48 + KS], xmm1
        movdqa  xmm5, xmm3

        lea KS, [48 + KS]

        dec ITR
        jnz Lenc_192_ks_loop

    movdqu  [16 + KS], xmm5
ret
intel_aes_encrypt_init_192  ENDP

ALIGN 16
intel_aes_decrypt_init_192  PROC
    mov     KEY,        [esp + 1*4 + 0*4]
    mov     KS,         [esp + 1*4 + 1*4]

    push    KS
    push    KEY

    call    intel_aes_encrypt_init_192

    pop     KEY
    pop     KS

    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [12*16 + KS]
    movdqu  [12*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    i = 1
    WHILE i LT 6
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(12-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(12-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu  xmm0, [6*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [6*16 + KS], xmm0
    ret
intel_aes_decrypt_init_192  ENDP

ALIGN 16
intel_aes_encrypt_init_256  PROC

    mov     KEY,    [esp + 1*4 + 0*4]
    mov     KS,     [esp + 1*4 + 1*4]
    movdqu  xmm1, [16*0 + KEY]
    movdqu  xmm3, [16*1 + KEY]

    movdqu  [16*0 + KS], xmm1
    movdqu  [16*1 + KS], xmm3

    lea ITR, Lcon1
    movdqu  xmm0, [ITR]
    lea ITR, Lmask256
    movdqu  xmm5, [ITR]

    pxor    xmm6, xmm6

    mov ITR, 6

Lenc_256_ks_loop:

        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1
        movdqa  xmm4, xmm1
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        pshufd  xmm2, xmm1, 0ffh
        aesenclast  xmm2, xmm6
        movdqa  xmm4, xmm3
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2
        movdqu  [16*3 + KS], xmm3

        lea KS, [32 + KS]
        dec ITR
        jnz Lenc_256_ks_loop

    movdqa  xmm2, xmm3
    pshufb  xmm2, xmm5
    aesenclast  xmm2, xmm0
    movdqa  xmm4, xmm1
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pxor    xmm1, xmm2
    movdqu  [16*2 + KS], xmm1

    ret
intel_aes_encrypt_init_256  ENDP

ALIGN 16
intel_aes_decrypt_init_256  PROC
    mov     KEY,        [esp + 1*4 + 0*4]
    mov     KS,         [esp + 1*4 + 1*4]

    push    KS
    push    KEY

    call    intel_aes_encrypt_init_256

    pop     KEY
    pop     KS

    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [14*16 + KS]
    movdqu  [14*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    i = 1
    WHILE i LT 7
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(14-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(14-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu  xmm0, [7*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [7*16 + KS], xmm0
    ret
intel_aes_decrypt_init_256  ENDP


gen_aes_cbc_enc_func MACRO rnds

LOCAL   loop1
LOCAL   bail

        push    inputLen

        mov     ctx,    [esp + 2*4 + 0*4]
        mov     output,     [esp + 2*4 + 1*4]
        mov     input,      [esp + 2*4 + 4*4]
        mov     inputLen,   [esp + 2*4 + 5*4]

        lea     ctx, [44+ctx]

        movdqu  xmm0, [-32+ctx]

        movdqu  xmm2, [0*16 + ctx]
        movdqu  xmm3, [1*16 + ctx]
        movdqu  xmm4, [2*16 + ctx]
        movdqu  xmm5, [3*16 + ctx]
        movdqu  xmm6, [4*16 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm1, [input]
        pxor    xmm1, xmm2
        pxor    xmm0, xmm1

        aesenc  xmm0, xmm3
        aesenc  xmm0, xmm4
        aesenc  xmm0, xmm5
        aesenc  xmm0, xmm6

        i = 5
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  [output], xmm0

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        movdqu  [-32+ctx], xmm0

        xor eax, eax
        pop inputLen
        ret

ENDM

gen_aes_cbc_dec_func MACRO rnds

LOCAL   loop7
LOCAL   loop1
LOCAL   dec1
LOCAL   bail

        push    inputLen

        mov     ctx,    [esp + 2*4 + 0*4]
        mov     output,     [esp + 2*4 + 1*4]
        mov     input,      [esp + 2*4 + 4*4]
        mov     inputLen,   [esp + 2*4 + 5*4]

        lea     ctx, [44+ctx]

loop7:
        cmp     inputLen, 7*16
        jb      dec1

        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]

        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7
        pxor    xmm1, xmm7
        pxor    xmm2, xmm7
        pxor    xmm3, xmm7
        pxor    xmm4, xmm7
        pxor    xmm5, xmm7
        pxor    xmm6, xmm7

        i = 1
        WHILE i LT rnds
            aes_dec_rnd i
            i = i+1
            ENDM
        aes_dec_last_rnd rnds

        movdqu  xmm7, [-32 + ctx]
        pxor    xmm0, xmm7
        movdqu  xmm7, [0*16 + input]
        pxor    xmm1, xmm7
        movdqu  xmm7, [1*16 + input]
        pxor    xmm2, xmm7
        movdqu  xmm7, [2*16 + input]
        pxor    xmm3, xmm7
        movdqu  xmm7, [3*16 + input]
        pxor    xmm4, xmm7
        movdqu  xmm7, [4*16 + input]
        pxor    xmm5, xmm7
        movdqu  xmm7, [5*16 + input]
        pxor    xmm6, xmm7
        movdqu  xmm7, [6*16 + input]

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [-32 + ctx], xmm7

        lea input, [7*16 + input]
        lea output, [7*16 + output]
        sub inputLen, 7*16
        jmp loop7
dec1:

        movdqu  xmm3, [-32 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqa  xmm4, xmm0
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesdec  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesdeclast xmm0, xmm7
        pxor    xmm3, xmm0

        movdqu  [output], xmm3
        movdqa  xmm3, xmm4

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        movdqu  [-32 + ctx], xmm3
        xor eax, eax
        pop     inputLen
        ret
ENDM

ALIGN 16
intel_aes_encrypt_cbc_128 PROC
gen_aes_cbc_enc_func  10
intel_aes_encrypt_cbc_128 ENDP

ALIGN 16
intel_aes_encrypt_cbc_192 PROC
gen_aes_cbc_enc_func  12
intel_aes_encrypt_cbc_192 ENDP

ALIGN 16
intel_aes_encrypt_cbc_256 PROC
gen_aes_cbc_enc_func  14
intel_aes_encrypt_cbc_256 ENDP

ALIGN 16
intel_aes_decrypt_cbc_128 PROC
gen_aes_cbc_dec_func  10
intel_aes_decrypt_cbc_128 ENDP

ALIGN 16
intel_aes_decrypt_cbc_192 PROC
gen_aes_cbc_dec_func  12
intel_aes_decrypt_cbc_192 ENDP

ALIGN 16
intel_aes_decrypt_cbc_256 PROC
gen_aes_cbc_dec_func  14
intel_aes_decrypt_cbc_256 ENDP


ctrCtx textequ <esi>
CTR textequ <ebx>

gen_aes_ctr_func MACRO rnds

LOCAL   loop7
LOCAL   loop1
LOCAL   enc1
LOCAL   bail

        push    inputLen
        push    ctrCtx
        push    CTR
        push    ebp

        mov     ctrCtx, [esp + 4*5 + 0*4]
        mov     output, [esp + 4*5 + 1*4]
        mov     input,  [esp + 4*5 + 4*4]
        mov     inputLen, [esp + 4*5 + 5*4]

        mov     ctx, [4+ctrCtx]
        lea     ctx, [44+ctx]

        mov     ebp, esp
        sub     esp, 7*16
        and     esp, -16

        movdqu  xmm0, [8+ctrCtx]
        mov     ctrCtx, [ctrCtx + 8 + 3*4]
        bswap   ctrCtx
        movdqu  xmm1, [ctx + 0*16]

        pxor    xmm0, xmm1

        movdqa  [esp + 0*16], xmm0
        movdqa  [esp + 1*16], xmm0
        movdqa  [esp + 2*16], xmm0
        movdqa  [esp + 3*16], xmm0
        movdqa  [esp + 4*16], xmm0
        movdqa  [esp + 5*16], xmm0
        movdqa  [esp + 6*16], xmm0

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 1*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 2*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 3*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 4*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 5*16 + 3*4], CTR

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + 6*16 + 3*4], CTR


loop7:
        cmp     inputLen, 7*16
        jb      loop1

        movdqu  xmm0, [0*16 + esp]
        movdqu  xmm1, [1*16 + esp]
        movdqu  xmm2, [2*16 + esp]
        movdqu  xmm3, [3*16 + esp]
        movdqu  xmm4, [4*16 + esp]
        movdqu  xmm5, [5*16 + esp]
        movdqu  xmm6, [6*16 + esp]

        i = 1
        WHILE i LE 7
            aes_rnd i

            inc     ctrCtx
            mov     CTR, ctrCtx
            bswap   CTR
            xor     CTR, [ctx + 3*4]
            mov     [esp + (i-1)*16 + 3*4], CTR

            i = i+1
        ENDM
        WHILE i LT rnds
            aes_rnd i
            i = i+1
            ENDM
        aes_last_rnd rnds

        movdqu  xmm7, [0*16 + input]
        pxor    xmm0, xmm7
        movdqu  xmm7, [1*16 + input]
        pxor    xmm1, xmm7
        movdqu  xmm7, [2*16 + input]
        pxor    xmm2, xmm7
        movdqu  xmm7, [3*16 + input]
        pxor    xmm3, xmm7
        movdqu  xmm7, [4*16 + input]
        pxor    xmm4, xmm7
        movdqu  xmm7, [5*16 + input]
        pxor    xmm5, xmm7
        movdqu  xmm7, [6*16 + input]
        pxor    xmm6, xmm7

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6

        lea input, [7*16 + input]
        lea output, [7*16 + output]
        sub inputLen, 7*16
        jmp loop7


loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [esp]
        add     esp, 16

        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  xmm7, [input]
        pxor    xmm0, xmm7
        movdqu  [output], xmm0

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:

        mov     ctrCtx, [ebp + 4*5 + 0*4]
        movdqu  xmm0, [esp]
        movdqu  xmm1, [ctx + 0*16]
        pxor    xmm0, xmm1
        movdqu  [8+ctrCtx], xmm0


        xor     eax, eax
        mov     esp, ebp
        pop     ebp
        pop     CTR
        pop     ctrCtx
        pop     inputLen
        ret
ENDM


ALIGN 16
intel_aes_encrypt_ctr_128 PROC
gen_aes_ctr_func  10
intel_aes_encrypt_ctr_128 ENDP

ALIGN 16
intel_aes_encrypt_ctr_192 PROC
gen_aes_ctr_func  12
intel_aes_encrypt_ctr_192 ENDP

ALIGN 16
intel_aes_encrypt_ctr_256 PROC
gen_aes_ctr_func  14
intel_aes_encrypt_ctr_256 ENDP


END
cherry-picked mozilla NSS upstream changes (to rev f7a4c771997e, which is on par with 3.16.1 but without windows rand() changes): 9934c8faef29, 3c3b381c4865, 5a67f6beee9a, 1b1eb6d77728, a8b668fd72f7, bug962760, bug743700, bug857304, bug972653, bug972450, bug971358, bug903885, bug977073, bug976111, bug949939, bug947653, bug947572, bug903885, bug979106, bug966596, bug979004, bug979752, bug980848, bug938369, bug981170, bug668130, bug974693, bug975056, bug979132, bug370717, bug979070, bug985070, bug900067, bug977673, bug519255, bug989558, bug557299, bug987263, bug369802, a751a5146718, bug992343, bug952572, bug979703, bug994883, bug994869, bug993489, bug984608, bug977869, bug667371, bug672828, bug793347, bug977869 2018-07-10 17:07:31 +02:00			`; LICENSE:`
			`; This submission to NSS is to be made available under the terms of the`
			`; Mozilla Public License, v. 2.0. You can obtain one at http:`
			`; //mozilla.org/MPL/2.0/.`
			`;###############################################################################`
			`; Copyright(c) 2014, Intel Corp.`
			`; Developers and authors:`
			`; Shay Gueron and Vlad Krasnov`
			`; Intel Corporation, Israel Development Centre, Haifa, Israel`
			`; Please send feedback directly to crypto.feedback.alias@intel.com`


			`.MODEL FLAT, C`
			`.XMM`

			`.DATA`
			`ALIGN 16`
			`Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh`
			`Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h`
			`Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh`
			`Lcon1 dd 1,1,1,1`
			`Lcon2 dd 1bh,1bh,1bh,1bh`

			`.CODE`

			`ctx textequ <ecx>`
			`output textequ <edx>`
			`input textequ <eax>`
			`inputLen textequ <edi>`


			`aes_rnd MACRO i`
			`movdqu xmm7, [i*16 + ctx]`
			`aesenc xmm0, xmm7`
			`aesenc xmm1, xmm7`
			`aesenc xmm2, xmm7`
			`aesenc xmm3, xmm7`
			`aesenc xmm4, xmm7`
			`aesenc xmm5, xmm7`
			`aesenc xmm6, xmm7`
			`ENDM`

			`aes_last_rnd MACRO i`
			`movdqu xmm7, [i*16 + ctx]`
			`aesenclast xmm0, xmm7`
			`aesenclast xmm1, xmm7`
			`aesenclast xmm2, xmm7`
			`aesenclast xmm3, xmm7`
			`aesenclast xmm4, xmm7`
			`aesenclast xmm5, xmm7`
			`aesenclast xmm6, xmm7`
			`ENDM`

			`aes_dec_rnd MACRO i`
			`movdqu xmm7, [i*16 + ctx]`
			`aesdec xmm0, xmm7`
			`aesdec xmm1, xmm7`
			`aesdec xmm2, xmm7`
			`aesdec xmm3, xmm7`
			`aesdec xmm4, xmm7`
			`aesdec xmm5, xmm7`
			`aesdec xmm6, xmm7`
			`ENDM`

			`aes_dec_last_rnd MACRO i`
			`movdqu xmm7, [i*16 + ctx]`
			`aesdeclast xmm0, xmm7`
			`aesdeclast xmm1, xmm7`
			`aesdeclast xmm2, xmm7`
			`aesdeclast xmm3, xmm7`
			`aesdeclast xmm4, xmm7`
			`aesdeclast xmm5, xmm7`
			`aesdeclast xmm6, xmm7`
			`ENDM`


			`gen_aes_ecb_func MACRO enc, rnds`

			`LOCAL loop7`
			`LOCAL loop1`
			`LOCAL bail`

			`push inputLen`

			`mov ctx, [esp + 2*4 + 0*4]`
			`mov output, [esp + 2*4 + 1*4]`
			`mov input, [esp + 2*4 + 4*4]`
			`mov inputLen, [esp + 2*4 + 5*4]`

			`lea ctx, [44+ctx]`

			`loop7:`
			`cmp inputLen, 7*16`
			`jb loop1`

			`movdqu xmm0, [0*16 + input]`
			`movdqu xmm1, [1*16 + input]`
			`movdqu xmm2, [2*16 + input]`
			`movdqu xmm3, [3*16 + input]`
			`movdqu xmm4, [4*16 + input]`
			`movdqu xmm5, [5*16 + input]`
			`movdqu xmm6, [6*16 + input]`

			`movdqu xmm7, [0*16 + ctx]`
			`pxor xmm0, xmm7`
			`pxor xmm1, xmm7`
			`pxor xmm2, xmm7`
			`pxor xmm3, xmm7`
			`pxor xmm4, xmm7`
			`pxor xmm5, xmm7`
			`pxor xmm6, xmm7`

			`IF enc eq 1`
			`rnd textequ <aes_rnd>`
			`lastrnd textequ <aes_last_rnd>`
			`aesinst textequ <aesenc>`
			`aeslastinst textequ <aesenclast>`
			`ELSE`
			`rnd textequ <aes_dec_rnd>`
			`lastrnd textequ <aes_dec_last_rnd>`
			`aesinst textequ <aesdec>`
			`aeslastinst textequ <aesdeclast>`
			`ENDIF`

			`i = 1`
			`WHILE i LT rnds`
			`rnd i`
			`i = i+1`
			`ENDM`
			`lastrnd rnds`

			`movdqu [0*16 + output], xmm0`
			`movdqu [1*16 + output], xmm1`
			`movdqu [2*16 + output], xmm2`
			`movdqu [3*16 + output], xmm3`
			`movdqu [4*16 + output], xmm4`
			`movdqu [5*16 + output], xmm5`
			`movdqu [6*16 + output], xmm6`

			`lea input, [7*16 + input]`
			`lea output, [7*16 + output]`
			`sub inputLen, 7*16`
			`jmp loop7`

			`loop1:`
			`cmp inputLen, 1*16`
			`jb bail`

			`movdqu xmm0, [input]`
			`movdqu xmm7, [0*16 + ctx]`
			`pxor xmm0, xmm7`

			`i = 1`
			`WHILE i LT rnds`
			`movdqu xmm7, [i*16 + ctx]`
			`aesinst xmm0, xmm7`
			`i = i+1`
			`ENDM`
			`movdqu xmm7, [rnds*16 + ctx]`
			`aeslastinst xmm0, xmm7`

			`movdqu [output], xmm0`

			`lea input, [1*16 + input]`
			`lea output, [1*16 + output]`
			`sub inputLen, 1*16`
			`jmp loop1`

			`bail:`
			`xor eax, eax`
			`pop inputLen`
			`ret`

			`ENDM`

			`ALIGN 16`
			`intel_aes_encrypt_ecb_128 PROC`
			`gen_aes_ecb_func 1, 10`
			`intel_aes_encrypt_ecb_128 ENDP`

			`ALIGN 16`
			`intel_aes_encrypt_ecb_192 PROC`
			`gen_aes_ecb_func 1, 12`
			`intel_aes_encrypt_ecb_192 ENDP`

			`ALIGN 16`
			`intel_aes_encrypt_ecb_256 PROC`
			`gen_aes_ecb_func 1, 14`
			`intel_aes_encrypt_ecb_256 ENDP`

			`ALIGN 16`
			`intel_aes_decrypt_ecb_128 PROC`
			`gen_aes_ecb_func 0, 10`
			`intel_aes_decrypt_ecb_128 ENDP`

			`ALIGN 16`
			`intel_aes_decrypt_ecb_192 PROC`
			`gen_aes_ecb_func 0, 12`
			`intel_aes_decrypt_ecb_192 ENDP`

			`ALIGN 16`
			`intel_aes_decrypt_ecb_256 PROC`
			`gen_aes_ecb_func 0, 14`
			`intel_aes_decrypt_ecb_256 ENDP`


			`KEY textequ <ecx>`
			`KS textequ <edx>`
			`ITR textequ <eax>`

			`ALIGN 16`
			`intel_aes_encrypt_init_128 PROC`

			`mov KEY, [esp + 1*4 + 0*4]`
			`mov KS, [esp + 1*4 + 1*4]`


			`movdqu xmm1, [KEY]`
			`movdqu [KS], xmm1`
			`movdqa xmm2, xmm1`

			`lea ITR, Lcon1`
			`movdqa xmm0, [ITR]`
			`lea ITR, Lmask`
			`movdqa xmm4, [ITR]`

			`mov ITR, 8`

			`Lenc_128_ks_loop:`
			`lea KS, [16 + KS]`
			`dec ITR`

			`pshufb xmm2, xmm4`
			`aesenclast xmm2, xmm0`
			`pslld xmm0, 1`
			`movdqa xmm3, xmm1`
			`pslldq xmm3, 4`
			`pxor xmm1, xmm3`
			`pslldq xmm3, 4`
			`pxor xmm1, xmm3`
			`pslldq xmm3, 4`
			`pxor xmm1, xmm3`
			`pxor xmm1, xmm2`
			`movdqu [KS], xmm1`
			`movdqa xmm2, xmm1`

			`jne Lenc_128_ks_loop`

			`lea ITR, Lcon2`
			`movdqa xmm0, [ITR]`

			`pshufb xmm2, xmm4`
			`aesenclast xmm2, xmm0`
			`pslld xmm0, 1`
			`movdqa xmm3, xmm1`
			`pslldq xmm3, 4`
			`pxor xmm1, xmm3`
			`pslldq xmm3, 4`
			`pxor xmm1, xmm3`
			`pslldq xmm3, 4`
			`pxor xmm1, xmm3`
			`pxor xmm1, xmm2`
			`movdqu [16 + KS], xmm1`
			`movdqa xmm2, xmm1`

			`pshufb xmm2, xmm4`
			`aesenclast xmm2, xmm0`
			`movdqa xmm3, xmm1`
			`pslldq xmm3, 4`
			`pxor xmm1, xmm3`
			`pslldq xmm3, 4`
			`pxor xmm1, xmm3`
			`pslldq xmm3, 4`
			`pxor xmm1, xmm3`
			`pxor xmm1, xmm2`
			`movdqu [32 + KS], xmm1`
			`movdqa xmm2, xmm1`

			`ret`
			`intel_aes_encrypt_init_128 ENDP`


			`ALIGN 16`
			`intel_aes_decrypt_init_128 PROC`

			`mov KEY, [esp + 1*4 + 0*4]`
			`mov KS, [esp + 1*4 + 1*4]`

			`push KS`
			`push KEY`

			`call intel_aes_encrypt_init_128`

			`pop KEY`
			`pop KS`

			`movdqu xmm0, [0*16 + KS]`
			`movdqu xmm1, [10*16 + KS]`
			`movdqu [10*16 + KS], xmm0`
			`movdqu [0*16 + KS], xmm1`

			`i = 1`
			`WHILE i LT 5`
			`movdqu xmm0, [i*16 + KS]`
			`movdqu xmm1, [(10-i)*16 + KS]`

			`aesimc xmm0, xmm0`
			`aesimc xmm1, xmm1`

			`movdqu [(10-i)*16 + KS], xmm0`
			`movdqu [i*16 + KS], xmm1`

			`i = i+1`
			`ENDM`

			`movdqu xmm0, [5*16 + KS]`
			`aesimc xmm0, xmm0`
			`movdqu [5*16 + KS], xmm0`
			`ret`
			`intel_aes_decrypt_init_128 ENDP`


			`ALIGN 16`
			`intel_aes_encrypt_init_192 PROC`

			`mov KEY, [esp + 1*4 + 0*4]`
			`mov KS, [esp + 1*4 + 1*4]`

			`pxor xmm3, xmm3`
			`movdqu xmm1, [KEY]`
			`pinsrd xmm3, DWORD PTR [16 + KEY], 0`
			`pinsrd xmm3, DWORD PTR [20 + KEY], 1`

			`movdqu [KS], xmm1`
			`movdqa xmm5, xmm3`

			`lea ITR, Lcon1`
			`movdqu xmm0, [ITR]`
			`lea ITR, Lmask192`
			`movdqu xmm4, [ITR]`

			`mov ITR, 4`

			`Lenc_192_ks_loop:`
			`movdqa xmm2, xmm3`
			`pshufb xmm2, xmm4`
			`aesenclast xmm2, xmm0`
			`pslld xmm0, 1`

			`movdqa xmm6, xmm1`
			`movdqa xmm7, xmm3`
			`pslldq xmm6, 4`
			`pslldq xmm7, 4`
			`pxor xmm1, xmm6`
			`pxor xmm3, xmm7`
			`pslldq xmm6, 4`
			`pxor xmm1, xmm6`
			`pslldq xmm6, 4`
			`pxor xmm1, xmm6`
			`pxor xmm1, xmm2`
			`pshufd xmm2, xmm1, 0ffh`
			`pxor xmm3, xmm2`

			`movdqa xmm6, xmm1`
			`shufpd xmm5, xmm1, 00h`
			`shufpd xmm6, xmm3, 01h`

			`movdqu [16 + KS], xmm5`
			`movdqu [32 + KS], xmm6`

			`movdqa xmm2, xmm3`
			`pshufb xmm2, xmm4`
			`aesenclast xmm2, xmm0`
			`pslld xmm0, 1`

			`movdqa xmm6, xmm1`
			`movdqa xmm7, xmm3`
			`pslldq xmm6, 4`
			`pslldq xmm7, 4`
			`pxor xmm1, xmm6`
			`pxor xmm3, xmm7`
			`pslldq xmm6, 4`
			`pxor xmm1, xmm6`
			`pslldq xmm6, 4`
			`pxor xmm1, xmm6`
			`pxor xmm1, xmm2`
			`pshufd xmm2, xmm1, 0ffh`
			`pxor xmm3, xmm2`

			`movdqu [48 + KS], xmm1`
			`movdqa xmm5, xmm3`

			`lea KS, [48 + KS]`

			`dec ITR`
			`jnz Lenc_192_ks_loop`

			`movdqu [16 + KS], xmm5`
			`ret`
			`intel_aes_encrypt_init_192 ENDP`

			`ALIGN 16`
			`intel_aes_decrypt_init_192 PROC`
			`mov KEY, [esp + 1*4 + 0*4]`
			`mov KS, [esp + 1*4 + 1*4]`

			`push KS`
			`push KEY`

			`call intel_aes_encrypt_init_192`

			`pop KEY`
			`pop KS`

			`movdqu xmm0, [0*16 + KS]`
			`movdqu xmm1, [12*16 + KS]`
			`movdqu [12*16 + KS], xmm0`
			`movdqu [0*16 + KS], xmm1`

			`i = 1`
			`WHILE i LT 6`
			`movdqu xmm0, [i*16 + KS]`
			`movdqu xmm1, [(12-i)*16 + KS]`

			`aesimc xmm0, xmm0`
			`aesimc xmm1, xmm1`

			`movdqu [(12-i)*16 + KS], xmm0`
			`movdqu [i*16 + KS], xmm1`

			`i = i+1`
			`ENDM`

			`movdqu xmm0, [6*16 + KS]`
			`aesimc xmm0, xmm0`
			`movdqu [6*16 + KS], xmm0`
			`ret`
			`intel_aes_decrypt_init_192 ENDP`

			`ALIGN 16`
			`intel_aes_encrypt_init_256 PROC`

			`mov KEY, [esp + 1*4 + 0*4]`
			`mov KS, [esp + 1*4 + 1*4]`
			`movdqu xmm1, [16*0 + KEY]`
			`movdqu xmm3, [16*1 + KEY]`

			`movdqu [16*0 + KS], xmm1`
			`movdqu [16*1 + KS], xmm3`

			`lea ITR, Lcon1`
			`movdqu xmm0, [ITR]`
			`lea ITR, Lmask256`
			`movdqu xmm5, [ITR]`

			`pxor xmm6, xmm6`

			`mov ITR, 6`

			`Lenc_256_ks_loop:`

			`movdqa xmm2, xmm3`
			`pshufb xmm2, xmm5`
			`aesenclast xmm2, xmm0`
			`pslld xmm0, 1`
			`movdqa xmm4, xmm1`
			`pslldq xmm4, 4`
			`pxor xmm1, xmm4`
			`pslldq xmm4, 4`
			`pxor xmm1, xmm4`
			`pslldq xmm4, 4`
			`pxor xmm1, xmm4`
			`pxor xmm1, xmm2`
			`movdqu [16*2 + KS], xmm1`

			`pshufd xmm2, xmm1, 0ffh`
			`aesenclast xmm2, xmm6`
			`movdqa xmm4, xmm3`
			`pslldq xmm4, 4`
			`pxor xmm3, xmm4`
			`pslldq xmm4, 4`
			`pxor xmm3, xmm4`
			`pslldq xmm4, 4`
			`pxor xmm3, xmm4`
			`pxor xmm3, xmm2`
			`movdqu [16*3 + KS], xmm3`

			`lea KS, [32 + KS]`
			`dec ITR`
			`jnz Lenc_256_ks_loop`

			`movdqa xmm2, xmm3`
			`pshufb xmm2, xmm5`
			`aesenclast xmm2, xmm0`
			`movdqa xmm4, xmm1`
			`pslldq xmm4, 4`
			`pxor xmm1, xmm4`
			`pslldq xmm4, 4`
			`pxor xmm1, xmm4`
			`pslldq xmm4, 4`
			`pxor xmm1, xmm4`
			`pxor xmm1, xmm2`
			`movdqu [16*2 + KS], xmm1`

			`ret`
			`intel_aes_encrypt_init_256 ENDP`

			`ALIGN 16`
			`intel_aes_decrypt_init_256 PROC`
			`mov KEY, [esp + 1*4 + 0*4]`
			`mov KS, [esp + 1*4 + 1*4]`

			`push KS`
			`push KEY`

			`call intel_aes_encrypt_init_256`

			`pop KEY`
			`pop KS`

			`movdqu xmm0, [0*16 + KS]`
			`movdqu xmm1, [14*16 + KS]`
			`movdqu [14*16 + KS], xmm0`
			`movdqu [0*16 + KS], xmm1`

			`i = 1`
			`WHILE i LT 7`
			`movdqu xmm0, [i*16 + KS]`
			`movdqu xmm1, [(14-i)*16 + KS]`

			`aesimc xmm0, xmm0`
			`aesimc xmm1, xmm1`

			`movdqu [(14-i)*16 + KS], xmm0`
			`movdqu [i*16 + KS], xmm1`

			`i = i+1`
			`ENDM`

			`movdqu xmm0, [7*16 + KS]`
			`aesimc xmm0, xmm0`
			`movdqu [7*16 + KS], xmm0`
			`ret`
			`intel_aes_decrypt_init_256 ENDP`



			`gen_aes_cbc_enc_func MACRO rnds`

			`LOCAL loop1`
			`LOCAL bail`

			`push inputLen`

			`mov ctx, [esp + 2*4 + 0*4]`
			`mov output, [esp + 2*4 + 1*4]`
			`mov input, [esp + 2*4 + 4*4]`
			`mov inputLen, [esp + 2*4 + 5*4]`

			`lea ctx, [44+ctx]`

			`movdqu xmm0, [-32+ctx]`

			`movdqu xmm2, [0*16 + ctx]`
			`movdqu xmm3, [1*16 + ctx]`
			`movdqu xmm4, [2*16 + ctx]`
			`movdqu xmm5, [3*16 + ctx]`
			`movdqu xmm6, [4*16 + ctx]`

			`loop1:`
			`cmp inputLen, 1*16`
			`jb bail`

			`movdqu xmm1, [input]`
			`pxor xmm1, xmm2`
			`pxor xmm0, xmm1`

			`aesenc xmm0, xmm3`
			`aesenc xmm0, xmm4`
			`aesenc xmm0, xmm5`
			`aesenc xmm0, xmm6`

			`i = 5`
			`WHILE i LT rnds`
			`movdqu xmm7, [i*16 + ctx]`
			`aesenc xmm0, xmm7`
			`i = i+1`
			`ENDM`
			`movdqu xmm7, [rnds*16 + ctx]`
			`aesenclast xmm0, xmm7`

			`movdqu [output], xmm0`

			`lea input, [1*16 + input]`
			`lea output, [1*16 + output]`
			`sub inputLen, 1*16`
			`jmp loop1`

			`bail:`
			`movdqu [-32+ctx], xmm0`

			`xor eax, eax`
			`pop inputLen`
			`ret`

			`ENDM`

			;-----------------------------------------------------------------------
			; gen_aes_cbc_dec_func rnds
			; Expands to the body of a CBC-decrypt routine that applies `rnds` AES
			; rounds per block (instantiated in this file with 10, 12 and 14).
			;
			; Stack arguments, numbered from the first one. One register is pushed
			; before they are read, so argument k is at [esp + 2*4 + k*4]:
			;   0 = context pointer, 1 = output, 4 = input, 5 = input length (bytes)
			; Context layout as used here: round keys start at offset 44 and the
			; 16-byte chaining value sits 32 bytes before them.
			;
			; Blocks are handled seven at a time while at least 7*16 bytes remain,
			; then one at a time. Only whole 16-byte blocks are processed. The last
			; ciphertext block consumed becomes the stored chaining value.
			; Returns 0 in eax. Uses eax, ecx, edx, xmm0-xmm7; edi is saved and
			; restored.
			;-----------------------------------------------------------------------
			gen_aes_cbc_dec_func MACRO rnds
			LOCAL   loop7
			LOCAL   loop1
			LOCAL   dec1
			LOCAL   bail

			    push    inputLen

			    mov     ctx,      [esp + 2*4 + 0*4]
			    mov     output,   [esp + 2*4 + 1*4]
			    mov     input,    [esp + 2*4 + 4*4]
			    mov     inputLen, [esp + 2*4 + 5*4]

			    lea     ctx, [44+ctx]               ; ctx -> round key 0

			loop7:
			    cmp     inputLen, 7*16
			    jb      dec1

			    movdqu  xmm0, [0*16 + input]
			    movdqu  xmm1, [1*16 + input]
			    movdqu  xmm2, [2*16 + input]
			    movdqu  xmm3, [3*16 + input]
			    movdqu  xmm4, [4*16 + input]
			    movdqu  xmm5, [5*16 + input]
			    movdqu  xmm6, [6*16 + input]

			    ; Initial AddRoundKey with round key 0 on all seven blocks.
			    movdqu  xmm7, [0*16 + ctx]
			    pxor    xmm0, xmm7
			    pxor    xmm1, xmm7
			    pxor    xmm2, xmm7
			    pxor    xmm3, xmm7
			    pxor    xmm4, xmm7
			    pxor    xmm5, xmm7
			    pxor    xmm6, xmm7

			    i = 1
			    WHILE i LT rnds
			        aes_dec_rnd i
			        i = i+1
			    ENDM
			    aes_dec_last_rnd rnds

			    ; CBC unchaining: block 0 uses the stored chaining value, block k
			    ; uses ciphertext block k-1 re-read from input. Every re-read
			    ; happens before the first store to output below.
			    movdqu  xmm7, [-32 + ctx]
			    pxor    xmm0, xmm7
			    movdqu  xmm7, [0*16 + input]
			    pxor    xmm1, xmm7
			    movdqu  xmm7, [1*16 + input]
			    pxor    xmm2, xmm7
			    movdqu  xmm7, [2*16 + input]
			    pxor    xmm3, xmm7
			    movdqu  xmm7, [3*16 + input]
			    pxor    xmm4, xmm7
			    movdqu  xmm7, [4*16 + input]
			    pxor    xmm5, xmm7
			    movdqu  xmm7, [5*16 + input]
			    pxor    xmm6, xmm7
			    movdqu  xmm7, [6*16 + input]        ; next chaining value

			    movdqu  [0*16 + output], xmm0
			    movdqu  [1*16 + output], xmm1
			    movdqu  [2*16 + output], xmm2
			    movdqu  [3*16 + output], xmm3
			    movdqu  [4*16 + output], xmm4
			    movdqu  [5*16 + output], xmm5
			    movdqu  [6*16 + output], xmm6
			    movdqu  [-32 + ctx], xmm7

			    lea     input,  [7*16 + input]
			    lea     output, [7*16 + output]
			    sub     inputLen, 7*16
			    jmp     loop7

			dec1:
			    movdqu  xmm3, [-32 + ctx]           ; xmm3 = chaining value

			loop1:
			    cmp     inputLen, 1*16
			    jb      bail

			    movdqu  xmm0, [input]
			    movdqa  xmm4, xmm0                  ; keep ciphertext for the next block
			    movdqu  xmm7, [0*16 + ctx]
			    pxor    xmm0, xmm7

			    i = 1
			    WHILE i LT rnds
			        movdqu  xmm7, [i*16 + ctx]
			        aesdec  xmm0, xmm7
			        i = i+1
			    ENDM
			    movdqu  xmm7, [rnds*16 + ctx]
			    aesdeclast xmm0, xmm7
			    pxor    xmm3, xmm0                  ; plaintext = decrypted XOR chaining value

			    movdqu  [output], xmm3
			    movdqa  xmm3, xmm4                  ; this ciphertext chains into the next block

			    lea     input,  [1*16 + input]
			    lea     output, [1*16 + output]
			    sub     inputLen, 1*16
			    jmp     loop1

			bail:
			    movdqu  [-32 + ctx], xmm3           ; save chaining value for next call
			    xor     eax, eax
			    pop     inputLen
			    ret
			ENDM

			; intel_aes_encrypt_cbc_128: CBC encryption entry point whose body is the
			; gen_aes_cbc_enc_func expansion with 10 rounds.
			ALIGN 16
			intel_aes_encrypt_cbc_128 PROC
			    gen_aes_cbc_enc_func 10
			intel_aes_encrypt_cbc_128 ENDP

			; intel_aes_encrypt_cbc_192: CBC encryption entry point whose body is the
			; gen_aes_cbc_enc_func expansion with 12 rounds.
			ALIGN 16
			intel_aes_encrypt_cbc_192 PROC
			    gen_aes_cbc_enc_func 12
			intel_aes_encrypt_cbc_192 ENDP

			; intel_aes_encrypt_cbc_256: CBC encryption entry point whose body is the
			; gen_aes_cbc_enc_func expansion with 14 rounds.
			ALIGN 16
			intel_aes_encrypt_cbc_256 PROC
			    gen_aes_cbc_enc_func 14
			intel_aes_encrypt_cbc_256 ENDP

			; intel_aes_decrypt_cbc_128: CBC decryption entry point whose body is the
			; gen_aes_cbc_dec_func expansion with 10 rounds.
			ALIGN 16
			intel_aes_decrypt_cbc_128 PROC
			    gen_aes_cbc_dec_func 10
			intel_aes_decrypt_cbc_128 ENDP

			; intel_aes_decrypt_cbc_192: CBC decryption entry point whose body is the
			; gen_aes_cbc_dec_func expansion with 12 rounds.
			ALIGN 16
			intel_aes_decrypt_cbc_192 PROC
			    gen_aes_cbc_dec_func 12
			intel_aes_decrypt_cbc_192 ENDP

			; intel_aes_decrypt_cbc_256: CBC decryption entry point whose body is the
			; gen_aes_cbc_dec_func expansion with 14 rounds.
			ALIGN 16
			intel_aes_decrypt_cbc_256 PROC
			    gen_aes_cbc_dec_func 14
			intel_aes_decrypt_cbc_256 ENDP



			; Register aliases for gen_aes_ctr_func.
			; ctrCtx first holds the counter-context pointer and is then reused as the
			; running 32-bit counter value; CTR is scratch for the byte-swapped counter.
			ctrCtx  textequ <esi>
			CTR     textequ <ebx>

			;-----------------------------------------------------------------------
			; gen_aes_ctr_func rnds
			; Expands to the body of a CTR-mode routine that applies `rnds` AES
			; rounds per block (instantiated in this file with 10, 12 and 14).
			;
			; Stack arguments, numbered from the first one. Four registers are
			; pushed before they are read, so argument k is at [esp + 4*5 + k*4]:
			;   0 = counter-context pointer, 1 = output, 4 = input,
			;   5 = input length (bytes)
			; Counter context as used here: dword at offset 4 is the AES context
			; pointer, bytes 8..23 are the 16-byte counter block. Round keys start
			; at offset 44 of the AES context.
			;
			; Seven 16-byte scratch slots are kept on a 16-byte-aligned stack area.
			; Each slot holds (counter block XOR round key 0) for one upcoming
			; block, so the first AddRoundKey is already folded in.
			;
			; Only the last 32-bit word of the counter block is incremented (it is
			; treated as big-endian); no carry goes into the other 96 bits.
			; NOTE(review): callers presumably guard against wrap of that word -
			; confirm.
			;
			; Blocks are handled seven at a time while at least 7*16 bytes remain,
			; then one at a time. Only whole 16-byte blocks are processed. The next
			; unused counter block is written back to the counter context on exit.
			; Returns 0 in eax. Uses eax, ecx, edx, xmm0-xmm7; edi, esi, ebx and
			; ebp are saved and restored.
			;-----------------------------------------------------------------------
			gen_aes_ctr_func MACRO rnds
			LOCAL   loop7
			LOCAL   loop1
			LOCAL   bail

			    push    inputLen
			    push    ctrCtx
			    push    CTR
			    push    ebp

			    mov     ctrCtx,   [esp + 4*5 + 0*4]
			    mov     output,   [esp + 4*5 + 1*4]
			    mov     input,    [esp + 4*5 + 4*4]
			    mov     inputLen, [esp + 4*5 + 5*4]

			    mov     ctx, [4+ctrCtx]
			    lea     ctx, [44+ctx]               ; ctx -> round key 0

			    ; ebp remembers the pre-allocation esp so the arguments and saved
			    ; registers stay reachable after esp is realigned.
			    mov     ebp, esp
			    sub     esp, 7*16
			    and     esp, -16

			    movdqu  xmm0, [8+ctrCtx]            ; xmm0 = counter block
			    mov     ctrCtx, [ctrCtx + 8 + 3*4]  ; from here on ctrCtx = counter word
			    bswap   ctrCtx                      ; to native order for inc
			    movdqu  xmm1, [ctx + 0*16]

			    pxor    xmm0, xmm1                  ; counter block XOR round key 0

			    movdqa  [esp + 0*16], xmm0
			    movdqa  [esp + 1*16], xmm0
			    movdqa  [esp + 2*16], xmm0
			    movdqa  [esp + 3*16], xmm0
			    movdqa  [esp + 4*16], xmm0
			    movdqa  [esp + 5*16], xmm0
			    movdqa  [esp + 6*16], xmm0

			    ; Slots 1..6 differ from slot 0 only in their last dword: patch in
			    ; (byte-swapped incremented counter) XOR (last dword of round key 0).
			    i = 1
			    WHILE i LE 6
			        inc     ctrCtx
			        mov     CTR, ctrCtx
			        bswap   CTR
			        xor     CTR, [ctx + 3*4]
			        mov     [esp + i*16 + 3*4], CTR
			        i = i+1
			    ENDM

			loop7:
			    cmp     inputLen, 7*16
			    jb      loop1

			    movdqu  xmm0, [0*16 + esp]
			    movdqu  xmm1, [1*16 + esp]
			    movdqu  xmm2, [2*16 + esp]
			    movdqu  xmm3, [3*16 + esp]
			    movdqu  xmm4, [4*16 + esp]
			    movdqu  xmm5, [5*16 + esp]
			    movdqu  xmm6, [6*16 + esp]

			    ; Rounds 1..7 are interleaved with refilling the seven slots with
			    ; the next seven counter values.
			    i = 1
			    WHILE i LE 7
			        aes_rnd i

			        inc     ctrCtx
			        mov     CTR, ctrCtx
			        bswap   CTR
			        xor     CTR, [ctx + 3*4]
			        mov     [esp + (i-1)*16 + 3*4], CTR

			        i = i+1
			    ENDM
			    WHILE i LT rnds
			        aes_rnd i
			        i = i+1
			    ENDM
			    aes_last_rnd rnds

			    ; XOR the keystream in xmm0..xmm6 with the input blocks.
			    movdqu  xmm7, [0*16 + input]
			    pxor    xmm0, xmm7
			    movdqu  xmm7, [1*16 + input]
			    pxor    xmm1, xmm7
			    movdqu  xmm7, [2*16 + input]
			    pxor    xmm2, xmm7
			    movdqu  xmm7, [3*16 + input]
			    pxor    xmm3, xmm7
			    movdqu  xmm7, [4*16 + input]
			    pxor    xmm4, xmm7
			    movdqu  xmm7, [5*16 + input]
			    pxor    xmm5, xmm7
			    movdqu  xmm7, [6*16 + input]
			    pxor    xmm6, xmm7

			    movdqu  [0*16 + output], xmm0
			    movdqu  [1*16 + output], xmm1
			    movdqu  [2*16 + output], xmm2
			    movdqu  [3*16 + output], xmm3
			    movdqu  [4*16 + output], xmm4
			    movdqu  [5*16 + output], xmm5
			    movdqu  [6*16 + output], xmm6

			    lea     input,  [7*16 + input]
			    lea     output, [7*16 + output]
			    sub     inputLen, 7*16
			    jmp     loop7

			loop1:
			    cmp     inputLen, 1*16
			    jb      bail

			    ; Take the next prepared slot and drop it by advancing esp. Fewer
			    ; than seven blocks remain here, so [esp] stays inside the area.
			    movdqu  xmm0, [esp]
			    add     esp, 16

			    i = 1
			    WHILE i LT rnds
			        movdqu  xmm7, [i*16 + ctx]
			        aesenc  xmm0, xmm7
			        i = i+1
			    ENDM
			    movdqu  xmm7, [rnds*16 + ctx]
			    aesenclast xmm0, xmm7

			    movdqu  xmm7, [input]
			    pxor    xmm0, xmm7
			    movdqu  [output], xmm0

			    lea     input,  [1*16 + input]
			    lea     output, [1*16 + output]
			    sub     inputLen, 1*16
			    jmp     loop1

			bail:
			    ; [esp] is the first unused slot; undo the round-key-0 XOR to get
			    ; the plain counter block and store it back in the counter context.
			    mov     ctrCtx, [ebp + 4*5 + 0*4]
			    movdqu  xmm0, [esp]
			    movdqu  xmm1, [ctx + 0*16]
			    pxor    xmm0, xmm1
			    movdqu  [8+ctrCtx], xmm0

			    xor     eax, eax
			    mov     esp, ebp
			    pop     ebp
			    pop     CTR
			    pop     ctrCtx
			    pop     inputLen
			    ret
			ENDM


			; intel_aes_encrypt_ctr_128: CTR-mode entry point whose body is the
			; gen_aes_ctr_func expansion with 10 rounds.
			ALIGN 16
			intel_aes_encrypt_ctr_128 PROC
			    gen_aes_ctr_func 10
			intel_aes_encrypt_ctr_128 ENDP

			; intel_aes_encrypt_ctr_192: CTR-mode entry point whose body is the
			; gen_aes_ctr_func expansion with 12 rounds.
			ALIGN 16
			intel_aes_encrypt_ctr_192 PROC
			    gen_aes_ctr_func 12
			intel_aes_encrypt_ctr_192 ENDP

			; intel_aes_encrypt_ctr_256: CTR-mode entry point whose body is the
			; gen_aes_ctr_func expansion with 14 rounds.
			ALIGN 16
			intel_aes_encrypt_ctr_256 PROC
			    gen_aes_ctr_func 14
			intel_aes_encrypt_ctr_256 ENDP


			; End of the assembly module.
			END