RetroZilla/security/nss/lib/freebl/mpi/mpi_sse2.s
2018-05-19 22:01:21 +08:00

295 lines
7.8 KiB
ArmAsm

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Platform glue for the exported symbols.
# When DARWIN is defined, every exported routine name is remapped to the
# same name with one leading underscore, and TYPE_FUNCTION expands to
# nothing. On every other target the names are used as written and
# TYPE_FUNCTION(x) emits a .type directive tagging x as a function.
#ifdef DARWIN
#define s_mpv_mul_d _s_mpv_mul_d
#define s_mpv_mul_d_add _s_mpv_mul_d_add
#define s_mpv_mul_d_add_prop _s_mpv_mul_d_add_prop
#define s_mpv_sqr_add_prop _s_mpv_sqr_add_prop
#define s_mpv_div_2dx1d _s_mpv_div_2dx1d
#define TYPE_FUNCTION(x)
#else
#define TYPE_FUNCTION(x) .type x, @function
#endif
# Everything that follows is code.
.text
# s_mpv_mul_d(a, a_len, b, c)
# Multiplies each of the a_len 32-bit words at a by the single word b.
# The low 32 bits of every step go to successive words of c, and the
# carry that is left after the last step is stored in the word after
# them. With a_len == 0 only that carry word (zero) is written.
#
# Stack frame once the prologue has run:
#   ebp - 8:  saved esi
#   ebp - 4:  saved edi
#   ebp + 0:  saved ebp
#   ebp + 4:  return address
#   ebp + 8:  a argument
#   ebp + 12: a_len argument
#   ebp + 16: b argument
#   ebp + 20: c argument
# Register use:
#   ecx: words still to process
#   esi: read cursor in a
#   edi: write cursor in c
#   mm0: current product
#   mm1: b
#   mm2: running carry
.globl s_mpv_mul_d
.private_extern s_mpv_mul_d
TYPE_FUNCTION(s_mpv_mul_d)
s_mpv_mul_d:
        push    %ebp
        mov     %esp, %ebp
        push    %edi
        push    %esi
        mov     20(%ebp), %edi          # edi = c
        movd    16(%ebp), %mm1          # mm1 = b
        mov     12(%ebp), %ecx          # ecx = a_len
        pxor    %mm2, %mm2              # carry = 0
        test    %ecx, %ecx
        jz      2f                      # a_len == 0: only the carry word is written
        mov     8(%ebp), %esi           # esi = a
        cld
1:
        movd    0(%esi), %mm0           # mm0 = *a
        pmuludq %mm1, %mm0              # mm0 = b * *a, full 64-bit product
        add     $4, %esi                # a++
        paddq   %mm0, %mm2              # product + incoming carry
        movd    %mm2, 0(%edi)           # *c = low 32 bits
        psrlq   $32, %mm2               # carry = high 32 bits
        add     $4, %edi                # c++
        sub     $1, %ecx                # --a_len
        jnz     1b                      # loop while words remain
2:
        movd    %mm2, 0(%edi)           # *c = final carry
        emms                            # clear MMX state before returning
        pop     %esi
        pop     %edi
        pop     %ebp                    # esp == ebp here, so this undoes the frame
        ret
        nop
# s_mpv_mul_d_add(a, a_len, b, c)
# For each of the a_len 32-bit words at a, adds b * word plus the running
# carry into the matching word of c. After the loop the remaining carry
# is stored (not added) in the word of c that follows. With a_len == 0
# only that carry word (zero) is written.
#
# Stack frame once the prologue has run:
#   ebp - 8:  saved esi
#   ebp - 4:  saved edi
#   ebp + 0:  saved ebp
#   ebp + 4:  return address
#   ebp + 8:  a argument
#   ebp + 12: a_len argument
#   ebp + 16: b argument
#   ebp + 20: c argument
# Register use:
#   ecx: words still to process
#   esi: read cursor in a
#   edi: read/write cursor in c
#   mm0: current product, then the current word of c
#   mm1: b
#   mm2: running carry
.globl s_mpv_mul_d_add
.private_extern s_mpv_mul_d_add
TYPE_FUNCTION(s_mpv_mul_d_add)
s_mpv_mul_d_add:
        push    %ebp
        mov     %esp, %ebp
        push    %edi
        push    %esi
        mov     20(%ebp), %edi          # edi = c
        movd    16(%ebp), %mm1          # mm1 = b
        mov     12(%ebp), %ecx          # ecx = a_len
        pxor    %mm2, %mm2              # carry = 0
        test    %ecx, %ecx
        jz      2f                      # a_len == 0: only the carry word is written
        mov     8(%ebp), %esi           # esi = a
        cld
1:
        movd    0(%esi), %mm0           # mm0 = *a
        pmuludq %mm1, %mm0              # mm0 = b * *a, full 64-bit product
        add     $4, %esi                # a++
        paddq   %mm0, %mm2              # product + incoming carry
        movd    0(%edi), %mm0           # mm0 = *c (read before it is overwritten)
        paddq   %mm0, %mm2              # ... + current word of c
        movd    %mm2, 0(%edi)           # *c = low 32 bits
        psrlq   $32, %mm2               # carry = high 32 bits
        add     $4, %edi                # c++
        sub     $1, %ecx                # --a_len
        jnz     1b                      # loop while words remain
2:
        movd    %mm2, 0(%edi)           # *c = final carry
        emms                            # clear MMX state before returning
        pop     %esi
        pop     %edi
        pop     %ebp                    # esp == ebp here, so this undoes the frame
        ret
        nop
# s_mpv_mul_d_add_prop(a, a_len, b, c)
# Adds b * a[i] plus the running carry into c[i] for each of the a_len
# words, then adds the leftover carry into the following word of c and
# keeps rippling a carry of one into later words for as long as each
# addition overflows. The ripple loop has no length bound of its own.
#
# ebp - 12: saved ebx
# ebp - 8: saved esi
# ebp - 4: saved edi
# ebp + 0: saved ebp
# ebp + 4: return address
# ebp + 8: a argument
# ebp + 12: a_len argument
# ebp + 16: b argument
# ebp + 20: c argument
# registers:
# eax: word of c being updated while the carry is propagated
# ebx: carry left over from the multiply loop
# ecx: a_len
# esi: a ptr
# edi: c ptr
# mm0: product, mm1: b, mm2: running carry, mm3: current word of c
.globl s_mpv_mul_d_add_prop
.private_extern s_mpv_mul_d_add_prop
TYPE_FUNCTION(s_mpv_mul_d_add_prop)
s_mpv_mul_d_add_prop:
push %ebp
mov %esp, %ebp
push %edi
push %esi
push %ebx
psubq %mm2, %mm2 # carry = 0
mov 12(%ebp), %ecx # ecx = a_len
movd 16(%ebp), %mm1 # mm1 = b
mov 20(%ebp), %edi # edi = c
cmp $0, %ecx
je 2f # jmp if a_len == 0
mov 8(%ebp), %esi # esi = a
cld # stosl below must step edi upward
1:
movd 0(%esi), %mm0 # mm0 = *a
movd 0(%edi), %mm3 # mm3 = *c, fetched before it is overwritten
add $4, %esi # a++
pmuludq %mm1, %mm0 # mm0 = b * *a, full 64-bit product
paddq %mm0, %mm2 # product + incoming carry
paddq %mm3, %mm2 # ... + current word of c
movd %mm2, 0(%edi) # store the low 32 bits of the result
add $4, %edi # c++
psrlq $32, %mm2 # carry = high 32 bits
dec %ecx # --a_len
jnz 1b # jmp if a_len != 0
2:
movd %mm2, %ebx # ebx = carry left over from the loop
cmp $0, %ebx # is carry zero?
jz 4f # yes: nothing to propagate
mov 0(%edi), %eax # eax = *c
add %ebx, %eax # eax += carry, CF set on overflow
stosl # *c++ = eax; the following jnc still sees CF from the add
jnc 4f # finished unless that addition overflowed
3:
mov 0(%edi), %eax # add in current word from *c
adc $0, %eax # eax += CF (always 1 when this loop is entered)
stosl # [es:edi] = eax; edi += 4;
jc 3b # keep rippling while the addition carries out
4:
emms # clear MMX state before returning
pop %ebx
pop %esi
pop %edi
leave
ret
nop
# s_mpv_sqr_add_prop(pa, a_len, ps)
# For each of the a_len words at pa, squares the word and adds the 64-bit
# square, together with the running carry, into two consecutive words of
# ps; ps advances by two words per input word. The leftover carry is then
# added into the next word of ps and a carry of one keeps rippling into
# later words for as long as each addition overflows. The ripple loop has
# no length bound of its own.
#
# ebp - 12: saved ebx
# ebp - 8: saved esi
# ebp - 4: saved edi
# ebp + 0: saved ebp
# ebp + 4: return address
# ebp + 8: pa argument
# ebp + 12: a_len argument
# ebp + 16: ps argument
# registers:
# eax: word of ps being updated while the carry is propagated
# ebx: carry left over from the squaring loop
# ecx: a_len
# esi: pa ptr
# edi: ps ptr
# mm0: square, mm2: running carry, mm3: current word of ps
.globl s_mpv_sqr_add_prop
.private_extern s_mpv_sqr_add_prop
TYPE_FUNCTION(s_mpv_sqr_add_prop)
s_mpv_sqr_add_prop:
push %ebp
mov %esp, %ebp
push %edi
push %esi
push %ebx
psubq %mm2, %mm2 # carry = 0
mov 12(%ebp), %ecx # ecx = a_len
mov 16(%ebp), %edi # edi = ps
cmp $0, %ecx
je 2f # jmp if a_len == 0
mov 8(%ebp), %esi # esi = pa
cld # stosl below must step edi upward
1:
movd 0(%esi), %mm0 # mm0 = *pa
movd 0(%edi), %mm3 # mm3 = ps[0], the low target word
add $4, %esi # pa++
pmuludq %mm0, %mm0 # mm0 = sqr(*pa), full 64-bit result
paddq %mm0, %mm2 # square + incoming carry
paddq %mm3, %mm2 # add the low word
movd 4(%edi), %mm3 # mm3 = ps[1], the high target word
movd %mm2, 0(%edi) # ps[0] = low 32 bits of the sum
psrlq $32, %mm2 # keep the upper half of the sum
paddq %mm3, %mm2 # add the high word
movd %mm2, 4(%edi) # ps[1] = low 32 bits of that sum
psrlq $32, %mm2 # save the carry.
add $8, %edi # ps += two words
dec %ecx # --a_len
jnz 1b # jmp if a_len != 0
2:
movd %mm2, %ebx # ebx = carry left over from the loop
cmp $0, %ebx # is carry zero?
jz 4f # yes: nothing to propagate
mov 0(%edi), %eax # eax = *ps
add %ebx, %eax # eax += carry, CF set on overflow
stosl # *ps++ = eax; the following jnc still sees CF from the add
jnc 4f # finished unless that addition overflowed
3:
mov 0(%edi), %eax # add in current word from *ps
adc $0, %eax # eax += CF (always 1 when this loop is entered)
stosl # [es:edi] = eax; edi += 4;
jc 3b # keep rippling while the addition carries out
4:
emms # clear MMX state before returning
pop %ebx
pop %esi
pop %edi
leave
ret
nop
#
# Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
# so its high bit is 1. This code is from NSPR.
#
# mp_err s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
# mp_digit *qp, mp_digit *rp)
#
# Stores the quotient through qp and the remainder through rp, then
# returns zero. The hardware divide faults when the quotient does not
# fit in 32 bits, so the caller has to pass Nhi < divisor.
#
# Nothing is pushed and no frame is built, so throughout the routine:
# esp + 0: return address
# esp + 4: Nhi argument
# esp + 8: Nlo argument
# esp + 12: divisor argument
# esp + 16: qp argument
# esp + 20: rp argument
# registers:
# eax: Nlo, then the quotient, finally the zero return value
# ecx: scratch (divisor, then qp, then rp); caller-saved, so it needs
#      no save and restore, unlike a callee-saved register
# edx: Nhi, then the remainder
#
.globl s_mpv_div_2dx1d
.private_extern s_mpv_div_2dx1d
TYPE_FUNCTION(s_mpv_div_2dx1d)
s_mpv_div_2dx1d:
        mov     4(%esp), %edx           # edx = Nhi (high half of the dividend)
        mov     8(%esp), %eax           # eax = Nlo (low half of the dividend)
        mov     12(%esp), %ecx          # ecx = divisor
        div     %ecx                    # eax = quotient, edx = remainder
        mov     16(%esp), %ecx          # ecx = qp
        mov     %eax, 0(%ecx)           # *qp = quotient
        mov     20(%esp), %ecx          # ecx = rp
        mov     %edx, 0(%ecx)           # *rp = remainder
        xor     %eax, %eax              # return zero
        ret
        nop
#ifndef DARWIN
# Emit an empty .note.GNU-stack section, which indicates that this object
# has no need for an executable stack, then use .previous to switch back
# to the section that was active before it.
.section .note.GNU-stack, "", @progbits
.previous
#endif