mirror of
https://github.com/rn10950/RetroZilla.git
synced 2024-11-14 11:40:13 +01:00
536 lines
12 KiB
C
536 lines
12 KiB
C
/*
 *  mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
#include "mpi-priv.h"
|
|
|
|
/*
 * Cached CPU-dispatch state shared by every dispatching routine below:
 *    < 0  not probed yet (initial value); the next routine entered calls
 *         s_mpi_is_sse2() and stores its return value here
 *   == 0  use the plain x86 integer code path
 *    > 0  use the SSE2 code path
 */
static int is_sse = -1;

/* CPU capability probe, defined outside this file. */
extern unsigned long s_mpi_is_sse2();
|
|
|
|
/*
|
|
* ebp - 36: caller's esi
|
|
* ebp - 32: caller's edi
|
|
* ebp - 28:
|
|
* ebp - 24:
|
|
* ebp - 20:
|
|
* ebp - 16:
|
|
* ebp - 12:
|
|
* ebp - 8:
|
|
* ebp - 4:
|
|
* ebp + 0: caller's ebp
|
|
* ebp + 4: return address
|
|
* ebp + 8: a argument
|
|
* ebp + 12: a_len argument
|
|
* ebp + 16: b argument
|
|
* ebp + 20: c argument
|
|
* registers:
|
|
* eax:
|
|
* ebx: carry
|
|
* ecx: a_len
|
|
* edx:
|
|
* esi: a ptr
|
|
* edi: c ptr
|
|
*/
|
|
__declspec(naked) void
|
|
s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
|
|
{
|
|
__asm {
|
|
mov eax, is_sse
|
|
cmp eax, 0
|
|
je s_mpv_mul_d_x86
|
|
jg s_mpv_mul_d_sse2
|
|
call s_mpi_is_sse2
|
|
mov is_sse, eax
|
|
cmp eax, 0
|
|
jg s_mpv_mul_d_sse2
|
|
s_mpv_mul_d_x86:
|
|
push ebp
|
|
mov ebp,esp
|
|
sub esp,28
|
|
push edi
|
|
push esi
|
|
push ebx
|
|
mov ebx,0 ; carry = 0
|
|
mov ecx,[ebp+12] ; ecx = a_len
|
|
mov edi,[ebp+20]
|
|
cmp ecx,0
|
|
je L_2 ; jmp if a_len == 0
|
|
mov esi,[ebp+8] ; esi = a
|
|
cld
|
|
L_1:
|
|
lodsd ; eax = [ds:esi]; esi += 4
|
|
mov edx,[ebp+16] ; edx = b
|
|
mul edx ; edx:eax = Phi:Plo = a_i * b
|
|
|
|
add eax,ebx ; add carry (ebx) to edx:eax
|
|
adc edx,0
|
|
mov ebx,edx ; high half of product becomes next carry
|
|
|
|
stosd ; [es:edi] = ax; edi += 4;
|
|
dec ecx ; --a_len
|
|
jnz L_1 ; jmp if a_len != 0
|
|
L_2:
|
|
mov [edi],ebx ; *c = carry
|
|
pop ebx
|
|
pop esi
|
|
pop edi
|
|
leave
|
|
ret
|
|
nop
|
|
s_mpv_mul_d_sse2:
|
|
push ebp
|
|
mov ebp, esp
|
|
push edi
|
|
push esi
|
|
psubq mm2, mm2 ; carry = 0
|
|
mov ecx, [ebp+12] ; ecx = a_len
|
|
movd mm1, [ebp+16] ; mm1 = b
|
|
mov edi, [ebp+20]
|
|
cmp ecx, 0
|
|
je L_6 ; jmp if a_len == 0
|
|
mov esi, [ebp+8] ; esi = a
|
|
cld
|
|
L_5:
|
|
movd mm0, [esi] ; mm0 = *a++
|
|
add esi, 4
|
|
pmuludq mm0, mm1 ; mm0 = b * *a++
|
|
paddq mm2, mm0 ; add the carry
|
|
movd [edi], mm2 ; store the 32bit result
|
|
add edi, 4
|
|
psrlq mm2, 32 ; save the carry
|
|
dec ecx ; --a_len
|
|
jnz L_5 ; jmp if a_len != 0
|
|
L_6:
|
|
movd [edi], mm2 ; *c = carry
|
|
emms
|
|
pop esi
|
|
pop edi
|
|
leave
|
|
ret
|
|
nop
|
|
}
|
|
}
|
|
|
|
/*
|
|
* ebp - 36: caller's esi
|
|
* ebp - 32: caller's edi
|
|
* ebp - 28:
|
|
* ebp - 24:
|
|
* ebp - 20:
|
|
* ebp - 16:
|
|
* ebp - 12:
|
|
* ebp - 8:
|
|
* ebp - 4:
|
|
* ebp + 0: caller's ebp
|
|
* ebp + 4: return address
|
|
* ebp + 8: a argument
|
|
* ebp + 12: a_len argument
|
|
* ebp + 16: b argument
|
|
* ebp + 20: c argument
|
|
* registers:
|
|
* eax:
|
|
* ebx: carry
|
|
* ecx: a_len
|
|
* edx:
|
|
* esi: a ptr
|
|
* edi: c ptr
|
|
*/
|
|
__declspec(naked) void
|
|
s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
|
|
{
|
|
__asm {
|
|
mov eax, is_sse
|
|
cmp eax, 0
|
|
je s_mpv_mul_d_add_x86
|
|
jg s_mpv_mul_d_add_sse2
|
|
call s_mpi_is_sse2
|
|
mov is_sse, eax
|
|
cmp eax, 0
|
|
jg s_mpv_mul_d_add_sse2
|
|
s_mpv_mul_d_add_x86:
|
|
push ebp
|
|
mov ebp,esp
|
|
sub esp,28
|
|
push edi
|
|
push esi
|
|
push ebx
|
|
mov ebx,0 ; carry = 0
|
|
mov ecx,[ebp+12] ; ecx = a_len
|
|
mov edi,[ebp+20]
|
|
cmp ecx,0
|
|
je L_11 ; jmp if a_len == 0
|
|
mov esi,[ebp+8] ; esi = a
|
|
cld
|
|
L_10:
|
|
lodsd ; eax = [ds:esi]; esi += 4
|
|
mov edx,[ebp+16] ; edx = b
|
|
mul edx ; edx:eax = Phi:Plo = a_i * b
|
|
|
|
add eax,ebx ; add carry (ebx) to edx:eax
|
|
adc edx,0
|
|
mov ebx,[edi] ; add in current word from *c
|
|
add eax,ebx
|
|
adc edx,0
|
|
mov ebx,edx ; high half of product becomes next carry
|
|
|
|
stosd ; [es:edi] = ax; edi += 4;
|
|
dec ecx ; --a_len
|
|
jnz L_10 ; jmp if a_len != 0
|
|
L_11:
|
|
mov [edi],ebx ; *c = carry
|
|
pop ebx
|
|
pop esi
|
|
pop edi
|
|
leave
|
|
ret
|
|
nop
|
|
s_mpv_mul_d_add_sse2:
|
|
push ebp
|
|
mov ebp, esp
|
|
push edi
|
|
push esi
|
|
psubq mm2, mm2 ; carry = 0
|
|
mov ecx, [ebp+12] ; ecx = a_len
|
|
movd mm1, [ebp+16] ; mm1 = b
|
|
mov edi, [ebp+20]
|
|
cmp ecx, 0
|
|
je L_16 ; jmp if a_len == 0
|
|
mov esi, [ebp+8] ; esi = a
|
|
cld
|
|
L_15:
|
|
movd mm0, [esi] ; mm0 = *a++
|
|
add esi, 4
|
|
pmuludq mm0, mm1 ; mm0 = b * *a++
|
|
paddq mm2, mm0 ; add the carry
|
|
movd mm0, [edi]
|
|
paddq mm2, mm0 ; add the carry
|
|
movd [edi], mm2 ; store the 32bit result
|
|
add edi, 4
|
|
psrlq mm2, 32 ; save the carry
|
|
dec ecx ; --a_len
|
|
jnz L_15 ; jmp if a_len != 0
|
|
L_16:
|
|
movd [edi], mm2 ; *c = carry
|
|
emms
|
|
pop esi
|
|
pop edi
|
|
leave
|
|
ret
|
|
nop
|
|
}
|
|
}
|
|
|
|
/*
|
|
* ebp - 36: caller's esi
|
|
* ebp - 32: caller's edi
|
|
* ebp - 28:
|
|
* ebp - 24:
|
|
* ebp - 20:
|
|
* ebp - 16:
|
|
* ebp - 12:
|
|
* ebp - 8:
|
|
* ebp - 4:
|
|
* ebp + 0: caller's ebp
|
|
* ebp + 4: return address
|
|
* ebp + 8: a argument
|
|
* ebp + 12: a_len argument
|
|
* ebp + 16: b argument
|
|
* ebp + 20: c argument
|
|
* registers:
|
|
* eax:
|
|
* ebx: carry
|
|
* ecx: a_len
|
|
* edx:
|
|
* esi: a ptr
|
|
* edi: c ptr
|
|
*/
|
|
__declspec(naked) void
|
|
s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
|
|
{
|
|
__asm {
|
|
mov eax, is_sse
|
|
cmp eax, 0
|
|
je s_mpv_mul_d_add_prop_x86
|
|
jg s_mpv_mul_d_add_prop_sse2
|
|
call s_mpi_is_sse2
|
|
mov is_sse, eax
|
|
cmp eax, 0
|
|
jg s_mpv_mul_d_add_prop_sse2
|
|
s_mpv_mul_d_add_prop_x86:
|
|
push ebp
|
|
mov ebp,esp
|
|
sub esp,28
|
|
push edi
|
|
push esi
|
|
push ebx
|
|
mov ebx,0 ; carry = 0
|
|
mov ecx,[ebp+12] ; ecx = a_len
|
|
mov edi,[ebp+20]
|
|
cmp ecx,0
|
|
je L_21 ; jmp if a_len == 0
|
|
cld
|
|
mov esi,[ebp+8] ; esi = a
|
|
L_20:
|
|
lodsd ; eax = [ds:esi]; esi += 4
|
|
mov edx,[ebp+16] ; edx = b
|
|
mul edx ; edx:eax = Phi:Plo = a_i * b
|
|
|
|
add eax,ebx ; add carry (ebx) to edx:eax
|
|
adc edx,0
|
|
mov ebx,[edi] ; add in current word from *c
|
|
add eax,ebx
|
|
adc edx,0
|
|
mov ebx,edx ; high half of product becomes next carry
|
|
|
|
stosd ; [es:edi] = ax; edi += 4;
|
|
dec ecx ; --a_len
|
|
jnz L_20 ; jmp if a_len != 0
|
|
L_21:
|
|
cmp ebx,0 ; is carry zero?
|
|
jz L_23
|
|
mov eax,[edi] ; add in current word from *c
|
|
add eax,ebx
|
|
stosd ; [es:edi] = ax; edi += 4;
|
|
jnc L_23
|
|
L_22:
|
|
mov eax,[edi] ; add in current word from *c
|
|
adc eax,0
|
|
stosd ; [es:edi] = ax; edi += 4;
|
|
jc L_22
|
|
L_23:
|
|
pop ebx
|
|
pop esi
|
|
pop edi
|
|
leave
|
|
ret
|
|
nop
|
|
s_mpv_mul_d_add_prop_sse2:
|
|
push ebp
|
|
mov ebp, esp
|
|
push edi
|
|
push esi
|
|
push ebx
|
|
psubq mm2, mm2 ; carry = 0
|
|
mov ecx, [ebp+12] ; ecx = a_len
|
|
movd mm1, [ebp+16] ; mm1 = b
|
|
mov edi, [ebp+20]
|
|
cmp ecx, 0
|
|
je L_26 ; jmp if a_len == 0
|
|
mov esi, [ebp+8] ; esi = a
|
|
cld
|
|
L_25:
|
|
movd mm0, [esi] ; mm0 = *a++
|
|
movd mm3, [edi] ; fetch the sum
|
|
add esi, 4
|
|
pmuludq mm0, mm1 ; mm0 = b * *a++
|
|
paddq mm2, mm0 ; add the carry
|
|
paddq mm2, mm3 ; add *c++
|
|
movd [edi], mm2 ; store the 32bit result
|
|
add edi, 4
|
|
psrlq mm2, 32 ; save the carry
|
|
dec ecx ; --a_len
|
|
jnz L_25 ; jmp if a_len != 0
|
|
L_26:
|
|
movd ebx, mm2
|
|
cmp ebx, 0 ; is carry zero?
|
|
jz L_28
|
|
mov eax, [edi]
|
|
add eax, ebx
|
|
stosd
|
|
jnc L_28
|
|
L_27:
|
|
mov eax, [edi] ; add in current word from *c
|
|
adc eax, 0
|
|
stosd ; [es:edi] = ax; edi += 4;
|
|
jc L_27
|
|
L_28:
|
|
emms
|
|
pop ebx
|
|
pop esi
|
|
pop edi
|
|
leave
|
|
ret
|
|
nop
|
|
}
|
|
}
|
|
|
|
/*
|
|
* ebp - 20: caller's esi
|
|
* ebp - 16: caller's edi
|
|
* ebp - 12:
|
|
* ebp - 8: carry
|
|
* ebp - 4: a_len local
|
|
* ebp + 0: caller's ebp
|
|
* ebp + 4: return address
|
|
* ebp + 8: pa argument
|
|
* ebp + 12: a_len argument
|
|
* ebp + 16: ps argument
|
|
* ebp + 20:
|
|
* registers:
|
|
* eax:
|
|
* ebx: carry
|
|
* ecx: a_len
|
|
* edx:
|
|
* esi: a ptr
|
|
* edi: c ptr
|
|
*/
|
|
__declspec(naked) void
|
|
s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
|
|
{
|
|
__asm {
|
|
mov eax, is_sse
|
|
cmp eax, 0
|
|
je s_mpv_sqr_add_prop_x86
|
|
jg s_mpv_sqr_add_prop_sse2
|
|
call s_mpi_is_sse2
|
|
mov is_sse, eax
|
|
cmp eax, 0
|
|
jg s_mpv_sqr_add_prop_sse2
|
|
s_mpv_sqr_add_prop_x86:
|
|
push ebp
|
|
mov ebp,esp
|
|
sub esp,12
|
|
push edi
|
|
push esi
|
|
push ebx
|
|
mov ebx,0 ; carry = 0
|
|
mov ecx,[ebp+12] ; a_len
|
|
mov edi,[ebp+16] ; edi = ps
|
|
cmp ecx,0
|
|
je L_31 ; jump if a_len == 0
|
|
cld
|
|
mov esi,[ebp+8] ; esi = pa
|
|
L_30:
|
|
lodsd ; eax = [ds:si]; si += 4;
|
|
mul eax
|
|
|
|
add eax,ebx ; add "carry"
|
|
adc edx,0
|
|
mov ebx,[edi]
|
|
add eax,ebx ; add low word from result
|
|
mov ebx,[edi+4]
|
|
stosd ; [es:di] = eax; di += 4;
|
|
adc edx,ebx ; add high word from result
|
|
mov ebx,0
|
|
mov eax,edx
|
|
adc ebx,0
|
|
stosd ; [es:di] = eax; di += 4;
|
|
dec ecx ; --a_len
|
|
jnz L_30 ; jmp if a_len != 0
|
|
L_31:
|
|
cmp ebx,0 ; is carry zero?
|
|
jz L_34
|
|
mov eax,[edi] ; add in current word from *c
|
|
add eax,ebx
|
|
stosd ; [es:edi] = ax; edi += 4;
|
|
jnc L_34
|
|
L_32:
|
|
mov eax,[edi] ; add in current word from *c
|
|
adc eax,0
|
|
stosd ; [es:edi] = ax; edi += 4;
|
|
jc L_32
|
|
L_34:
|
|
pop ebx
|
|
pop esi
|
|
pop edi
|
|
leave
|
|
ret
|
|
nop
|
|
s_mpv_sqr_add_prop_sse2:
|
|
push ebp
|
|
mov ebp, esp
|
|
push edi
|
|
push esi
|
|
push ebx
|
|
psubq mm2, mm2 ; carry = 0
|
|
mov ecx, [ebp+12] ; ecx = a_len
|
|
mov edi, [ebp+16]
|
|
cmp ecx, 0
|
|
je L_36 ; jmp if a_len == 0
|
|
mov esi, [ebp+8] ; esi = a
|
|
cld
|
|
L_35:
|
|
movd mm0, [esi] ; mm0 = *a
|
|
movd mm3, [edi] ; fetch the sum
|
|
add esi, 4
|
|
pmuludq mm0, mm0 ; mm0 = sqr(a)
|
|
paddq mm2, mm0 ; add the carry
|
|
paddq mm2, mm3 ; add the low word
|
|
movd mm3, [edi+4]
|
|
movd [edi], mm2 ; store the 32bit result
|
|
psrlq mm2, 32
|
|
paddq mm2, mm3 ; add the high word
|
|
movd [edi+4], mm2 ; store the 32bit result
|
|
psrlq mm2, 32 ; save the carry.
|
|
add edi, 8
|
|
dec ecx ; --a_len
|
|
jnz L_35 ; jmp if a_len != 0
|
|
L_36:
|
|
movd ebx, mm2
|
|
cmp ebx, 0 ; is carry zero?
|
|
jz L_38
|
|
mov eax, [edi]
|
|
add eax, ebx
|
|
stosd
|
|
jnc L_38
|
|
L_37:
|
|
mov eax, [edi] ; add in current word from *c
|
|
adc eax, 0
|
|
stosd ; [es:edi] = ax; edi += 4;
|
|
jc L_37
|
|
L_38:
|
|
emms
|
|
pop ebx
|
|
pop esi
|
|
pop edi
|
|
leave
|
|
ret
|
|
nop
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
|
|
* so its high bit is 1. This code is from NSPR.
|
|
*
|
|
* Dump of assembler code for function s_mpv_div_2dx1d:
|
|
*
|
|
* esp + 0: Caller's ebx
|
|
* esp + 4: return address
|
|
* esp + 8: Nhi argument
|
|
* esp + 12: Nlo argument
|
|
* esp + 16: divisor argument
|
|
* esp + 20: qp argument
|
|
* esp + 24: rp argument
|
|
* registers:
|
|
* eax:
|
|
* ebx: carry
|
|
* ecx: a_len
|
|
* edx:
|
|
* esi: a ptr
|
|
* edi: c ptr
|
|
*/
|
|
__declspec(naked) mp_err
|
|
s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
|
|
mp_digit *qp, mp_digit *rp)
|
|
{
|
|
__asm {
|
|
push ebx
|
|
mov edx,[esp+8]
|
|
mov eax,[esp+12]
|
|
mov ebx,[esp+16]
|
|
div ebx
|
|
mov ebx,[esp+20]
|
|
mov [ebx],eax
|
|
mov ebx,[esp+24]
|
|
mov [ebx],edx
|
|
xor eax,eax ; return zero
|
|
pop ebx
|
|
ret
|
|
nop
|
|
}
|
|
}
|