mirror of
https://github.com/rn10950/RetroZilla.git
synced 2024-11-16 04:20:32 +01:00
390 lines
8.7 KiB
ArmAsm
390 lines
8.7 KiB
ArmAsm
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
|
|
# ------------------------------------------------------------------------
|
|
#
|
|
# Implementation of s_mpv_mul_set_vec which exploits
|
|
# the 64X64->128 bit unsigned multiply instruction.
|
|
#
|
|
# ------------------------------------------------------------------------
|
|
|
|
# r = a * digit, r and a are vectors of length len
|
|
# returns the carry digit
|
|
# r and a are 64 bit aligned.
|
|
#
|
|
# uint64_t
|
|
# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
|
|
#
|
|
|
|
.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
|
|
|
|
xorq %rax, %rax # if (len == 0) return (0)
|
|
testq %rdx, %rdx
|
|
jz .L17
|
|
|
|
movq %rdx, %r8 # Use r8 for len; %rdx is used by mul
|
|
xorq %r9, %r9 # cy = 0
|
|
|
|
.L15:
|
|
cmpq $8, %r8 # 8 - len
|
|
jb .L16
|
|
movq 0(%rsi), %rax # rax = a[0]
|
|
movq 8(%rsi), %r11 # prefetch a[1]
|
|
mulq %rcx # p = a[0] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 0(%rdi) # r[0] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 16(%rsi), %r11 # prefetch a[2]
|
|
mulq %rcx # p = a[1] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 8(%rdi) # r[1] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 24(%rsi), %r11 # prefetch a[3]
|
|
mulq %rcx # p = a[2] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 16(%rdi) # r[2] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 32(%rsi), %r11 # prefetch a[4]
|
|
mulq %rcx # p = a[3] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 24(%rdi) # r[3] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 40(%rsi), %r11 # prefetch a[5]
|
|
mulq %rcx # p = a[4] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 32(%rdi) # r[4] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 48(%rsi), %r11 # prefetch a[6]
|
|
mulq %rcx # p = a[5] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 40(%rdi) # r[5] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 56(%rsi), %r11 # prefetch a[7]
|
|
mulq %rcx # p = a[6] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 48(%rdi) # r[6] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
mulq %rcx # p = a[7] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 56(%rdi) # r[7] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
addq $64, %rsi
|
|
addq $64, %rdi
|
|
subq $8, %r8
|
|
|
|
jz .L17
|
|
jmp .L15
|
|
|
|
.L16:
|
|
movq 0(%rsi), %rax
|
|
mulq %rcx # p = a[0] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 0(%rdi) # r[0] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L17
|
|
|
|
movq 8(%rsi), %rax
|
|
mulq %rcx # p = a[1] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 8(%rdi) # r[1] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L17
|
|
|
|
movq 16(%rsi), %rax
|
|
mulq %rcx # p = a[2] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 16(%rdi) # r[2] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L17
|
|
|
|
movq 24(%rsi), %rax
|
|
mulq %rcx # p = a[3] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 24(%rdi) # r[3] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L17
|
|
|
|
movq 32(%rsi), %rax
|
|
mulq %rcx # p = a[4] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 32(%rdi) # r[4] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L17
|
|
|
|
movq 40(%rsi), %rax
|
|
mulq %rcx # p = a[5] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 40(%rdi) # r[5] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L17
|
|
|
|
movq 48(%rsi), %rax
|
|
mulq %rcx # p = a[6] * digit
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 48(%rdi) # r[6] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L17
|
|
|
|
|
|
.L17:
|
|
movq %r9, %rax
|
|
ret
|
|
|
|
.size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64
|
|
|
|
# ------------------------------------------------------------------------
|
|
#
|
|
# Implementation of s_mpv_mul_add_vec which exploits
|
|
# the 64X64->128 bit unsigned multiply instruction.
|
|
#
|
|
# ------------------------------------------------------------------------
|
|
|
|
# r += a * digit, r and a are vectors of length len
|
|
# returns the carry digit
|
|
# r and a are 64 bit aligned.
|
|
#
|
|
# uint64_t
|
|
# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
|
|
#
|
|
|
|
.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
|
|
|
|
xorq %rax, %rax # if (len == 0) return (0)
|
|
testq %rdx, %rdx
|
|
jz .L27
|
|
|
|
movq %rdx, %r8 # Use r8 for len; %rdx is used by mul
|
|
xorq %r9, %r9 # cy = 0
|
|
|
|
.L25:
|
|
cmpq $8, %r8 # 8 - len
|
|
jb .L26
|
|
movq 0(%rsi), %rax # rax = a[0]
|
|
movq 0(%rdi), %r10 # r10 = r[0]
|
|
movq 8(%rsi), %r11 # prefetch a[1]
|
|
mulq %rcx # p = a[0] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[0]
|
|
movq 8(%rdi), %r10 # prefetch r[1]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 0(%rdi) # r[0] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 16(%rsi), %r11 # prefetch a[2]
|
|
mulq %rcx # p = a[1] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[1]
|
|
movq 16(%rdi), %r10 # prefetch r[2]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 8(%rdi) # r[1] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 24(%rsi), %r11 # prefetch a[3]
|
|
mulq %rcx # p = a[2] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[2]
|
|
movq 24(%rdi), %r10 # prefetch r[3]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 16(%rdi) # r[2] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 32(%rsi), %r11 # prefetch a[4]
|
|
mulq %rcx # p = a[3] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[3]
|
|
movq 32(%rdi), %r10 # prefetch r[4]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 24(%rdi) # r[3] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 40(%rsi), %r11 # prefetch a[5]
|
|
mulq %rcx # p = a[4] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[4]
|
|
movq 40(%rdi), %r10 # prefetch r[5]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 32(%rdi) # r[4] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 48(%rsi), %r11 # prefetch a[6]
|
|
mulq %rcx # p = a[5] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[5]
|
|
movq 48(%rdi), %r10 # prefetch r[6]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 40(%rdi) # r[5] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
movq 56(%rsi), %r11 # prefetch a[7]
|
|
mulq %rcx # p = a[6] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[6]
|
|
movq 56(%rdi), %r10 # prefetch r[7]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 48(%rdi) # r[6] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
movq %r11, %rax
|
|
mulq %rcx # p = a[7] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[7]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 56(%rdi) # r[7] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
|
|
addq $64, %rsi
|
|
addq $64, %rdi
|
|
subq $8, %r8
|
|
|
|
jz .L27
|
|
jmp .L25
|
|
|
|
.L26:
|
|
movq 0(%rsi), %rax
|
|
movq 0(%rdi), %r10
|
|
mulq %rcx # p = a[0] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[0]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 0(%rdi) # r[0] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L27
|
|
|
|
movq 8(%rsi), %rax
|
|
movq 8(%rdi), %r10
|
|
mulq %rcx # p = a[1] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[1]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 8(%rdi) # r[1] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L27
|
|
|
|
movq 16(%rsi), %rax
|
|
movq 16(%rdi), %r10
|
|
mulq %rcx # p = a[2] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[2]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 16(%rdi) # r[2] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L27
|
|
|
|
movq 24(%rsi), %rax
|
|
movq 24(%rdi), %r10
|
|
mulq %rcx # p = a[3] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[3]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 24(%rdi) # r[3] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L27
|
|
|
|
movq 32(%rsi), %rax
|
|
movq 32(%rdi), %r10
|
|
mulq %rcx # p = a[4] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[4]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 32(%rdi) # r[4] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L27
|
|
|
|
movq 40(%rsi), %rax
|
|
movq 40(%rdi), %r10
|
|
mulq %rcx # p = a[5] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[5]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 40(%rdi) # r[5] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L27
|
|
|
|
movq 48(%rsi), %rax
|
|
movq 48(%rdi), %r10
|
|
mulq %rcx # p = a[6] * digit
|
|
addq %r10, %rax
|
|
adcq $0, %rdx # p += r[6]
|
|
addq %r9, %rax
|
|
adcq $0, %rdx # p += cy
|
|
movq %rax, 48(%rdi) # r[6] = lo(p)
|
|
movq %rdx, %r9 # cy = hi(p)
|
|
decq %r8
|
|
jz .L27
|
|
|
|
|
|
.L27:
|
|
movq %r9, %rax
|
|
ret
|
|
|
|
.size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64
|
|
|
|
# Magic indicating no need for an executable stack
|
|
.section .note.GNU-stack, "", @progbits
|
|
.previous
|