# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is the Solaris software cryptographic token.
#
# The Initial Developer of the Original Code is
# Sun Microsystems, Inc.
# Portions created by the Initial Developer are Copyright (C) 2005
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Sun Microsystems, Inc.
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****

# ------------------------------------------------------------------------
#
#  Implementation of s_mpv_mul_set_vec which exploits
#  the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r = a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
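#
# For reference, a rough C-level sketch of the operation implemented below
# (illustrative only, not part of this file's build; it assumes a compiler
# with the unsigned __int128 extension for the 64x64->128 bit product):
#
#   uint64_t
#   s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#   {
#       uint64_t cy = 0;                    /* running carry digit */
#       for (int i = 0; i < len; i++) {
#           unsigned __int128 p = (unsigned __int128)a[i] * digit + cy;
#           r[i] = (uint64_t)p;             /* low 64 bits of the product */
#           cy = (uint64_t)(p >> 64);       /* high 64 bits carry forward */
#       }
#       return (cy);
#   }
#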
#
# uint64_t
# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
	xorq	%rax, %rax		# if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L17

	movq	%rdx, %r8		# Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		# cy = 0

.L15:
	cmpq	$8, %r8			# 8 - len
	jb	.L16

	movq	0(%rsi), %rax		# rax = a[0]
	movq	8(%rsi), %r11		# prefetch a[1]
	mulq	%rcx			# p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		# prefetch a[2]
	mulq	%rcx			# p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		# prefetch a[3]
	mulq	%rcx			# p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		# prefetch a[4]
	mulq	%rcx			# p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		# prefetch a[5]
	mulq	%rcx			# p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		# prefetch a[6]
	mulq	%rcx			# p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		# prefetch a[7]
	mulq	%rcx			# p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			# p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 56(%rdi)		# r[7] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	addq	$64, %rsi
	addq	$64, %rdi

	subq	$8, %r8
	jz	.L17
	jmp	.L15

.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			# p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			# p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			# p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			# p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			# p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			# p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

	movq	48(%rsi), %rax
	mulq	%rcx			# p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L17

.L17:
	movq	%r9, %rax
	ret

	.size	s_mpv_mul_set_vec64, [.-s_mpv_mul_set_vec64]

# ------------------------------------------------------------------------
#
#  Implementation of s_mpv_mul_add_vec which exploits
#  the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r += a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
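#
# For reference, a rough C-level sketch of the operation implemented below
# (illustrative only, not part of this file's build; it assumes a compiler
# with the unsigned __int128 extension for the 64x64->128 bit product):
#
#   uint64_t
#   s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#   {
#       uint64_t cy = 0;                    /* running carry digit */
#       for (int i = 0; i < len; i++) {
#           unsigned __int128 p = (unsigned __int128)a[i] * digit;
#           p += r[i];                      /* accumulate into r[i] */
#           p += cy;                        /* add the incoming carry */
#           r[i] = (uint64_t)p;             /* low 64 bits of the sum */
#           cy = (uint64_t)(p >> 64);       /* high 64 bits carry forward */
#       }
#       return (cy);
#   }
#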
#
# uint64_t
# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
	xorq	%rax, %rax		# if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L27

	movq	%rdx, %r8		# Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		# cy = 0

.L25:
	cmpq	$8, %r8			# 8 - len
	jb	.L26

	movq	0(%rsi), %rax		# rax = a[0]
	movq	0(%rdi), %r10		# r10 = r[0]
	movq	8(%rsi), %r11		# prefetch a[1]
	mulq	%rcx			# p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[0]
	movq	8(%rdi), %r10		# prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		# prefetch a[2]
	mulq	%rcx			# p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[1]
	movq	16(%rdi), %r10		# prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		# prefetch a[3]
	mulq	%rcx			# p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[2]
	movq	24(%rdi), %r10		# prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		# prefetch a[4]
	mulq	%rcx			# p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[3]
	movq	32(%rdi), %r10		# prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		# prefetch a[5]
	mulq	%rcx			# p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[4]
	movq	40(%rdi), %r10		# prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		# prefetch a[6]
	mulq	%rcx			# p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[5]
	movq	48(%rdi), %r10		# prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		# prefetch a[7]
	mulq	%rcx			# p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[6]
	movq	56(%rdi), %r10		# prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			# p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 56(%rdi)		# r[7] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	addq	$64, %rsi
	addq	$64, %rdi

	subq	$8, %r8
	jz	.L27
	jmp	.L25

.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			# p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			# p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			# p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			# p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			# p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			# p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			# p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

.L27:
	movq	%r9, %rax
	ret

	.size	s_mpv_mul_add_vec64, [.-s_mpv_mul_add_vec64]

# Magic indicating no need for an executable stack
.section .note.GNU-stack, "", @progbits
.previous