RetroZilla/security/nss/lib/freebl/mpi/hppa20.s
2015-10-20 23:03:22 -04:00

936 lines
29 KiB
ArmAsm
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

; ***** BEGIN LICENSE BLOCK *****
; Version: MPL 1.1/GPL 2.0/LGPL 2.1
;
; The contents of this file are subject to the Mozilla Public License Version
; 1.1 (the "License"); you may not use this file except in compliance with
; the License. You may obtain a copy of the License at
; http://www.mozilla.org/MPL/
;
; Software distributed under the License is distributed on an "AS IS" basis,
; WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
; for the specific language governing rights and limitations under the
; License.
;
; The Original Code is MAXPY multiple-precision integer arithmetic.
;
; The Initial Developer of the Original Code is
; the Hewlett-Packard Company.
; Portions created by the Initial Developer are Copyright (C) 1997
; the Initial Developer. All Rights Reserved.
;
; Contributor(s):
; coded by: William B. Ackerman
;
; Alternatively, the contents of this file may be used under the terms of
; either the GNU General Public License Version 2 or later (the "GPL"), or
; the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
; in which case the provisions of the GPL or the LGPL are applicable instead
; of those above. If you wish to allow use of your version of this file only
; under the terms of either the GPL or the LGPL, and not to allow others to
; use your version of this file under the terms of the MPL, indicate your
; decision by deleting the provisions above and replace them with the notice
; and other provisions required by the GPL or the LGPL. If you do not delete
; the provisions above, a recipient may use your version of this file under
; the terms of any one of the MPL, the GPL or the LGPL.
;
; ***** END LICENSE BLOCK *****
#ifdef __LP64__
.LEVEL 2.0W
#else
; .LEVEL 1.1
; .ALLOW 2.0N
.LEVEL 2.0N
#endif
.SPACE $TEXT$,SORT=8
.SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
; ***************************************************************
;
; maxpy_[little/big]
;
; ***************************************************************
; There is no default -- you must specify one or the other.
#define LITTLE_WORDIAN 1
#ifdef LITTLE_WORDIAN
#define EIGHT 8
#define SIXTEEN 16
#define THIRTY_TWO 32
#define UN_EIGHT -8
#define UN_SIXTEEN -16
#define UN_TWENTY_FOUR -24
#endif
#ifdef BIG_WORDIAN
#define EIGHT -8
#define SIXTEEN -16
#define THIRTY_TWO -32
#define UN_EIGHT 8
#define UN_SIXTEEN 16
#define UN_TWENTY_FOUR 24
#endif
; This performs a multiple-precision integer version of "daxpy",
; Using the selected addressing direction. "Little-wordian" means that
; the least significant word of a number is stored at the lowest address.
; "Big-wordian" means that the most significant word is at the lowest
; address. Either way, the incoming address of the vector is that
; of the least significant word. That means that, for little-wordian
; addressing, we move the address upward as we propagate carries
; from the least significant word to the most significant. For
; big-wordian we move the address downward.
; We use the following registers:
;
; r2 return PC, of course
; r26 = arg1 = length
; r25 = arg2 = address of scalar
; r24 = arg3 = multiplicand vector
; r23 = arg4 = result vector
;
; fr9 = scalar loaded once only from r25
; The cycle counts shown in the bodies below are simply the result of a
; scheduling by hand. The actual PCX-U hardware does it differently.
; The intention is that the overall speed is the same.
; The pipeline startup and shutdown code is constructed in the usual way,
; by taking the loop bodies and removing unnecessary instructions.
; We have left the comments describing cycle numbers in the code.
; These are intended for reference when comparing with the main loop,
; and have no particular relationship to actual cycle numbers.
#ifdef LITTLE_WORDIAN
maxpy_little
#else
maxpy_big
#endif
.PROC
.CALLINFO FRAME=120,ENTRY_GR=%r4
.ENTER
; Of course, real men don't use the sissy "enter" and "leave" commands.
; They write their own stack manipulation stuff. Unfortunately,
; that doesn't generate complete unwind info, whereas "enter" and
; "leave" (if the documentation is to be believed) do so. Therefore,
; we use the sissy commands. We have verified (by real-man methods)
; that the above command generates what we want:
; STW,MA %r3,128(%sp)
; STW %r4,-124(%sp)
ADDIB,< -1,%r26,$L0 ; If N = 0, exit immediately.
FLDD 0(%r25),%fr9 ; fr9 = scalar
; First startup
FLDD 0(%r24),%fr24 ; Cycle 1
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
FLDD EIGHT(%r24),%fr28 ; Cycle 8
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
FSTD %fr24,-96(%sp)
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
FSTD %fr25,-80(%sp)
LDO SIXTEEN(%r24),%r24 ; Cycle 12
FSTD %fr31,-64(%sp)
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
FSTD %fr27,-48(%sp)
; Second startup
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
FSTD %fr30,-56(%sp)
FLDD 0(%r24),%fr24
FSTD %fr26,-88(%sp) ; Cycle 2
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
FSTD %fr28,-104(%sp)
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
LDD -96(%sp),%r3
FSTD %fr29,-72(%sp)
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
LDD -64(%sp),%r19
LDD -80(%sp),%r21
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
LDD -56(%sp),%r20
ADD %r21,%r3,%r3
ADD,DC %r20,%r19,%r19 ; Cycle 7
LDD -88(%sp),%r4
SHRPD %r3,%r0,32,%r21
LDD -48(%sp),%r1
FLDD EIGHT(%r24),%fr28 ; Cycle 8
LDD -104(%sp),%r31
ADD,DC %r0,%r0,%r20
SHRPD %r19,%r3,32,%r3
LDD -72(%sp),%r29 ; Cycle 9
SHRPD %r20,%r19,32,%r20
ADD %r21,%r1,%r1
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
ADD,DC %r3,%r4,%r4
FSTD %fr24,-96(%sp)
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
ADD,DC %r0,%r20,%r20
LDD 0(%r23),%r3
FSTD %fr25,-80(%sp)
LDO SIXTEEN(%r24),%r24 ; Cycle 12
FSTD %fr31,-64(%sp)
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
ADD %r0,%r0,%r0 ; clear the carry bit
ADDIB,<= -4,%r26,$ENDLOOP ; actually happens in cycle 12
FSTD %fr27,-48(%sp)
; MFCTL %cr16,%r21 ; for timing
; STD %r21,-112(%sp)
; Here is the loop.
$LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
ADD,DC %r29,%r4,%r4
FSTD %fr30,-56(%sp)
FLDD 0(%r24),%fr24
LDO SIXTEEN(%r23),%r23 ; Cycle 2
ADD,DC %r0,%r20,%r20
FSTD %fr26,-88(%sp)
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
ADD %r3,%r1,%r1
FSTD %fr28,-104(%sp)
LDD UN_EIGHT(%r23),%r21
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
ADD,DC %r21,%r4,%r28
FSTD %fr29,-72(%sp)
LDD -96(%sp),%r3
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
ADD,DC %r20,%r31,%r22
LDD -64(%sp),%r19
LDD -80(%sp),%r21
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
ADD %r21,%r3,%r3
LDD -56(%sp),%r20
STD %r1,UN_SIXTEEN(%r23)
ADD,DC %r20,%r19,%r19 ; Cycle 7
SHRPD %r3,%r0,32,%r21
LDD -88(%sp),%r4
LDD -48(%sp),%r1
ADD,DC %r0,%r0,%r20 ; Cycle 8
SHRPD %r19,%r3,32,%r3
FLDD EIGHT(%r24),%fr28
LDD -104(%sp),%r31
SHRPD %r20,%r19,32,%r20 ; Cycle 9
ADD %r21,%r1,%r1
STD %r28,UN_EIGHT(%r23)
LDD -72(%sp),%r29
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
ADD,DC %r3,%r4,%r4
FSTD %fr24,-96(%sp)
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
ADD,DC %r0,%r20,%r20
FSTD %fr25,-80(%sp)
LDD 0(%r23),%r3
LDO SIXTEEN(%r24),%r24 ; Cycle 12
FSTD %fr31,-64(%sp)
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
ADD %r22,%r1,%r1
ADDIB,> -2,%r26,$LOOP ; actually happens in cycle 12
FSTD %fr27,-48(%sp)
$ENDLOOP
; Shutdown code, first stage.
; MFCTL %cr16,%r21 ; for timing
; STD %r21,UN_SIXTEEN(%r23)
; LDD -112(%sp),%r21
; STD %r21,UN_EIGHT(%r23)
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
ADD,DC %r29,%r4,%r4
CMPIB,= 0,%r26,$ONEMORE
FSTD %fr30,-56(%sp)
LDO SIXTEEN(%r23),%r23 ; Cycle 2
ADD,DC %r0,%r20,%r20
FSTD %fr26,-88(%sp)
ADD %r3,%r1,%r1 ; Cycle 3
FSTD %fr28,-104(%sp)
LDD UN_EIGHT(%r23),%r21
ADD,DC %r21,%r4,%r28 ; Cycle 4
FSTD %fr29,-72(%sp)
STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9
LDD -96(%sp),%r3
ADD,DC %r20,%r31,%r22 ; Cycle 5
STD %r1,UN_SIXTEEN(%r23)
$JOIN4
LDD -64(%sp),%r19
LDD -80(%sp),%r21
ADD %r21,%r3,%r3 ; Cycle 6
LDD -56(%sp),%r20
ADD,DC %r20,%r19,%r19 ; Cycle 7
SHRPD %r3,%r0,32,%r21
LDD -88(%sp),%r4
LDD -48(%sp),%r1
ADD,DC %r0,%r0,%r20 ; Cycle 8
SHRPD %r19,%r3,32,%r3
LDD -104(%sp),%r31
SHRPD %r20,%r19,32,%r20 ; Cycle 9
ADD %r21,%r1,%r1
LDD -72(%sp),%r29
ADD,DC %r3,%r4,%r4 ; Cycle 10
ADD,DC %r0,%r20,%r20 ; Cycle 11
LDD 0(%r23),%r3
ADD %r22,%r1,%r1 ; Cycle 13
; Shutdown code, second stage.
ADD,DC %r29,%r4,%r4 ; Cycle 1
LDO SIXTEEN(%r23),%r23 ; Cycle 2
ADD,DC %r0,%r20,%r20
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
ADD %r3,%r1,%r1
ADD,DC %r21,%r4,%r28 ; Cycle 4
ADD,DC %r20,%r31,%r22 ; Cycle 5
STD %r1,UN_SIXTEEN(%r23); Cycle 6
STD %r28,UN_EIGHT(%r23) ; Cycle 9
LDD 0(%r23),%r3 ; Cycle 11
; Shutdown code, third stage.
LDO SIXTEEN(%r23),%r23
ADD %r3,%r22,%r1
$JOIN1 ADD,DC %r0,%r0,%r21
CMPIB,*= 0,%r21,$L0 ; if no overflow, exit
STD %r1,UN_SIXTEEN(%r23)
; Final carry propagation
$FINAL1 LDO EIGHT(%r23),%r23
LDD UN_SIXTEEN(%r23),%r21
ADDI 1,%r21,%r21
CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry.
STD %r21,UN_SIXTEEN(%r23)
B $L0
NOP
; Here is the code that handles the difficult cases N=1, N=2, and N=3.
; We do the usual trick -- branch out of the startup code at appropriate
; points, and branch into the shutdown code.
$N_IS_SMALL
CMPIB,= 0,%r26,$N_IS_ONE
FSTD %fr24,-96(%sp) ; Cycle 10
FLDD EIGHT(%r24),%fr28 ; Cycle 8
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
FSTD %fr25,-80(%sp)
FSTD %fr31,-64(%sp) ; Cycle 12
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
FSTD %fr27,-48(%sp)
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
CMPIB,= 2,%r26,$N_IS_THREE
FSTD %fr30,-56(%sp)
; N = 2
FSTD %fr26,-88(%sp) ; Cycle 2
FSTD %fr28,-104(%sp) ; Cycle 3
LDD -96(%sp),%r3 ; Cycle 4
FSTD %fr29,-72(%sp)
B $JOIN4
ADD %r0,%r0,%r22
$N_IS_THREE
FLDD SIXTEEN(%r24),%fr24
FSTD %fr26,-88(%sp) ; Cycle 2
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
FSTD %fr28,-104(%sp)
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
LDD -96(%sp),%r3
FSTD %fr29,-72(%sp)
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
LDD -64(%sp),%r19
LDD -80(%sp),%r21
B $JOIN3
ADD %r0,%r0,%r22
$N_IS_ONE
FSTD %fr25,-80(%sp)
FSTD %fr27,-48(%sp)
FSTD %fr26,-88(%sp) ; Cycle 2
B $JOIN5
ADD %r0,%r0,%r22
; We came out of the unrolled loop with wrong parity. Do one more
; single cycle. This is quite tricky, because of the way the
; carry chains and SHRPD chains have been chopped up.
$ONEMORE
FLDD 0(%r24),%fr24
LDO SIXTEEN(%r23),%r23 ; Cycle 2
ADD,DC %r0,%r20,%r20
FSTD %fr26,-88(%sp)
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
FSTD %fr28,-104(%sp)
LDD UN_EIGHT(%r23),%r21
ADD %r3,%r1,%r1
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
ADD,DC %r21,%r4,%r28
STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
LDD -96(%sp),%r3
FSTD %fr29,-72(%sp)
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
ADD,DC %r20,%r31,%r22
LDD -64(%sp),%r19
LDD -80(%sp),%r21
STD %r1,UN_SIXTEEN(%r23); Cycle 6
$JOIN3
XMPYU %fr9L,%fr24R,%fr24
LDD -56(%sp),%r20
ADD %r21,%r3,%r3
ADD,DC %r20,%r19,%r19 ; Cycle 7
LDD -88(%sp),%r4
SHRPD %r3,%r0,32,%r21
LDD -48(%sp),%r1
LDD -104(%sp),%r31 ; Cycle 8
ADD,DC %r0,%r0,%r20
SHRPD %r19,%r3,32,%r3
LDD -72(%sp),%r29 ; Cycle 9
SHRPD %r20,%r19,32,%r20
ADD %r21,%r1,%r1
ADD,DC %r3,%r4,%r4 ; Cycle 10
FSTD %fr24,-96(%sp)
ADD,DC %r0,%r20,%r20 ; Cycle 11
LDD 0(%r23),%r3
FSTD %fr25,-80(%sp)
ADD %r22,%r1,%r1 ; Cycle 13
FSTD %fr27,-48(%sp)
; Shutdown code, stage 1-1/2.
ADD,DC %r29,%r4,%r4 ; Cycle 1
LDO SIXTEEN(%r23),%r23 ; Cycle 2
ADD,DC %r0,%r20,%r20
FSTD %fr26,-88(%sp)
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
ADD %r3,%r1,%r1
ADD,DC %r21,%r4,%r28 ; Cycle 4
STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
ADD,DC %r20,%r31,%r22 ; Cycle 5
STD %r1,UN_SIXTEEN(%r23)
$JOIN5
LDD -96(%sp),%r3 ; moved from cycle 4
LDD -80(%sp),%r21
ADD %r21,%r3,%r3 ; Cycle 6
ADD,DC %r0,%r0,%r19 ; Cycle 7
LDD -88(%sp),%r4
SHRPD %r3,%r0,32,%r21
LDD -48(%sp),%r1
SHRPD %r19,%r3,32,%r3 ; Cycle 8
ADD %r21,%r1,%r1 ; Cycle 9
ADD,DC %r3,%r4,%r4 ; Cycle 10
LDD 0(%r23),%r3 ; Cycle 11
ADD %r22,%r1,%r1 ; Cycle 13
; Shutdown code, stage 2-1/2.
ADD,DC %r0,%r4,%r4 ; Cycle 1
LDO SIXTEEN(%r23),%r23 ; Cycle 2
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
ADD %r3,%r1,%r1
STD %r1,UN_SIXTEEN(%r23)
ADD,DC %r21,%r4,%r1
B $JOIN1
LDO EIGHT(%r23),%r23
; exit
$L0
.LEAVE
; We have verified that the above command generates what we want:
; LDW -124(%sp),%r4
; BVE (%r2)
; LDW,MB -128(%sp),%r3
.PROCEND
; ***************************************************************
;
; add_diag_[little/big]
;
; ***************************************************************
; The arguments are as follows:
; r2 return PC, of course
; r26 = arg1 = length
; r25 = arg2 = vector to square
; r24 = arg3 = result vector
#ifdef LITTLE_WORDIAN
add_diag_little
#else
add_diag_big
#endif
.PROC
.CALLINFO FRAME=120,ENTRY_GR=%r4
.ENTER
ADDIB,< -1,%r26,$Z0 ; If N=0, exit immediately.
NOP
; Startup code
FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body)
XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
XMPYU %fr7L,%fr7L,%fr30
LDO SIXTEEN(%r25),%r25 ; Cycle 6
FSTD %fr29,-88(%sp)
FSTD %fr27,-72(%sp) ; Cycle 7
CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
FSTD %fr30,-96(%sp)
FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2
LDD -88(%sp),%r22 ; Cycle 3
LDD -72(%sp),%r31 ; Cycle 4
XMPYU %fr7R,%fr7R,%fr28
XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
XMPYU %fr7L,%fr7L,%fr31
LDD -96(%sp),%r20 ; Cycle 6
FSTD %fr28,-80(%sp)
ADD %r0,%r0,%r0 ; clear the carry bit
ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
FSTD %fr24,-64(%sp)
; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body".
$DIAGLOOP
SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
LDO SIXTEEN(%r25),%r25
LDD 0(%r24),%r1
FSTD %fr31,-104(%sp)
SHRPD %r0,%r31,31,%r4 ; Cycle 2
ADD,DC %r22,%r3,%r3
FLDD UN_SIXTEEN(%r25),%fr7
ADD,DC %r0,%r20,%r20 ; Cycle 3
ADD %r1,%r3,%r3
XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
LDD -80(%sp),%r21
STD %r3,0(%r24)
XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
XMPYU %fr7L,%fr7L,%fr30
LDD -64(%sp),%r29
LDD EIGHT(%r24),%r1
ADD,DC %r4,%r20,%r20 ; Cycle 6
LDD -104(%sp),%r19
FSTD %fr29,-88(%sp)
ADD %r20,%r1,%r1 ; Cycle 7
FSTD %fr27,-72(%sp)
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
LDO THIRTY_TWO(%r24),%r24
LDD UN_SIXTEEN(%r24),%r28
FSTD %fr30,-96(%sp)
SHRPD %r0,%r29,31,%r3 ; Cycle 2
ADD,DC %r21,%r4,%r4
FLDD UN_EIGHT(%r25),%fr7
STD %r1,UN_TWENTY_FOUR(%r24)
ADD,DC %r0,%r19,%r19 ; Cycle 3
ADD %r28,%r4,%r4
XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4
LDD -88(%sp),%r22
STD %r4,UN_SIXTEEN(%r24)
XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
XMPYU %fr7L,%fr7L,%fr31
LDD -72(%sp),%r31
LDD UN_EIGHT(%r24),%r28
ADD,DC %r3,%r19,%r19 ; Cycle 6
LDD -96(%sp),%r20
FSTD %fr28,-80(%sp)
ADD %r19,%r28,%r28 ; Cycle 7
FSTD %fr24,-64(%sp)
ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8
STD %r28,UN_EIGHT(%r24)
$ENDDIAGLOOP
ADD,DC %r0,%r22,%r22
CMPIB,= 0,%r26,$ONEMOREDIAG
SHRPD %r31,%r0,31,%r3
; Shutdown code, first stage.
FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
LDD 0(%r24),%r28
SHRPD %r0,%r31,31,%r4 ; Cycle 2
ADD %r3,%r22,%r3
ADD,DC %r0,%r20,%r20 ; Cycle 3
LDD -80(%sp),%r21
ADD %r3,%r28,%r3
LDD -64(%sp),%r29 ; Cycle 4
STD %r3,0(%r24)
LDD EIGHT(%r24),%r1 ; Cycle 5
LDO SIXTEEN(%r25),%r25 ; Cycle 6
LDD -104(%sp),%r19
ADD,DC %r4,%r20,%r20
ADD %r20,%r1,%r1 ; Cycle 7
ADD,DC %r0,%r21,%r21 ; Cycle 8
STD %r1,EIGHT(%r24)
; Shutdown code, second stage.
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
LDO THIRTY_TWO(%r24),%r24
LDD UN_SIXTEEN(%r24),%r1
SHRPD %r0,%r29,31,%r3 ; Cycle 2
ADD %r4,%r21,%r4
ADD,DC %r0,%r19,%r19 ; Cycle 3
ADD %r4,%r1,%r4
STD %r4,UN_SIXTEEN(%r24); Cycle 4
LDD UN_EIGHT(%r24),%r28 ; Cycle 5
ADD,DC %r3,%r19,%r19 ; Cycle 6
ADD %r19,%r28,%r28 ; Cycle 7
ADD,DC %r0,%r0,%r22 ; Cycle 8
CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit
STD %r28,UN_EIGHT(%r24)
; Final carry propagation
$FDIAG2
LDO EIGHT(%r24),%r24
LDD UN_EIGHT(%r24),%r26
ADDI 1,%r26,%r26
CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry.
STD %r26,UN_EIGHT(%r24)
B $Z0
NOP
; Here is the code that handles the difficult case N=1.
; We do the usual trick -- branch out of the startup code at appropriate
; points, and branch into the shutdown code.
$DIAG_N_IS_ONE
LDD -88(%sp),%r22
LDD -72(%sp),%r31
B $JOINDIAG
LDD -96(%sp),%r20
; We came out of the unrolled loop with wrong parity. Do one more
; single cycle. This is the "alternate body". It will, of course,
; give us opposite registers from the other case, so we need
; completely different shutdown code.
$ONEMOREDIAG
FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
LDD 0(%r24),%r28
FLDD 0(%r25),%fr7 ; Cycle 2
SHRPD %r0,%r31,31,%r4
ADD %r3,%r22,%r3
ADD,DC %r0,%r20,%r20 ; Cycle 3
LDD -80(%sp),%r21
ADD %r3,%r28,%r3
LDD -64(%sp),%r29 ; Cycle 4
STD %r3,0(%r24)
XMPYU %fr7R,%fr7R,%fr29
LDD EIGHT(%r24),%r1 ; Cycle 5
XMPYU %fr7L,%fr7R,%fr27
XMPYU %fr7L,%fr7L,%fr30
LDD -104(%sp),%r19 ; Cycle 6
FSTD %fr29,-88(%sp)
ADD,DC %r4,%r20,%r20
FSTD %fr27,-72(%sp) ; Cycle 7
ADD %r20,%r1,%r1
ADD,DC %r0,%r21,%r21 ; Cycle 8
STD %r1,EIGHT(%r24)
; Shutdown code, first stage.
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
LDO THIRTY_TWO(%r24),%r24
FSTD %fr30,-96(%sp)
LDD UN_SIXTEEN(%r24),%r1
SHRPD %r0,%r29,31,%r3 ; Cycle 2
ADD %r4,%r21,%r4
ADD,DC %r0,%r19,%r19 ; Cycle 3
LDD -88(%sp),%r22
ADD %r4,%r1,%r4
LDD -72(%sp),%r31 ; Cycle 4
STD %r4,UN_SIXTEEN(%r24)
LDD UN_EIGHT(%r24),%r28 ; Cycle 5
LDD -96(%sp),%r20 ; Cycle 6
ADD,DC %r3,%r19,%r19
ADD %r19,%r28,%r28 ; Cycle 7
ADD,DC %r0,%r22,%r22 ; Cycle 8
STD %r28,UN_EIGHT(%r24)
; Shutdown code, second stage.
$JOINDIAG
SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
LDD 0(%r24),%r28
SHRPD %r0,%r31,31,%r4 ; Cycle 2
ADD %r3,%r22,%r3
ADD,DC %r0,%r20,%r20 ; Cycle 3
ADD %r3,%r28,%r3
STD %r3,0(%r24) ; Cycle 4
LDD EIGHT(%r24),%r1 ; Cycle 5
ADD,DC %r4,%r20,%r20
ADD %r20,%r1,%r1 ; Cycle 7
ADD,DC %r0,%r0,%r21 ; Cycle 8
CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit
STD %r1,EIGHT(%r24)
; Final carry propagation
$FDIAG1
LDO EIGHT(%r24),%r24
LDD EIGHT(%r24),%r26
ADDI 1,%r26,%r26
CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry.
STD %r26,EIGHT(%r24)
$Z0
.LEAVE
.PROCEND
; .ALLOW
.SPACE $TEXT$
.SUBSPA $CODE$
#ifdef LITTLE_WORDIAN
.EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
.EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
#else
.EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
.EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
#endif
.END
; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
;
; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
; performs a 64-bit x any-size multiply, and adds the
; result to an area of memory. That is, it performs
; something like
;
; A B C D
; * Z
; __________
; P Q R S T
;
; and then adds the "PQRST" vector into an area of memory,
; handling all carries.
;
; Digression on nomenclature and endian-ness:
;
; Each of the capital letters in the above represents a 64-bit
; quantity. That is, you could think of the discussion as
; being in terms of radix-16-quintillion arithmetic. The data
; type being manipulated is "unsigned long long int". This
; requires the 64-bit extension of the HP-UX C compiler,
; available at release 10. You need these compiler flags to
; enable these extensions:
;
; -Aa +e +DA2.0 +DS2.0
;
; (The first specifies ANSI C, the second enables the
; extensions, which are beyond ANSI C, and the third and
; fourth tell the compiler to use whatever features of the
; PA2.0 architecture it wishes, in order to made the code more
; efficient. Since the presence of the assembly code will
; make the program unable to run on anything less than PA2.0,
; you might as well gain the performance enhancements in the C
; code as well.)
;
; Questions of "endian-ness" often come up, usually in the
; context of byte ordering in a word. These routines have a
; similar issue, that could be called "wordian-ness".
; Independent of byte ordering (PA is always big-endian), one
; can make two choices when representing extremely large
; numbers as arrays of 64-bit doublewords in memory.
;
; "Little-wordian" layout means that the least significant
; word of a number is stored at the lowest address.
;
; MSW LSW
; | |
; V V
;
; A B C D E
;
; ^ ^ ^
; | | |____ address 0
; | |
; | |_______address 8
; |
; address 32
;
; "Big-wordian" means that the most significant word is at the
; lowest address.
;
; MSW LSW
; | |
; V V
;
; A B C D E
;
; ^ ^ ^
; | | |____ address 32
; | |
; | |_______address 24
; |
; address 0
;
; When you compile the file, you must specify one or the other, with
; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
;
; Incidentally, you assemble this file as part of your
; project with the same C compiler as the rest of the program.
; My "makefile" for a superprecision arithmetic package has
; the following stuff:
;
; # definitions:
; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
; CFLAGS = +O3
; LDFLAGS = -L /usr/lib -Wl,-aarchive
;
; # general build rule for ".s" files:
; .s.o:
; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
;
; # Now any bind step that calls for pa20.o will assemble pa20.s
;
; End of digression, back to arithmetic:
;
; The way we multiply two huge numbers is, of course, to multiply
; the "ABCD" vector by each of the "WXYZ" doublewords, adding
; the result vectors with increasing offsets, the way we learned
; in school, back before we all used calculators:
;
; A B C D
; * W X Y Z
; __________
; P Q R S T
; E F G H I
; M N O P Q
; + R S T U V
; _______________
; F I N A L S U M
;
; So we call maxpy_PA20_big (in my case; my package is
; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
; in turn as the "scalar", and giving the "ABCD" vector each
; time. We direct it to add its result into an area of memory
; that we have cleared at the start. We skew the exact
; location into that area with each call.
;
; The prototype for the function is
;
; extern void maxpy_PA20_big(
; int length, /* Number of doublewords in the multiplicand vector. */
; const long long int *scalaraddr, /* Address to fetch the scalar. */
; const long long int *multiplicand, /* The multiplicand vector. */
; long long int *result); /* Where to accumulate the result. */
;
; (You should place a copy of this prototype in an include file
; or in your C file.)
;
; Now, IN ALL CASES, the given address for the multiplicand or
; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
; That word is, of course, the word at which the routine
; starts processing. "maxpy_PA20_little" then increases the
; addresses as it computes. "maxpy_PA20_big" decreases them.
;
; In our example above, "length" would be 4 in each case.
; "multiplicand" would be the "ABCD" vector. Specifically,
; the address of the element "D". "scalaraddr" would be the
; address of "W", "X", "Y", or "Z" on the four calls that we
; would make. (The order doesn't matter, of course.)
; "result" would be the appropriate address in the result
; area. When multiplying by "Z", that would be the least
; significant word. When multiplying by "Y", it would be the
; next higher word (8 bytes higher if little-wordian; 8 bytes
; lower if big-wordian), and so on. The size of the result
; area must be the the sum of the sizes of the multiplicand
; and multiplier vectors, and must be initialized to zero
; before we start.
;
; Whenever the routine adds its partial product into the result
; vector, it follows carry chains as far as they need to go.
;
; Here is the super-precision multiply routine that I use for
; my package. The package is big-wordian. I have taken out
; handling of exponents (it's a floating point package):
;
; static void mul_PA20(
; int size,
; const long long int *arg1,
; const long long int *arg2,
; long long int *result)
; {
; int i;
;
; for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
;
; for (i=0 ; i<size ; i++) {
; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
; }
; }