RetroZilla/security/nss/lib/freebl/mpi/hppa20.s

905 lines
28 KiB
ArmAsm
Raw Normal View History

2018-05-04 16:08:28 +02:00
; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.
2015-10-21 05:03:22 +02:00
#ifdef __LP64__
.LEVEL 2.0W
#else
; .LEVEL 1.1
; .ALLOW 2.0N
2018-05-04 16:08:28 +02:00
.LEVEL 2.0
2015-10-21 05:03:22 +02:00
#endif
.SPACE $TEXT$,SORT=8
.SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
; ***************************************************************
;
; maxpy_[little/big]
;
; ***************************************************************
; There is no default -- you must specify one or the other.
#define LITTLE_WORDIAN 1
#ifdef LITTLE_WORDIAN
#define EIGHT 8
#define SIXTEEN 16
#define THIRTY_TWO 32
#define UN_EIGHT -8
#define UN_SIXTEEN -16
#define UN_TWENTY_FOUR -24
#endif
#ifdef BIG_WORDIAN
#define EIGHT -8
#define SIXTEEN -16
#define THIRTY_TWO -32
#define UN_EIGHT 8
#define UN_SIXTEEN 16
#define UN_TWENTY_FOUR 24
#endif
; This performs a multiple-precision integer version of "daxpy",
; Using the selected addressing direction. "Little-wordian" means that
; the least significant word of a number is stored at the lowest address.
; "Big-wordian" means that the most significant word is at the lowest
; address. Either way, the incoming address of the vector is that
; of the least significant word. That means that, for little-wordian
; addressing, we move the address upward as we propagate carries
; from the least significant word to the most significant. For
; big-wordian we move the address downward.
; We use the following registers:
;
; r2 return PC, of course
; r26 = arg1 = length
; r25 = arg2 = address of scalar
; r24 = arg3 = multiplicand vector
; r23 = arg4 = result vector
;
; fr9 = scalar loaded once only from r25
; The cycle counts shown in the bodies below are simply the result of a
; scheduling by hand. The actual PCX-U hardware does it differently.
; The intention is that the overall speed is the same.
; The pipeline startup and shutdown code is constructed in the usual way,
; by taking the loop bodies and removing unnecessary instructions.
; We have left the comments describing cycle numbers in the code.
; These are intended for reference when comparing with the main loop,
; and have no particular relationship to actual cycle numbers.
#ifdef LITTLE_WORDIAN
maxpy_little
#else
maxpy_big
#endif
.PROC
2018-05-04 16:08:28 +02:00
.CALLINFO FRAME=120,ENTRY_GR=4
.ENTRY
STW,MA %r3,128(%sp)
STW %r4,-124(%sp)
2015-10-21 05:03:22 +02:00
ADDIB,< -1,%r26,$L0 ; If N = 0, exit immediately.
FLDD 0(%r25),%fr9 ; fr9 = scalar
; First startup
FLDD 0(%r24),%fr24 ; Cycle 1
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
FLDD EIGHT(%r24),%fr28 ; Cycle 8
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
FSTD %fr24,-96(%sp)
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
FSTD %fr25,-80(%sp)
LDO SIXTEEN(%r24),%r24 ; Cycle 12
FSTD %fr31,-64(%sp)
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
FSTD %fr27,-48(%sp)
; Second startup
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
FSTD %fr30,-56(%sp)
FLDD 0(%r24),%fr24
FSTD %fr26,-88(%sp) ; Cycle 2
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
FSTD %fr28,-104(%sp)
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
LDD -96(%sp),%r3
FSTD %fr29,-72(%sp)
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
LDD -64(%sp),%r19
LDD -80(%sp),%r21
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
LDD -56(%sp),%r20
ADD %r21,%r3,%r3
ADD,DC %r20,%r19,%r19 ; Cycle 7
LDD -88(%sp),%r4
SHRPD %r3,%r0,32,%r21
LDD -48(%sp),%r1
FLDD EIGHT(%r24),%fr28 ; Cycle 8
LDD -104(%sp),%r31
ADD,DC %r0,%r0,%r20
SHRPD %r19,%r3,32,%r3
LDD -72(%sp),%r29 ; Cycle 9
SHRPD %r20,%r19,32,%r20
ADD %r21,%r1,%r1
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
ADD,DC %r3,%r4,%r4
FSTD %fr24,-96(%sp)
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
ADD,DC %r0,%r20,%r20
LDD 0(%r23),%r3
FSTD %fr25,-80(%sp)
LDO SIXTEEN(%r24),%r24 ; Cycle 12
FSTD %fr31,-64(%sp)
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
ADD %r0,%r0,%r0 ; clear the carry bit
ADDIB,<= -4,%r26,$ENDLOOP ; actually happens in cycle 12
FSTD %fr27,-48(%sp)
; MFCTL %cr16,%r21 ; for timing
; STD %r21,-112(%sp)
; Here is the loop.
$LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
ADD,DC %r29,%r4,%r4
FSTD %fr30,-56(%sp)
FLDD 0(%r24),%fr24
LDO SIXTEEN(%r23),%r23 ; Cycle 2
ADD,DC %r0,%r20,%r20
FSTD %fr26,-88(%sp)
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
ADD %r3,%r1,%r1
FSTD %fr28,-104(%sp)
LDD UN_EIGHT(%r23),%r21
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
ADD,DC %r21,%r4,%r28
FSTD %fr29,-72(%sp)
LDD -96(%sp),%r3
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
ADD,DC %r20,%r31,%r22
LDD -64(%sp),%r19
LDD -80(%sp),%r21
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
ADD %r21,%r3,%r3
LDD -56(%sp),%r20
STD %r1,UN_SIXTEEN(%r23)
ADD,DC %r20,%r19,%r19 ; Cycle 7
SHRPD %r3,%r0,32,%r21
LDD -88(%sp),%r4
LDD -48(%sp),%r1
ADD,DC %r0,%r0,%r20 ; Cycle 8
SHRPD %r19,%r3,32,%r3
FLDD EIGHT(%r24),%fr28
LDD -104(%sp),%r31
SHRPD %r20,%r19,32,%r20 ; Cycle 9
ADD %r21,%r1,%r1
STD %r28,UN_EIGHT(%r23)
LDD -72(%sp),%r29
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
ADD,DC %r3,%r4,%r4
FSTD %fr24,-96(%sp)
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
ADD,DC %r0,%r20,%r20
FSTD %fr25,-80(%sp)
LDD 0(%r23),%r3
LDO SIXTEEN(%r24),%r24 ; Cycle 12
FSTD %fr31,-64(%sp)
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
ADD %r22,%r1,%r1
ADDIB,> -2,%r26,$LOOP ; actually happens in cycle 12
FSTD %fr27,-48(%sp)
$ENDLOOP
; Shutdown code, first stage.
; MFCTL %cr16,%r21 ; for timing
; STD %r21,UN_SIXTEEN(%r23)
; LDD -112(%sp),%r21
; STD %r21,UN_EIGHT(%r23)
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
ADD,DC %r29,%r4,%r4
CMPIB,= 0,%r26,$ONEMORE
FSTD %fr30,-56(%sp)
LDO SIXTEEN(%r23),%r23 ; Cycle 2
ADD,DC %r0,%r20,%r20
FSTD %fr26,-88(%sp)
ADD %r3,%r1,%r1 ; Cycle 3
FSTD %fr28,-104(%sp)
LDD UN_EIGHT(%r23),%r21
ADD,DC %r21,%r4,%r28 ; Cycle 4
FSTD %fr29,-72(%sp)
STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9
LDD -96(%sp),%r3
ADD,DC %r20,%r31,%r22 ; Cycle 5
STD %r1,UN_SIXTEEN(%r23)
$JOIN4
LDD -64(%sp),%r19
LDD -80(%sp),%r21
ADD %r21,%r3,%r3 ; Cycle 6
LDD -56(%sp),%r20
ADD,DC %r20,%r19,%r19 ; Cycle 7
SHRPD %r3,%r0,32,%r21
LDD -88(%sp),%r4
LDD -48(%sp),%r1
ADD,DC %r0,%r0,%r20 ; Cycle 8
SHRPD %r19,%r3,32,%r3
LDD -104(%sp),%r31
SHRPD %r20,%r19,32,%r20 ; Cycle 9
ADD %r21,%r1,%r1
LDD -72(%sp),%r29
ADD,DC %r3,%r4,%r4 ; Cycle 10
ADD,DC %r0,%r20,%r20 ; Cycle 11
LDD 0(%r23),%r3
ADD %r22,%r1,%r1 ; Cycle 13
; Shutdown code, second stage.
ADD,DC %r29,%r4,%r4 ; Cycle 1
LDO SIXTEEN(%r23),%r23 ; Cycle 2
ADD,DC %r0,%r20,%r20
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
ADD %r3,%r1,%r1
ADD,DC %r21,%r4,%r28 ; Cycle 4
ADD,DC %r20,%r31,%r22 ; Cycle 5
STD %r1,UN_SIXTEEN(%r23); Cycle 6
STD %r28,UN_EIGHT(%r23) ; Cycle 9
LDD 0(%r23),%r3 ; Cycle 11
; Shutdown code, third stage.
LDO SIXTEEN(%r23),%r23
ADD %r3,%r22,%r1
$JOIN1 ADD,DC %r0,%r0,%r21
CMPIB,*= 0,%r21,$L0 ; if no overflow, exit
STD %r1,UN_SIXTEEN(%r23)
; Final carry propagation
$FINAL1 LDO EIGHT(%r23),%r23
LDD UN_SIXTEEN(%r23),%r21
ADDI 1,%r21,%r21
CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry.
STD %r21,UN_SIXTEEN(%r23)
B $L0
NOP
; Here is the code that handles the difficult cases N=1, N=2, and N=3.
; We do the usual trick -- branch out of the startup code at appropriate
; points, and branch into the shutdown code.
$N_IS_SMALL
CMPIB,= 0,%r26,$N_IS_ONE
FSTD %fr24,-96(%sp) ; Cycle 10
FLDD EIGHT(%r24),%fr28 ; Cycle 8
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
FSTD %fr25,-80(%sp)
FSTD %fr31,-64(%sp) ; Cycle 12
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
FSTD %fr27,-48(%sp)
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
CMPIB,= 2,%r26,$N_IS_THREE
FSTD %fr30,-56(%sp)
; N = 2
FSTD %fr26,-88(%sp) ; Cycle 2
FSTD %fr28,-104(%sp) ; Cycle 3
LDD -96(%sp),%r3 ; Cycle 4
FSTD %fr29,-72(%sp)
B $JOIN4
ADD %r0,%r0,%r22
$N_IS_THREE
FLDD SIXTEEN(%r24),%fr24
FSTD %fr26,-88(%sp) ; Cycle 2
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
FSTD %fr28,-104(%sp)
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
LDD -96(%sp),%r3
FSTD %fr29,-72(%sp)
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
LDD -64(%sp),%r19
LDD -80(%sp),%r21
B $JOIN3
ADD %r0,%r0,%r22
$N_IS_ONE
FSTD %fr25,-80(%sp)
FSTD %fr27,-48(%sp)
FSTD %fr26,-88(%sp) ; Cycle 2
B $JOIN5
ADD %r0,%r0,%r22
; We came out of the unrolled loop with wrong parity. Do one more
; single cycle. This is quite tricky, because of the way the
; carry chains and SHRPD chains have been chopped up.
$ONEMORE
FLDD 0(%r24),%fr24
LDO SIXTEEN(%r23),%r23 ; Cycle 2
ADD,DC %r0,%r20,%r20
FSTD %fr26,-88(%sp)
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
FSTD %fr28,-104(%sp)
LDD UN_EIGHT(%r23),%r21
ADD %r3,%r1,%r1
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
ADD,DC %r21,%r4,%r28
STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
LDD -96(%sp),%r3
FSTD %fr29,-72(%sp)
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
ADD,DC %r20,%r31,%r22
LDD -64(%sp),%r19
LDD -80(%sp),%r21
STD %r1,UN_SIXTEEN(%r23); Cycle 6
$JOIN3
XMPYU %fr9L,%fr24R,%fr24
LDD -56(%sp),%r20
ADD %r21,%r3,%r3
ADD,DC %r20,%r19,%r19 ; Cycle 7
LDD -88(%sp),%r4
SHRPD %r3,%r0,32,%r21
LDD -48(%sp),%r1
LDD -104(%sp),%r31 ; Cycle 8
ADD,DC %r0,%r0,%r20
SHRPD %r19,%r3,32,%r3
LDD -72(%sp),%r29 ; Cycle 9
SHRPD %r20,%r19,32,%r20
ADD %r21,%r1,%r1
ADD,DC %r3,%r4,%r4 ; Cycle 10
FSTD %fr24,-96(%sp)
ADD,DC %r0,%r20,%r20 ; Cycle 11
LDD 0(%r23),%r3
FSTD %fr25,-80(%sp)
ADD %r22,%r1,%r1 ; Cycle 13
FSTD %fr27,-48(%sp)
; Shutdown code, stage 1-1/2.
ADD,DC %r29,%r4,%r4 ; Cycle 1
LDO SIXTEEN(%r23),%r23 ; Cycle 2
ADD,DC %r0,%r20,%r20
FSTD %fr26,-88(%sp)
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
ADD %r3,%r1,%r1
ADD,DC %r21,%r4,%r28 ; Cycle 4
STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
ADD,DC %r20,%r31,%r22 ; Cycle 5
STD %r1,UN_SIXTEEN(%r23)
$JOIN5
LDD -96(%sp),%r3 ; moved from cycle 4
LDD -80(%sp),%r21
ADD %r21,%r3,%r3 ; Cycle 6
ADD,DC %r0,%r0,%r19 ; Cycle 7
LDD -88(%sp),%r4
SHRPD %r3,%r0,32,%r21
LDD -48(%sp),%r1
SHRPD %r19,%r3,32,%r3 ; Cycle 8
ADD %r21,%r1,%r1 ; Cycle 9
ADD,DC %r3,%r4,%r4 ; Cycle 10
LDD 0(%r23),%r3 ; Cycle 11
ADD %r22,%r1,%r1 ; Cycle 13
; Shutdown code, stage 2-1/2.
ADD,DC %r0,%r4,%r4 ; Cycle 1
LDO SIXTEEN(%r23),%r23 ; Cycle 2
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
ADD %r3,%r1,%r1
STD %r1,UN_SIXTEEN(%r23)
ADD,DC %r21,%r4,%r1
B $JOIN1
LDO EIGHT(%r23),%r23
; exit
$L0
2018-05-04 16:08:28 +02:00
LDW -124(%sp),%r4
BVE (%r2)
.EXIT
LDW,MB -128(%sp),%r3
2015-10-21 05:03:22 +02:00
.PROCEND
; ***************************************************************
;
; add_diag_[little/big]
;
; ***************************************************************
; The arguments are as follows:
; r2 return PC, of course
; r26 = arg1 = length
; r25 = arg2 = vector to square
; r24 = arg3 = result vector
#ifdef LITTLE_WORDIAN
add_diag_little
#else
add_diag_big
#endif
.PROC
2018-05-04 16:08:28 +02:00
.CALLINFO FRAME=120,ENTRY_GR=4
.ENTRY
STW,MA %r3,128(%sp)
STW %r4,-124(%sp)
2015-10-21 05:03:22 +02:00
ADDIB,< -1,%r26,$Z0 ; If N=0, exit immediately.
NOP
; Startup code
FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body)
XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
XMPYU %fr7L,%fr7L,%fr30
LDO SIXTEEN(%r25),%r25 ; Cycle 6
FSTD %fr29,-88(%sp)
FSTD %fr27,-72(%sp) ; Cycle 7
CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
FSTD %fr30,-96(%sp)
FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2
LDD -88(%sp),%r22 ; Cycle 3
LDD -72(%sp),%r31 ; Cycle 4
XMPYU %fr7R,%fr7R,%fr28
XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
XMPYU %fr7L,%fr7L,%fr31
LDD -96(%sp),%r20 ; Cycle 6
FSTD %fr28,-80(%sp)
ADD %r0,%r0,%r0 ; clear the carry bit
ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
FSTD %fr24,-64(%sp)
; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body".
$DIAGLOOP
SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
LDO SIXTEEN(%r25),%r25
LDD 0(%r24),%r1
FSTD %fr31,-104(%sp)
SHRPD %r0,%r31,31,%r4 ; Cycle 2
ADD,DC %r22,%r3,%r3
FLDD UN_SIXTEEN(%r25),%fr7
ADD,DC %r0,%r20,%r20 ; Cycle 3
ADD %r1,%r3,%r3
XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
LDD -80(%sp),%r21
STD %r3,0(%r24)
XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
XMPYU %fr7L,%fr7L,%fr30
LDD -64(%sp),%r29
LDD EIGHT(%r24),%r1
ADD,DC %r4,%r20,%r20 ; Cycle 6
LDD -104(%sp),%r19
FSTD %fr29,-88(%sp)
ADD %r20,%r1,%r1 ; Cycle 7
FSTD %fr27,-72(%sp)
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
LDO THIRTY_TWO(%r24),%r24
LDD UN_SIXTEEN(%r24),%r28
FSTD %fr30,-96(%sp)
SHRPD %r0,%r29,31,%r3 ; Cycle 2
ADD,DC %r21,%r4,%r4
FLDD UN_EIGHT(%r25),%fr7
STD %r1,UN_TWENTY_FOUR(%r24)
ADD,DC %r0,%r19,%r19 ; Cycle 3
ADD %r28,%r4,%r4
XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4
LDD -88(%sp),%r22
STD %r4,UN_SIXTEEN(%r24)
XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
XMPYU %fr7L,%fr7L,%fr31
LDD -72(%sp),%r31
LDD UN_EIGHT(%r24),%r28
ADD,DC %r3,%r19,%r19 ; Cycle 6
LDD -96(%sp),%r20
FSTD %fr28,-80(%sp)
ADD %r19,%r28,%r28 ; Cycle 7
FSTD %fr24,-64(%sp)
ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8
STD %r28,UN_EIGHT(%r24)
$ENDDIAGLOOP
ADD,DC %r0,%r22,%r22
CMPIB,= 0,%r26,$ONEMOREDIAG
SHRPD %r31,%r0,31,%r3
; Shutdown code, first stage.
FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
LDD 0(%r24),%r28
SHRPD %r0,%r31,31,%r4 ; Cycle 2
ADD %r3,%r22,%r3
ADD,DC %r0,%r20,%r20 ; Cycle 3
LDD -80(%sp),%r21
ADD %r3,%r28,%r3
LDD -64(%sp),%r29 ; Cycle 4
STD %r3,0(%r24)
LDD EIGHT(%r24),%r1 ; Cycle 5
LDO SIXTEEN(%r25),%r25 ; Cycle 6
LDD -104(%sp),%r19
ADD,DC %r4,%r20,%r20
ADD %r20,%r1,%r1 ; Cycle 7
ADD,DC %r0,%r21,%r21 ; Cycle 8
STD %r1,EIGHT(%r24)
; Shutdown code, second stage.
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
LDO THIRTY_TWO(%r24),%r24
LDD UN_SIXTEEN(%r24),%r1
SHRPD %r0,%r29,31,%r3 ; Cycle 2
ADD %r4,%r21,%r4
ADD,DC %r0,%r19,%r19 ; Cycle 3
ADD %r4,%r1,%r4
STD %r4,UN_SIXTEEN(%r24); Cycle 4
LDD UN_EIGHT(%r24),%r28 ; Cycle 5
ADD,DC %r3,%r19,%r19 ; Cycle 6
ADD %r19,%r28,%r28 ; Cycle 7
ADD,DC %r0,%r0,%r22 ; Cycle 8
CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit
STD %r28,UN_EIGHT(%r24)
; Final carry propagation
$FDIAG2
LDO EIGHT(%r24),%r24
LDD UN_EIGHT(%r24),%r26
ADDI 1,%r26,%r26
CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry.
STD %r26,UN_EIGHT(%r24)
B $Z0
NOP
; Here is the code that handles the difficult case N=1.
; We do the usual trick -- branch out of the startup code at appropriate
; points, and branch into the shutdown code.
$DIAG_N_IS_ONE
LDD -88(%sp),%r22
LDD -72(%sp),%r31
B $JOINDIAG
LDD -96(%sp),%r20
; We came out of the unrolled loop with wrong parity. Do one more
; single cycle. This is the "alternate body". It will, of course,
; give us opposite registers from the other case, so we need
; completely different shutdown code.
$ONEMOREDIAG
FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
LDD 0(%r24),%r28
FLDD 0(%r25),%fr7 ; Cycle 2
SHRPD %r0,%r31,31,%r4
ADD %r3,%r22,%r3
ADD,DC %r0,%r20,%r20 ; Cycle 3
LDD -80(%sp),%r21
ADD %r3,%r28,%r3
LDD -64(%sp),%r29 ; Cycle 4
STD %r3,0(%r24)
XMPYU %fr7R,%fr7R,%fr29
LDD EIGHT(%r24),%r1 ; Cycle 5
XMPYU %fr7L,%fr7R,%fr27
XMPYU %fr7L,%fr7L,%fr30
LDD -104(%sp),%r19 ; Cycle 6
FSTD %fr29,-88(%sp)
ADD,DC %r4,%r20,%r20
FSTD %fr27,-72(%sp) ; Cycle 7
ADD %r20,%r1,%r1
ADD,DC %r0,%r21,%r21 ; Cycle 8
STD %r1,EIGHT(%r24)
; Shutdown code, first stage.
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
LDO THIRTY_TWO(%r24),%r24
FSTD %fr30,-96(%sp)
LDD UN_SIXTEEN(%r24),%r1
SHRPD %r0,%r29,31,%r3 ; Cycle 2
ADD %r4,%r21,%r4
ADD,DC %r0,%r19,%r19 ; Cycle 3
LDD -88(%sp),%r22
ADD %r4,%r1,%r4
LDD -72(%sp),%r31 ; Cycle 4
STD %r4,UN_SIXTEEN(%r24)
LDD UN_EIGHT(%r24),%r28 ; Cycle 5
LDD -96(%sp),%r20 ; Cycle 6
ADD,DC %r3,%r19,%r19
ADD %r19,%r28,%r28 ; Cycle 7
ADD,DC %r0,%r22,%r22 ; Cycle 8
STD %r28,UN_EIGHT(%r24)
; Shutdown code, second stage.
$JOINDIAG
SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
LDD 0(%r24),%r28
SHRPD %r0,%r31,31,%r4 ; Cycle 2
ADD %r3,%r22,%r3
ADD,DC %r0,%r20,%r20 ; Cycle 3
ADD %r3,%r28,%r3
STD %r3,0(%r24) ; Cycle 4
LDD EIGHT(%r24),%r1 ; Cycle 5
ADD,DC %r4,%r20,%r20
ADD %r20,%r1,%r1 ; Cycle 7
ADD,DC %r0,%r0,%r21 ; Cycle 8
CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit
STD %r1,EIGHT(%r24)
; Final carry propagation
$FDIAG1
LDO EIGHT(%r24),%r24
LDD EIGHT(%r24),%r26
ADDI 1,%r26,%r26
CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry.
STD %r26,EIGHT(%r24)
$Z0
2018-05-04 16:08:28 +02:00
LDW -124(%sp),%r4
BVE (%r2)
.EXIT
LDW,MB -128(%sp),%r3
2015-10-21 05:03:22 +02:00
.PROCEND
; .ALLOW
.SPACE $TEXT$
.SUBSPA $CODE$
#ifdef LITTLE_WORDIAN
2018-05-04 16:08:28 +02:00
#ifdef __GNUC__
; GNU-as (as of 2.19) does not support LONG_RETURN
.EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
.EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
#else
2015-10-21 05:03:22 +02:00
.EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
.EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
2018-05-04 16:08:28 +02:00
#endif
2015-10-21 05:03:22 +02:00
#else
.EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
.EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
#endif
.END
; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
;
; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
; performs a 64-bit x any-size multiply, and adds the
; result to an area of memory. That is, it performs
; something like
;
; A B C D
; * Z
; __________
; P Q R S T
;
; and then adds the "PQRST" vector into an area of memory,
; handling all carries.
;
; Digression on nomenclature and endian-ness:
;
; Each of the capital letters in the above represents a 64-bit
; quantity. That is, you could think of the discussion as
; being in terms of radix-16-quintillion arithmetic. The data
; type being manipulated is "unsigned long long int". This
; requires the 64-bit extension of the HP-UX C compiler,
; available at release 10. You need these compiler flags to
; enable these extensions:
;
; -Aa +e +DA2.0 +DS2.0
;
; (The first specifies ANSI C, the second enables the
; extensions, which are beyond ANSI C, and the third and
; fourth tell the compiler to use whatever features of the
; PA2.0 architecture it wishes, in order to made the code more
; efficient. Since the presence of the assembly code will
; make the program unable to run on anything less than PA2.0,
; you might as well gain the performance enhancements in the C
; code as well.)
;
; Questions of "endian-ness" often come up, usually in the
; context of byte ordering in a word. These routines have a
; similar issue, that could be called "wordian-ness".
; Independent of byte ordering (PA is always big-endian), one
; can make two choices when representing extremely large
; numbers as arrays of 64-bit doublewords in memory.
;
; "Little-wordian" layout means that the least significant
; word of a number is stored at the lowest address.
;
; MSW LSW
; | |
; V V
;
; A B C D E
;
; ^ ^ ^
; | | |____ address 0
; | |
; | |_______address 8
; |
; address 32
;
; "Big-wordian" means that the most significant word is at the
; lowest address.
;
; MSW LSW
; | |
; V V
;
; A B C D E
;
; ^ ^ ^
; | | |____ address 32
; | |
; | |_______address 24
; |
; address 0
;
; When you compile the file, you must specify one or the other, with
; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
;
; Incidentally, you assemble this file as part of your
; project with the same C compiler as the rest of the program.
; My "makefile" for a superprecision arithmetic package has
; the following stuff:
;
; # definitions:
; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
; CFLAGS = +O3
; LDFLAGS = -L /usr/lib -Wl,-aarchive
;
; # general build rule for ".s" files:
; .s.o:
; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
;
; # Now any bind step that calls for pa20.o will assemble pa20.s
;
; End of digression, back to arithmetic:
;
; The way we multiply two huge numbers is, of course, to multiply
; the "ABCD" vector by each of the "WXYZ" doublewords, adding
; the result vectors with increasing offsets, the way we learned
; in school, back before we all used calculators:
;
; A B C D
; * W X Y Z
; __________
; P Q R S T
; E F G H I
; M N O P Q
; + R S T U V
; _______________
; F I N A L S U M
;
; So we call maxpy_PA20_big (in my case; my package is
; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
; in turn as the "scalar", and giving the "ABCD" vector each
; time. We direct it to add its result into an area of memory
; that we have cleared at the start. We skew the exact
; location into that area with each call.
;
; The prototype for the function is
;
; extern void maxpy_PA20_big(
; int length, /* Number of doublewords in the multiplicand vector. */
; const long long int *scalaraddr, /* Address to fetch the scalar. */
; const long long int *multiplicand, /* The multiplicand vector. */
; long long int *result); /* Where to accumulate the result. */
;
; (You should place a copy of this prototype in an include file
; or in your C file.)
;
; Now, IN ALL CASES, the given address for the multiplicand or
; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
; That word is, of course, the word at which the routine
; starts processing. "maxpy_PA20_little" then increases the
; addresses as it computes. "maxpy_PA20_big" decreases them.
;
; In our example above, "length" would be 4 in each case.
; "multiplicand" would be the "ABCD" vector. Specifically,
; the address of the element "D". "scalaraddr" would be the
; address of "W", "X", "Y", or "Z" on the four calls that we
; would make. (The order doesn't matter, of course.)
; "result" would be the appropriate address in the result
; area. When multiplying by "Z", that would be the least
; significant word. When multiplying by "Y", it would be the
; next higher word (8 bytes higher if little-wordian; 8 bytes
; lower if big-wordian), and so on. The size of the result
; area must be the the sum of the sizes of the multiplicand
; and multiplier vectors, and must be initialized to zero
; before we start.
;
; Whenever the routine adds its partial product into the result
; vector, it follows carry chains as far as they need to go.
;
; Here is the super-precision multiply routine that I use for
; my package. The package is big-wordian. I have taken out
; handling of exponents (it's a floating point package):
;
; static void mul_PA20(
; int size,
; const long long int *arg1,
; const long long int *arg2,
; long long int *result)
; {
; int i;
;
; for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
;
; for (i=0 ; i<size ; i++) {
; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
; }
; }