2018-05-04 16:08:28 +02:00
|
|
|
|
; This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
|
; License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
|
; file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
2015-10-21 05:03:22 +02:00
|
|
|
|
|
|
|
|
|
; Architecture level and output subspace.
; The wide (2.0W) level is selected when the preprocessor defines
; __LP64__; every other build is assembled at level 2.0.  The older
; 1.1 level with the 2.0N allowance is kept below for reference only.
#ifdef __LP64__
        .LEVEL  2.0W
#else
;       .LEVEL  1.1
;       .ALLOW  2.0N
        .LEVEL  2.0
#endif
        .SPACE  $TEXT$,SORT=8
        .SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
|
|
|
|
|
|
|
|
|
|
; ***************************************************************
|
|
|
|
|
;
|
|
|
|
|
; maxpy_[little/big]
|
|
|
|
|
;
|
|
|
|
|
; ***************************************************************
|
|
|
|
|
|
|
|
|
|
; Word-order selection.
;
; Build with -DLITTLE_WORDIAN or -DBIG_WORDIAN.  When neither is given
; the little-wordian layout is assumed, which is what this file has
; always produced.  Asking for both at once is rejected, because the
; stride macros below and the entry point names would then disagree.
#if defined(LITTLE_WORDIAN) && defined(BIG_WORDIAN)
#error Define only one of LITTLE_WORDIAN and BIG_WORDIAN
#endif

#if !defined(LITTLE_WORDIAN) && !defined(BIG_WORDIAN)
#define LITTLE_WORDIAN 1
#endif

; Byte displacements used to walk the vectors.  The plain names step
; toward more significant doublewords; the UN_ names reach back toward
; less significant ones.  Little-wordian walks upward in memory, so the
; plain names are positive; big-wordian walks downward, so the signs flip.
#ifdef LITTLE_WORDIAN
#define EIGHT 8
#define SIXTEEN 16
#define THIRTY_TWO 32
#define UN_EIGHT -8
#define UN_SIXTEEN -16
#define UN_TWENTY_FOUR -24
#endif

#ifdef BIG_WORDIAN
#define EIGHT -8
#define SIXTEEN -16
#define THIRTY_TWO -32
#define UN_EIGHT 8
#define UN_SIXTEEN 16
#define UN_TWENTY_FOUR 24
#endif
|
|
|
|
|
|
|
|
|
|
; This performs a multiple-precision integer version of "daxpy",
|
|
|
|
|
; using the selected addressing direction. "Little-wordian" means that
|
|
|
|
|
; the least significant word of a number is stored at the lowest address.
|
|
|
|
|
; "Big-wordian" means that the most significant word is at the lowest
|
|
|
|
|
; address. Either way, the incoming address of the vector is that
|
|
|
|
|
; of the least significant word. That means that, for little-wordian
|
|
|
|
|
; addressing, we move the address upward as we propagate carries
|
|
|
|
|
; from the least significant word to the most significant. For
|
|
|
|
|
; big-wordian we move the address downward.
|
|
|
|
|
|
|
|
|
|
; We use the following registers:
|
|
|
|
|
;
|
|
|
|
|
; r2 return PC, of course
|
|
|
|
|
; r26 = arg1 = length
|
|
|
|
|
; r25 = arg2 = address of scalar
|
|
|
|
|
; r24 = arg3 = multiplicand vector
|
|
|
|
|
; r23 = arg4 = result vector
|
|
|
|
|
;
|
|
|
|
|
; fr9 = scalar loaded once only from r25
|
|
|
|
|
|
|
|
|
|
; The cycle counts shown in the bodies below are simply the result of a
|
|
|
|
|
; scheduling by hand. The actual PCX-U hardware does it differently.
|
|
|
|
|
; The intention is that the overall speed is the same.
|
|
|
|
|
|
|
|
|
|
; The pipeline startup and shutdown code is constructed in the usual way,
|
|
|
|
|
; by taking the loop bodies and removing unnecessary instructions.
|
|
|
|
|
; We have left the comments describing cycle numbers in the code.
|
|
|
|
|
; These are intended for reference when comparing with the main loop,
|
|
|
|
|
; and have no particular relationship to actual cycle numbers.
|
|
|
|
|
|
|
|
|
|
; ---------------------------------------------------------------
; maxpy_little / maxpy_big
;
; Multiply-accumulate on multiple-precision integers:
;       result += scalar * multiplicand
; The scalar is a single 64-bit doubleword; the multiplicand and the
; result are vectors of 64-bit doublewords.  Every 64x64 product is
; assembled from four 32x32 XMPYU products, which travel from the
; floating-point registers to the general registers through stack
; slots.  After the last product word has been added, any remaining
; carry is rippled through the following result doublewords.
;
; In:     r26 = length N in doublewords (N = 0 returns immediately)
;         r25 = address of the scalar
;         r24 = multiplicand, address of its least significant doubleword
;         r23 = result, address of its least significant doubleword
;         r2  = return address
; Writes: r1, r19-r22, r28, r29, r31, fr9, fr24-fr31, and the
;         argument registers r23, r24, r26.
;         r3 and r4 are stored on entry and reloaded at $L0.
; Frame:  sp is advanced by 128 bytes.  r3 lives at -128(sp) and r4 at
;         -124(sp); the slots from -104(sp) to -48(sp) hold partial
;         products.  -112(sp) is touched only by the disabled timing code.
;
; The main loop consumes two multiplicand doublewords per pass.  The
; instruction order is the hand-made schedule described in the comments
; that precede this routine; keep it intact when editing.
;
; NOTE(review): r3 and r4 are saved and restored with 32-bit STW/LDW,
; while the body loads 64-bit values into them with LDD.  Confirm that
; this is sufficient for the 2.0W build.
; ---------------------------------------------------------------
#ifdef LITTLE_WORDIAN
maxpy_little
#else
maxpy_big
#endif
        .PROC
        .CALLINFO FRAME=120,ENTRY_GR=4
        .ENTRY
        STW,MA  %r3,128(%sp)            ; save r3, then advance sp by 128
        STW     %r4,-124(%sp)           ; save r4 just above the r3 slot

        ADDIB,< -1,%r26,$L0             ; If N = 0, exit immediately.
        FLDD    0(%r25),%fr9            ; fr9 = scalar

; First startup

        FLDD    0(%r24),%fr24           ; Cycle 1
        XMPYU   %fr9R,%fr24R,%fr27      ; Cycle 3
        XMPYU   %fr9R,%fr24L,%fr25      ; Cycle 4
        XMPYU   %fr9L,%fr24L,%fr26      ; Cycle 5
        CMPIB,> 3,%r26,$N_IS_SMALL      ; Pick out cases N = 1, 2, or 3
        XMPYU   %fr9L,%fr24R,%fr24      ; Cycle 6
        FLDD    EIGHT(%r24),%fr28       ; Cycle 8
        XMPYU   %fr9L,%fr28R,%fr31      ; Cycle 10
        FSTD    %fr24,-96(%sp)
        XMPYU   %fr9R,%fr28L,%fr30      ; Cycle 11
        FSTD    %fr25,-80(%sp)
        LDO     SIXTEEN(%r24),%r24      ; Cycle 12
        FSTD    %fr31,-64(%sp)
        XMPYU   %fr9R,%fr28R,%fr29      ; Cycle 13
        FSTD    %fr27,-48(%sp)

; Second startup

        XMPYU   %fr9L,%fr28L,%fr28      ; Cycle 1
        FSTD    %fr30,-56(%sp)
        FLDD    0(%r24),%fr24

        FSTD    %fr26,-88(%sp)          ; Cycle 2

        XMPYU   %fr9R,%fr24R,%fr27      ; Cycle 3
        FSTD    %fr28,-104(%sp)

        XMPYU   %fr9R,%fr24L,%fr25      ; Cycle 4
        LDD     -96(%sp),%r3
        FSTD    %fr29,-72(%sp)

        XMPYU   %fr9L,%fr24L,%fr26      ; Cycle 5
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21

        XMPYU   %fr9L,%fr24R,%fr24      ; Cycle 6
        LDD     -56(%sp),%r20
        ADD     %r21,%r3,%r3

        ADD,DC  %r20,%r19,%r19          ; Cycle 7
        LDD     -88(%sp),%r4
        SHRPD   %r3,%r0,32,%r21
        LDD     -48(%sp),%r1

        FLDD    EIGHT(%r24),%fr28       ; Cycle 8
        LDD     -104(%sp),%r31
        ADD,DC  %r0,%r0,%r20
        SHRPD   %r19,%r3,32,%r3

        LDD     -72(%sp),%r29           ; Cycle 9
        SHRPD   %r20,%r19,32,%r20
        ADD     %r21,%r1,%r1

        XMPYU   %fr9L,%fr28R,%fr31      ; Cycle 10
        ADD,DC  %r3,%r4,%r4
        FSTD    %fr24,-96(%sp)

        XMPYU   %fr9R,%fr28L,%fr30      ; Cycle 11
        ADD,DC  %r0,%r20,%r20
        LDD     0(%r23),%r3
        FSTD    %fr25,-80(%sp)

        LDO     SIXTEEN(%r24),%r24      ; Cycle 12
        FSTD    %fr31,-64(%sp)

        XMPYU   %fr9R,%fr28R,%fr29      ; Cycle 13
        ADD     %r0,%r0,%r0             ; clear the carry bit
        ADDIB,<= -4,%r26,$ENDLOOP       ; actually happens in cycle 12
        FSTD    %fr27,-48(%sp)
;       MFCTL   %cr16,%r21              ; for timing
;       STD     %r21,-112(%sp)

; Here is the loop.

$LOOP   XMPYU   %fr9L,%fr28L,%fr28      ; Cycle 1
        ADD,DC  %r29,%r4,%r4
        FSTD    %fr30,-56(%sp)
        FLDD    0(%r24),%fr24

        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)

        XMPYU   %fr9R,%fr24R,%fr27      ; Cycle 3
        ADD     %r3,%r1,%r1
        FSTD    %fr28,-104(%sp)
        LDD     UN_EIGHT(%r23),%r21

        XMPYU   %fr9R,%fr24L,%fr25      ; Cycle 4
        ADD,DC  %r21,%r4,%r28
        FSTD    %fr29,-72(%sp)
        LDD     -96(%sp),%r3

        XMPYU   %fr9L,%fr24L,%fr26      ; Cycle 5
        ADD,DC  %r20,%r31,%r22
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21

        XMPYU   %fr9L,%fr24R,%fr24      ; Cycle 6
        ADD     %r21,%r3,%r3
        LDD     -56(%sp),%r20
        STD     %r1,UN_SIXTEEN(%r23)

        ADD,DC  %r20,%r19,%r19          ; Cycle 7
        SHRPD   %r3,%r0,32,%r21
        LDD     -88(%sp),%r4
        LDD     -48(%sp),%r1

        ADD,DC  %r0,%r0,%r20            ; Cycle 8
        SHRPD   %r19,%r3,32,%r3
        FLDD    EIGHT(%r24),%fr28
        LDD     -104(%sp),%r31

        SHRPD   %r20,%r19,32,%r20       ; Cycle 9
        ADD     %r21,%r1,%r1
        STD     %r28,UN_EIGHT(%r23)
        LDD     -72(%sp),%r29

        XMPYU   %fr9L,%fr28R,%fr31      ; Cycle 10
        ADD,DC  %r3,%r4,%r4
        FSTD    %fr24,-96(%sp)

        XMPYU   %fr9R,%fr28L,%fr30      ; Cycle 11
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr25,-80(%sp)
        LDD     0(%r23),%r3

        LDO     SIXTEEN(%r24),%r24      ; Cycle 12
        FSTD    %fr31,-64(%sp)

        XMPYU   %fr9R,%fr28R,%fr29      ; Cycle 13
        ADD     %r22,%r1,%r1
        ADDIB,> -2,%r26,$LOOP           ; actually happens in cycle 12
        FSTD    %fr27,-48(%sp)

$ENDLOOP

; Shutdown code, first stage.

;       MFCTL   %cr16,%r21              ; for timing
;       STD     %r21,UN_SIXTEEN(%r23)
;       LDD     -112(%sp),%r21
;       STD     %r21,UN_EIGHT(%r23)

        XMPYU   %fr9L,%fr28L,%fr28      ; Cycle 1
        ADD,DC  %r29,%r4,%r4
        CMPIB,= 0,%r26,$ONEMORE
        FSTD    %fr30,-56(%sp)

        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)

        ADD     %r3,%r1,%r1             ; Cycle 3
        FSTD    %fr28,-104(%sp)
        LDD     UN_EIGHT(%r23),%r21

        ADD,DC  %r21,%r4,%r28           ; Cycle 4
        FSTD    %fr29,-72(%sp)
        STD     %r28,UN_EIGHT(%r23)     ; moved up from cycle 9
        LDD     -96(%sp),%r3

        ADD,DC  %r20,%r31,%r22          ; Cycle 5
        STD     %r1,UN_SIXTEEN(%r23)
$JOIN4
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21

        ADD     %r21,%r3,%r3            ; Cycle 6
        LDD     -56(%sp),%r20

        ADD,DC  %r20,%r19,%r19          ; Cycle 7
        SHRPD   %r3,%r0,32,%r21
        LDD     -88(%sp),%r4
        LDD     -48(%sp),%r1

        ADD,DC  %r0,%r0,%r20            ; Cycle 8
        SHRPD   %r19,%r3,32,%r3
        LDD     -104(%sp),%r31

        SHRPD   %r20,%r19,32,%r20       ; Cycle 9
        ADD     %r21,%r1,%r1
        LDD     -72(%sp),%r29

        ADD,DC  %r3,%r4,%r4             ; Cycle 10

        ADD,DC  %r0,%r20,%r20           ; Cycle 11
        LDD     0(%r23),%r3

        ADD     %r22,%r1,%r1            ; Cycle 13

; Shutdown code, second stage.

        ADD,DC  %r29,%r4,%r4            ; Cycle 1

        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        ADD,DC  %r0,%r20,%r20

        LDD     UN_EIGHT(%r23),%r21     ; Cycle 3
        ADD     %r3,%r1,%r1

        ADD,DC  %r21,%r4,%r28           ; Cycle 4

        ADD,DC  %r20,%r31,%r22          ; Cycle 5

        STD     %r1,UN_SIXTEEN(%r23)    ; Cycle 6

        STD     %r28,UN_EIGHT(%r23)     ; Cycle 9

        LDD     0(%r23),%r3             ; Cycle 11

; Shutdown code, third stage.

        LDO     SIXTEEN(%r23),%r23
        ADD     %r3,%r22,%r1
$JOIN1  ADD,DC  %r0,%r0,%r21            ; r21 = carry out of the last add
        CMPIB,*= 0,%r21,$L0             ; if no overflow, exit
        STD     %r1,UN_SIXTEEN(%r23)

; Final carry propagation

$FINAL1 LDO     EIGHT(%r23),%r23
        LDD     UN_SIXTEEN(%r23),%r21
        ADDI    1,%r21,%r21
        CMPIB,*= 0,%r21,$FINAL1         ; Keep looping if there is a carry.
        STD     %r21,UN_SIXTEEN(%r23)
        B       $L0
        NOP

; Here is the code that handles the difficult cases N=1, N=2, and N=3.
; We do the usual trick -- branch out of the startup code at appropriate
; points, and branch into the shutdown code.

$N_IS_SMALL
        CMPIB,= 0,%r26,$N_IS_ONE
        FSTD    %fr24,-96(%sp)          ; Cycle 10
        FLDD    EIGHT(%r24),%fr28       ; Cycle 8
        XMPYU   %fr9L,%fr28R,%fr31      ; Cycle 10
        XMPYU   %fr9R,%fr28L,%fr30      ; Cycle 11
        FSTD    %fr25,-80(%sp)
        FSTD    %fr31,-64(%sp)          ; Cycle 12
        XMPYU   %fr9R,%fr28R,%fr29      ; Cycle 13
        FSTD    %fr27,-48(%sp)
        XMPYU   %fr9L,%fr28L,%fr28      ; Cycle 1
        CMPIB,= 2,%r26,$N_IS_THREE
        FSTD    %fr30,-56(%sp)

; N = 2
        FSTD    %fr26,-88(%sp)          ; Cycle 2
        FSTD    %fr28,-104(%sp)         ; Cycle 3
        LDD     -96(%sp),%r3            ; Cycle 4
        FSTD    %fr29,-72(%sp)
        B       $JOIN4
        ADD     %r0,%r0,%r22

$N_IS_THREE
        FLDD    SIXTEEN(%r24),%fr24
        FSTD    %fr26,-88(%sp)          ; Cycle 2
        XMPYU   %fr9R,%fr24R,%fr27      ; Cycle 3
        FSTD    %fr28,-104(%sp)
        XMPYU   %fr9R,%fr24L,%fr25      ; Cycle 4
        LDD     -96(%sp),%r3
        FSTD    %fr29,-72(%sp)
        XMPYU   %fr9L,%fr24L,%fr26      ; Cycle 5
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21
        B       $JOIN3
        ADD     %r0,%r0,%r22

$N_IS_ONE
        FSTD    %fr25,-80(%sp)
        FSTD    %fr27,-48(%sp)
        FSTD    %fr26,-88(%sp)          ; Cycle 2
        B       $JOIN5
        ADD     %r0,%r0,%r22

; We came out of the unrolled loop with wrong parity.  Do one more
; single cycle.  This is quite tricky, because of the way the
; carry chains and SHRPD chains have been chopped up.

$ONEMORE

        FLDD    0(%r24),%fr24

        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)

        XMPYU   %fr9R,%fr24R,%fr27      ; Cycle 3
        FSTD    %fr28,-104(%sp)
        LDD     UN_EIGHT(%r23),%r21
        ADD     %r3,%r1,%r1

        XMPYU   %fr9R,%fr24L,%fr25      ; Cycle 4
        ADD,DC  %r21,%r4,%r28
        STD     %r28,UN_EIGHT(%r23)     ; moved from cycle 9
        LDD     -96(%sp),%r3
        FSTD    %fr29,-72(%sp)

        XMPYU   %fr9L,%fr24L,%fr26      ; Cycle 5
        ADD,DC  %r20,%r31,%r22
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21

        STD     %r1,UN_SIXTEEN(%r23)    ; Cycle 6
$JOIN3
        XMPYU   %fr9L,%fr24R,%fr24
        LDD     -56(%sp),%r20
        ADD     %r21,%r3,%r3

        ADD,DC  %r20,%r19,%r19          ; Cycle 7
        LDD     -88(%sp),%r4
        SHRPD   %r3,%r0,32,%r21
        LDD     -48(%sp),%r1

        LDD     -104(%sp),%r31          ; Cycle 8
        ADD,DC  %r0,%r0,%r20
        SHRPD   %r19,%r3,32,%r3

        LDD     -72(%sp),%r29           ; Cycle 9
        SHRPD   %r20,%r19,32,%r20
        ADD     %r21,%r1,%r1

        ADD,DC  %r3,%r4,%r4             ; Cycle 10
        FSTD    %fr24,-96(%sp)

        ADD,DC  %r0,%r20,%r20           ; Cycle 11
        LDD     0(%r23),%r3
        FSTD    %fr25,-80(%sp)

        ADD     %r22,%r1,%r1            ; Cycle 13
        FSTD    %fr27,-48(%sp)

; Shutdown code, stage 1-1/2.

        ADD,DC  %r29,%r4,%r4            ; Cycle 1

        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)

        LDD     UN_EIGHT(%r23),%r21     ; Cycle 3
        ADD     %r3,%r1,%r1

        ADD,DC  %r21,%r4,%r28           ; Cycle 4
        STD     %r28,UN_EIGHT(%r23)     ; moved from cycle 9

        ADD,DC  %r20,%r31,%r22          ; Cycle 5
        STD     %r1,UN_SIXTEEN(%r23)
$JOIN5
        LDD     -96(%sp),%r3            ; moved from cycle 4
        LDD     -80(%sp),%r21
        ADD     %r21,%r3,%r3            ; Cycle 6
        ADD,DC  %r0,%r0,%r19            ; Cycle 7
        LDD     -88(%sp),%r4
        SHRPD   %r3,%r0,32,%r21
        LDD     -48(%sp),%r1
        SHRPD   %r19,%r3,32,%r3         ; Cycle 8
        ADD     %r21,%r1,%r1            ; Cycle 9
        ADD,DC  %r3,%r4,%r4             ; Cycle 10
        LDD     0(%r23),%r3             ; Cycle 11
        ADD     %r22,%r1,%r1            ; Cycle 13

; Shutdown code, stage 2-1/2.

        ADD,DC  %r0,%r4,%r4             ; Cycle 1
        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        LDD     UN_EIGHT(%r23),%r21     ; Cycle 3
        ADD     %r3,%r1,%r1
        STD     %r1,UN_SIXTEEN(%r23)
        ADD,DC  %r21,%r4,%r1
        B       $JOIN1
        LDO     EIGHT(%r23),%r23

; exit

$L0
        LDW     -124(%sp),%r4           ; reload r4 saved on entry
        BVE     (%r2)
        .EXIT
        LDW,MB  -128(%sp),%r3           ; pull sp back by 128, then reload r3

        .PROCEND
|
|
|
|
|
|
|
|
|
|
; ***************************************************************
|
|
|
|
|
;
|
|
|
|
|
; add_diag_[little/big]
|
|
|
|
|
;
|
|
|
|
|
; ***************************************************************
|
|
|
|
|
|
|
|
|
|
; The arguments are as follows:
|
|
|
|
|
; r2 return PC, of course
|
|
|
|
|
; r26 = arg1 = length
|
|
|
|
|
; r25 = arg2 = vector to square
|
|
|
|
|
; r24 = arg3 = result vector
|
|
|
|
|
|
|
|
|
|
; ---------------------------------------------------------------
; add_diag_little / add_diag_big
;
; Adds the square of every doubleword of the input vector into the
; result vector.  For an input doubleword with halves L and R, the
; products R*R, L*R and L*L are formed with XMPYU; the cross product
; is doubled and aligned by the two SHRPD shifts by 31, and the
; resulting 128-bit square is added to two consecutive result
; doublewords.  The result pointer therefore advances two doublewords
; for each input doubleword.  A carry left over at the end is rippled
; through the following result doublewords.
;
; In:     r26 = length N in doublewords (N = 0 returns immediately)
;         r25 = vector to square, address of its least significant
;               doubleword
;         r24 = result vector, address of its least significant
;               doubleword
;         r2  = return address
; Writes: r1, r19-r22, r28, r29, r31, fr7, fr24, fr27-fr31, and the
;         argument registers r24, r25, r26 (r26 doubles as scratch in
;         the final carry loops).
;         r3 and r4 are stored on entry and reloaded at $Z0.
; Frame:  sp is advanced by 128 bytes.  r3 lives at -128(sp) and r4 at
;         -124(sp); the slots from -104(sp) to -64(sp) hold partial
;         products.
;
; The loop is unrolled twice and handles two input doublewords per
; pass.  The instruction order is a hand-made schedule; keep it intact
; when editing.
;
; NOTE(review): r3 and r4 are saved and restored with 32-bit STW/LDW,
; while the body writes 64-bit values into them.  Confirm that this is
; sufficient for the 2.0W build.
; ---------------------------------------------------------------
#ifdef LITTLE_WORDIAN
add_diag_little
#else
add_diag_big
#endif
        .PROC
        .CALLINFO FRAME=120,ENTRY_GR=4
        .ENTRY
        STW,MA  %r3,128(%sp)            ; save r3, then advance sp by 128
        STW     %r4,-124(%sp)           ; save r4 just above the r3 slot

        ADDIB,< -1,%r26,$Z0             ; If N=0, exit immediately.
        NOP

; Startup code

        FLDD    0(%r25),%fr7            ; Cycle 2 (alternate body)
        XMPYU   %fr7R,%fr7R,%fr29       ; Cycle 4
        XMPYU   %fr7L,%fr7R,%fr27       ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr30
        LDO     SIXTEEN(%r25),%r25      ; Cycle 6
        FSTD    %fr29,-88(%sp)
        FSTD    %fr27,-72(%sp)          ; Cycle 7
        CMPIB,= 0,%r26,$DIAG_N_IS_ONE   ; Cycle 1 (main body)
        FSTD    %fr30,-96(%sp)
        FLDD    UN_EIGHT(%r25),%fr7     ; Cycle 2
        LDD     -88(%sp),%r22           ; Cycle 3
        LDD     -72(%sp),%r31           ; Cycle 4
        XMPYU   %fr7R,%fr7R,%fr28
        XMPYU   %fr7L,%fr7R,%fr24       ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr31
        LDD     -96(%sp),%r20           ; Cycle 6
        FSTD    %fr28,-80(%sp)
        ADD     %r0,%r0,%r0             ; clear the carry bit
        ADDIB,<= -2,%r26,$ENDDIAGLOOP   ; Cycle 7
        FSTD    %fr24,-64(%sp)

; Here is the loop.  It is unrolled twice, modelled after the "alternate body" and then the "main body".

$DIAGLOOP
        SHRPD   %r31,%r0,31,%r3         ; Cycle 1 (alternate body)
        LDO     SIXTEEN(%r25),%r25
        LDD     0(%r24),%r1
        FSTD    %fr31,-104(%sp)
        SHRPD   %r0,%r31,31,%r4         ; Cycle 2
        ADD,DC  %r22,%r3,%r3
        FLDD    UN_SIXTEEN(%r25),%fr7
        ADD,DC  %r0,%r20,%r20           ; Cycle 3
        ADD     %r1,%r3,%r3
        XMPYU   %fr7R,%fr7R,%fr29       ; Cycle 4
        LDD     -80(%sp),%r21
        STD     %r3,0(%r24)
        XMPYU   %fr7L,%fr7R,%fr27       ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr30
        LDD     -64(%sp),%r29
        LDD     EIGHT(%r24),%r1
        ADD,DC  %r4,%r20,%r20           ; Cycle 6
        LDD     -104(%sp),%r19
        FSTD    %fr29,-88(%sp)
        ADD     %r20,%r1,%r1            ; Cycle 7
        FSTD    %fr27,-72(%sp)
        SHRPD   %r29,%r0,31,%r4         ; Cycle 1 (main body)
        LDO     THIRTY_TWO(%r24),%r24
        LDD     UN_SIXTEEN(%r24),%r28
        FSTD    %fr30,-96(%sp)
        SHRPD   %r0,%r29,31,%r3         ; Cycle 2
        ADD,DC  %r21,%r4,%r4
        FLDD    UN_EIGHT(%r25),%fr7
        STD     %r1,UN_TWENTY_FOUR(%r24)
        ADD,DC  %r0,%r19,%r19           ; Cycle 3
        ADD     %r28,%r4,%r4
        XMPYU   %fr7R,%fr7R,%fr28       ; Cycle 4
        LDD     -88(%sp),%r22
        STD     %r4,UN_SIXTEEN(%r24)
        XMPYU   %fr7L,%fr7R,%fr24       ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr31
        LDD     -72(%sp),%r31
        LDD     UN_EIGHT(%r24),%r28
        ADD,DC  %r3,%r19,%r19           ; Cycle 6
        LDD     -96(%sp),%r20
        FSTD    %fr28,-80(%sp)
        ADD     %r19,%r28,%r28          ; Cycle 7
        FSTD    %fr24,-64(%sp)
        ADDIB,> -2,%r26,$DIAGLOOP       ; Cycle 8
        STD     %r28,UN_EIGHT(%r24)

$ENDDIAGLOOP

        ADD,DC  %r0,%r22,%r22
        CMPIB,= 0,%r26,$ONEMOREDIAG
        SHRPD   %r31,%r0,31,%r3

; Shutdown code, first stage.

        FSTD    %fr31,-104(%sp)         ; Cycle 1 (alternate body)
        LDD     0(%r24),%r28
        SHRPD   %r0,%r31,31,%r4         ; Cycle 2
        ADD     %r3,%r22,%r3
        ADD,DC  %r0,%r20,%r20           ; Cycle 3
        LDD     -80(%sp),%r21
        ADD     %r3,%r28,%r3
        LDD     -64(%sp),%r29           ; Cycle 4
        STD     %r3,0(%r24)
        LDD     EIGHT(%r24),%r1         ; Cycle 5
        LDO     SIXTEEN(%r25),%r25      ; Cycle 6
        LDD     -104(%sp),%r19
        ADD,DC  %r4,%r20,%r20
        ADD     %r20,%r1,%r1            ; Cycle 7
        ADD,DC  %r0,%r21,%r21           ; Cycle 8
        STD     %r1,EIGHT(%r24)

; Shutdown code, second stage.

        SHRPD   %r29,%r0,31,%r4         ; Cycle 1 (main body)
        LDO     THIRTY_TWO(%r24),%r24
        LDD     UN_SIXTEEN(%r24),%r1
        SHRPD   %r0,%r29,31,%r3         ; Cycle 2
        ADD     %r4,%r21,%r4
        ADD,DC  %r0,%r19,%r19           ; Cycle 3
        ADD     %r4,%r1,%r4
        STD     %r4,UN_SIXTEEN(%r24)    ; Cycle 4
        LDD     UN_EIGHT(%r24),%r28     ; Cycle 5
        ADD,DC  %r3,%r19,%r19           ; Cycle 6
        ADD     %r19,%r28,%r28          ; Cycle 7
        ADD,DC  %r0,%r0,%r22            ; Cycle 8
        CMPIB,*= 0,%r22,$Z0             ; if no overflow, exit
        STD     %r28,UN_EIGHT(%r24)

; Final carry propagation

$FDIAG2
        LDO     EIGHT(%r24),%r24
        LDD     UN_EIGHT(%r24),%r26
        ADDI    1,%r26,%r26
        CMPIB,*= 0,%r26,$FDIAG2         ; Keep looping if there is a carry.
        STD     %r26,UN_EIGHT(%r24)

        B       $Z0
        NOP

; Here is the code that handles the difficult case N=1.
; We do the usual trick -- branch out of the startup code at appropriate
; points, and branch into the shutdown code.

$DIAG_N_IS_ONE

        LDD     -88(%sp),%r22
        LDD     -72(%sp),%r31
        B       $JOINDIAG
        LDD     -96(%sp),%r20

; We came out of the unrolled loop with wrong parity.  Do one more
; single cycle.  This is the "alternate body".  It will, of course,
; give us opposite registers from the other case, so we need
; completely different shutdown code.

$ONEMOREDIAG
        FSTD    %fr31,-104(%sp)         ; Cycle 1 (alternate body)
        LDD     0(%r24),%r28
        FLDD    0(%r25),%fr7            ; Cycle 2
        SHRPD   %r0,%r31,31,%r4
        ADD     %r3,%r22,%r3
        ADD,DC  %r0,%r20,%r20           ; Cycle 3
        LDD     -80(%sp),%r21
        ADD     %r3,%r28,%r3
        LDD     -64(%sp),%r29           ; Cycle 4
        STD     %r3,0(%r24)
        XMPYU   %fr7R,%fr7R,%fr29
        LDD     EIGHT(%r24),%r1         ; Cycle 5
        XMPYU   %fr7L,%fr7R,%fr27
        XMPYU   %fr7L,%fr7L,%fr30
        LDD     -104(%sp),%r19          ; Cycle 6
        FSTD    %fr29,-88(%sp)
        ADD,DC  %r4,%r20,%r20
        FSTD    %fr27,-72(%sp)          ; Cycle 7
        ADD     %r20,%r1,%r1
        ADD,DC  %r0,%r21,%r21           ; Cycle 8
        STD     %r1,EIGHT(%r24)

; Shutdown code, first stage.

        SHRPD   %r29,%r0,31,%r4         ; Cycle 1 (main body)
        LDO     THIRTY_TWO(%r24),%r24
        FSTD    %fr30,-96(%sp)
        LDD     UN_SIXTEEN(%r24),%r1
        SHRPD   %r0,%r29,31,%r3         ; Cycle 2
        ADD     %r4,%r21,%r4
        ADD,DC  %r0,%r19,%r19           ; Cycle 3
        LDD     -88(%sp),%r22
        ADD     %r4,%r1,%r4
        LDD     -72(%sp),%r31           ; Cycle 4
        STD     %r4,UN_SIXTEEN(%r24)
        LDD     UN_EIGHT(%r24),%r28     ; Cycle 5
        LDD     -96(%sp),%r20           ; Cycle 6
        ADD,DC  %r3,%r19,%r19
        ADD     %r19,%r28,%r28          ; Cycle 7
        ADD,DC  %r0,%r22,%r22           ; Cycle 8
        STD     %r28,UN_EIGHT(%r24)

; Shutdown code, second stage.

$JOINDIAG
        SHRPD   %r31,%r0,31,%r3         ; Cycle 1 (alternate body)
        LDD     0(%r24),%r28
        SHRPD   %r0,%r31,31,%r4         ; Cycle 2
        ADD     %r3,%r22,%r3
        ADD,DC  %r0,%r20,%r20           ; Cycle 3
        ADD     %r3,%r28,%r3
        STD     %r3,0(%r24)             ; Cycle 4
        LDD     EIGHT(%r24),%r1         ; Cycle 5
        ADD,DC  %r4,%r20,%r20
        ADD     %r20,%r1,%r1            ; Cycle 7
        ADD,DC  %r0,%r0,%r21            ; Cycle 8
        CMPIB,*= 0,%r21,$Z0             ; if no overflow, exit
        STD     %r1,EIGHT(%r24)

; Final carry propagation

$FDIAG1
        LDO     EIGHT(%r24),%r24
        LDD     EIGHT(%r24),%r26
        ADDI    1,%r26,%r26
        CMPIB,*= 0,%r26,$FDIAG1         ; Keep looping if there is a carry.
        STD     %r26,EIGHT(%r24)

$Z0
        LDW     -124(%sp),%r4           ; reload r4 saved on entry
        BVE     (%r2)
        .EXIT
        LDW,MB  -128(%sp),%r3           ; pull sp back by 128, then reload r3
        .PROCEND
|
|
|
|
|
; .ALLOW
|
|
|
|
|
|
|
|
|
|
.SPACE $TEXT$
|
|
|
|
|
.SUBSPA $CODE$
|
|
|
|
|
; Export the two entry points for the selected word order.
; GNU-as (as of 2.19) does not support LONG_RETURN, so that attribute is
; left off whenever the file is assembled through a GNU toolchain.  The
; same rule is applied to both word orders.
#ifdef LITTLE_WORDIAN
#ifdef __GNUC__
        .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
        .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
#else
        .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
        .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
#endif
#else
#ifdef __GNUC__
        .EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
        .EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
#else
        .EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
        .EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
#endif
#endif
|
|
|
|
|
.END
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
; (this file exports them as "maxpy_little" and "maxpy_big")
|
|
|
|
|
;
|
|
|
|
|
; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
|
|
|
|
|
; performs a 64-bit x any-size multiply, and adds the
|
|
|
|
|
; result to an area of memory. That is, it performs
|
|
|
|
|
; something like
|
|
|
|
|
;
|
|
|
|
|
; A B C D
|
|
|
|
|
; * Z
|
|
|
|
|
; __________
|
|
|
|
|
; P Q R S T
|
|
|
|
|
;
|
|
|
|
|
; and then adds the "PQRST" vector into an area of memory,
|
|
|
|
|
; handling all carries.
|
|
|
|
|
;
|
|
|
|
|
; Digression on nomenclature and endian-ness:
|
|
|
|
|
;
|
|
|
|
|
; Each of the capital letters in the above represents a 64-bit
|
|
|
|
|
; quantity. That is, you could think of the discussion as
|
|
|
|
|
; being in terms of radix-16-quintillion arithmetic. The data
|
|
|
|
|
; type being manipulated is "unsigned long long int". This
|
|
|
|
|
; requires the 64-bit extension of the HP-UX C compiler,
|
|
|
|
|
; available at release 10. You need these compiler flags to
|
|
|
|
|
; enable these extensions:
|
|
|
|
|
;
|
|
|
|
|
; -Aa +e +DA2.0 +DS2.0
|
|
|
|
|
;
|
|
|
|
|
; (The first specifies ANSI C, the second enables the
|
|
|
|
|
; extensions, which are beyond ANSI C, and the third and
|
|
|
|
|
; fourth tell the compiler to use whatever features of the
|
|
|
|
|
; PA2.0 architecture it wishes, in order to make the code more
|
|
|
|
|
; efficient. Since the presence of the assembly code will
|
|
|
|
|
; make the program unable to run on anything less than PA2.0,
|
|
|
|
|
; you might as well gain the performance enhancements in the C
|
|
|
|
|
; code as well.)
|
|
|
|
|
;
|
|
|
|
|
; Questions of "endian-ness" often come up, usually in the
|
|
|
|
|
; context of byte ordering in a word. These routines have a
|
|
|
|
|
; similar issue, that could be called "wordian-ness".
|
|
|
|
|
; Independent of byte ordering (PA is always big-endian), one
|
|
|
|
|
; can make two choices when representing extremely large
|
|
|
|
|
; numbers as arrays of 64-bit doublewords in memory.
|
|
|
|
|
;
|
|
|
|
|
; "Little-wordian" layout means that the least significant
|
|
|
|
|
; word of a number is stored at the lowest address.
|
|
|
|
|
;
|
|
|
|
|
; MSW LSW
|
|
|
|
|
; | |
|
|
|
|
|
; V V
|
|
|
|
|
;
|
|
|
|
|
; A B C D E
|
|
|
|
|
;
|
|
|
|
|
; ^ ^ ^
|
|
|
|
|
; | | |____ address 0
|
|
|
|
|
; | |
|
|
|
|
|
; | |_______address 8
|
|
|
|
|
; |
|
|
|
|
|
; address 32
|
|
|
|
|
;
|
|
|
|
|
; "Big-wordian" means that the most significant word is at the
|
|
|
|
|
; lowest address.
|
|
|
|
|
;
|
|
|
|
|
; MSW LSW
|
|
|
|
|
; | |
|
|
|
|
|
; V V
|
|
|
|
|
;
|
|
|
|
|
; A B C D E
|
|
|
|
|
;
|
|
|
|
|
; ^ ^ ^
|
|
|
|
|
; | | |____ address 32
|
|
|
|
|
; | |
|
|
|
|
|
; | |_______address 24
|
|
|
|
|
; |
|
|
|
|
|
; address 0
|
|
|
|
|
;
|
|
|
|
|
; When you compile the file, you must specify one or the other, with
|
|
|
|
|
; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
|
|
|
|
|
;
|
|
|
|
|
; Incidentally, you assemble this file as part of your
|
|
|
|
|
; project with the same C compiler as the rest of the program.
|
|
|
|
|
; My "makefile" for a superprecision arithmetic package has
|
|
|
|
|
; the following stuff:
|
|
|
|
|
;
|
|
|
|
|
; # definitions:
|
|
|
|
|
; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
|
|
|
|
|
; CFLAGS = +O3
|
|
|
|
|
; LDFLAGS = -L /usr/lib -Wl,-aarchive
|
|
|
|
|
;
|
|
|
|
|
; # general build rule for ".s" files:
|
|
|
|
|
; .s.o:
|
|
|
|
|
; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
|
|
|
|
|
;
|
|
|
|
|
; # Now any bind step that calls for pa20.o will assemble pa20.s
|
|
|
|
|
;
|
|
|
|
|
; End of digression, back to arithmetic:
|
|
|
|
|
;
|
|
|
|
|
; The way we multiply two huge numbers is, of course, to multiply
|
|
|
|
|
; the "ABCD" vector by each of the "WXYZ" doublewords, adding
|
|
|
|
|
; the result vectors with increasing offsets, the way we learned
|
|
|
|
|
; in school, back before we all used calculators:
|
|
|
|
|
;
|
|
|
|
|
; A B C D
|
|
|
|
|
; * W X Y Z
|
|
|
|
|
; __________
|
|
|
|
|
; P Q R S T
|
|
|
|
|
; E F G H I
|
|
|
|
|
; M N O P Q
|
|
|
|
|
; + R S T U V
|
|
|
|
|
; _______________
|
|
|
|
|
; F I N A L S U M
|
|
|
|
|
;
|
|
|
|
|
; So we call maxpy_PA20_big (in my case; my package is
|
|
|
|
|
; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
|
|
|
|
|
; in turn as the "scalar", and giving the "ABCD" vector each
|
|
|
|
|
; time. We direct it to add its result into an area of memory
|
|
|
|
|
; that we have cleared at the start. We skew the exact
|
|
|
|
|
; location into that area with each call.
|
|
|
|
|
;
|
|
|
|
|
; The prototype for the function is
|
|
|
|
|
;
|
|
|
|
|
; extern void maxpy_PA20_big(
|
|
|
|
|
; int length, /* Number of doublewords in the multiplicand vector. */
|
|
|
|
|
; const long long int *scalaraddr, /* Address to fetch the scalar. */
|
|
|
|
|
; const long long int *multiplicand, /* The multiplicand vector. */
|
|
|
|
|
; long long int *result); /* Where to accumulate the result. */
|
|
|
|
|
;
|
|
|
|
|
; (You should place a copy of this prototype in an include file
|
|
|
|
|
; or in your C file.)
|
|
|
|
|
;
|
|
|
|
|
; Now, IN ALL CASES, the given address for the multiplicand or
|
|
|
|
|
; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
|
|
|
|
|
; That word is, of course, the word at which the routine
|
|
|
|
|
; starts processing. "maxpy_PA20_little" then increases the
|
|
|
|
|
; addresses as it computes. "maxpy_PA20_big" decreases them.
|
|
|
|
|
;
|
|
|
|
|
; In our example above, "length" would be 4 in each case.
|
|
|
|
|
; "multiplicand" would be the "ABCD" vector. Specifically,
|
|
|
|
|
; the address of the element "D". "scalaraddr" would be the
|
|
|
|
|
; address of "W", "X", "Y", or "Z" on the four calls that we
|
|
|
|
|
; would make. (The order doesn't matter, of course.)
|
|
|
|
|
; "result" would be the appropriate address in the result
|
|
|
|
|
; area. When multiplying by "Z", that would be the least
|
|
|
|
|
; significant word. When multiplying by "Y", it would be the
|
|
|
|
|
; next higher word (8 bytes higher if little-wordian; 8 bytes
|
|
|
|
|
; lower if big-wordian), and so on. The size of the result
|
|
|
|
|
; area must be the sum of the sizes of the multiplicand
|
|
|
|
|
; and multiplier vectors, and must be initialized to zero
|
|
|
|
|
; before we start.
|
|
|
|
|
;
|
|
|
|
|
; Whenever the routine adds its partial product into the result
|
|
|
|
|
; vector, it follows carry chains as far as they need to go.
|
|
|
|
|
;
|
|
|
|
|
; Here is the super-precision multiply routine that I use for
|
|
|
|
|
; my package. The package is big-wordian. I have taken out
|
|
|
|
|
; handling of exponents (it's a floating point package):
|
|
|
|
|
;
|
|
|
|
|
; static void mul_PA20(
|
|
|
|
|
; int size,
|
|
|
|
|
; const long long int *arg1,
|
|
|
|
|
; const long long int *arg2,
|
|
|
|
|
; long long int *result)
|
|
|
|
|
; {
|
|
|
|
|
; int i;
|
|
|
|
|
;
|
|
|
|
|
; for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
|
|
|
|
|
;
|
|
|
|
|
; for (i=0 ; i<size ; i++) {
|
|
|
|
|
; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
|
|
|
|
|
; }
|
|
|
|
|
; }
|