diff options
author | YamaArashi <shadow962@live.com> | 2016-01-06 01:47:28 -0800 |
---|---|---|
committer | YamaArashi <shadow962@live.com> | 2016-01-06 01:47:28 -0800 |
commit | be8b04496302184c6e8f04d6179f9c3afc50aeb6 (patch) | |
tree | 726e2468c0c07add773c0dbd86ab6386844259ae /gcc/config/c4x/libgcc.S |
initial commit
Diffstat (limited to 'gcc/config/c4x/libgcc.S')
-rwxr-xr-x | gcc/config/c4x/libgcc.S | 1501 |
1 files changed, 1501 insertions, 0 deletions
diff --git a/gcc/config/c4x/libgcc.S b/gcc/config/c4x/libgcc.S
new file mode 100755
index 0000000..fb79cf8
--- /dev/null
+++ b/gcc/config/c4x/libgcc.S
@@ -0,0 +1,1501 @@
+/* libgcc1 routines for the Texas Instruments TMS320C[34]x
+   Copyright (C) 1997,98 Free Software Foundation, Inc.
+
+   Contributed by Michael Hayes (m.hayes@elec.canterbury.cri.nz)
+              and Herman Ten Brugge (Haj.Ten.Brugge@net.HCC.nl).
+
+
+This file is part of GNU CC.
+
+GNU CC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file with other programs, and to distribute
+those programs without any restriction coming from the use of this
+file. (The General Public License restrictions do apply in other
+respects; for example, they cover modification of the file, and
+distribution when not linked into another program.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING. If not, write to
+the Free Software Foundation, 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA. */
+
+/* As a special exception, if you link this library with files
+   compiled with GCC to produce an executable, this does not cause
+   the resulting executable to be covered by the GNU General Public License.
+   This exception does not however invalidate any other reasons why
+   the executable file might be covered by the GNU General Public License. */
+
+
+; These routines are called using the standard TI register argument
+; passing model.
+; The following registers do not have to be saved:
+; r0, r1, r2, r3, ar0, ar1, ar2, ir0, ir1, bk, rs, rc, re, (r9, r10, r11)
+;
+; Perform floating point divqf3
+;
+; This routine performs a reciprocal of the divisor using the method
+; described in the C30/C40 user manuals. It then multiplies that
+; result by the dividend.
+;
+; Let r be the reciprocal of the divisor v and let the ith estimate
+; of r be denoted by r[i]. An iterative approach can be used to
+; improve the estimate of r, given an initial estimate r[0], where
+;
+;   r[i + 1] = r[i] * (2.0 - v * r[i])
+;
+; The normalised error e[i] at the ith iteration is
+;
+;   e[i] = (r - r[i]) / r = (1 / v - r[i]) * v = (1 - v * r[i])
+;
+; Note that
+;
+;   e[i + 1] = (1 - v * r[i + 1]) = 1 - 2 * v * r[i] + v^2 * (r[i])^2
+;            = (1 - v * r[i])^2 = (e[i])^2
+
+; r2 dividend, r3 divisor, r0 quotient
+; clobbers r1, ar1
+#ifdef L_divqf3
+	.text
+	.global ___divqf3
+___divqf3:
+
+#ifdef _TMS320C4x
+	.if .REGPARM == 0
+	lda	sp,ar0
+	ldf	*-ar0(2), r3
+	.endif
+
+	pop	ar1	; Pop return address
+
+; r0 = estimate of r, r1 = tmp, r2 = dividend, r3 = divisor
+	rcpf	r3, r0	; Compute initial estimate r[0]
+
+	mpyf3	r0, r3, r1	; r1 = r[0] * v
+	subrf	2.0, r1	; r1 = 2.0 - r[0] * v
+	mpyf	r1, r0	; r0 = r[0] * (2.0 - r[0] * v) = r[1]
+; End of 1st iteration (16 bits accuracy)
+
+	mpyf3	r0, r3, r1	; r1 = r[1] * v
+	subrf	2.0, r1	; r1 = 2.0 - r[1] * v
+
+	bud	ar1	; Delayed branch
+	mpyf	r1, r0	; r0 = r[1] * (2.0 - r[1] * v) = r[2]
+; End of 2nd iteration (32 bits accuracy)
+	.if .REGPARM == 0
+	mpyf	*-ar0(1), r0	; Multiply by the dividend
+	.else
+	mpyf	r2, r0	; Multiply by the dividend
+	.endif
+	rnd	r0
+	; Branch occurs here
+#else
+	.if .REGPARM == 0
+	ldiu	sp,ar0
+	ldf	*-ar0(2), r3
+	.endif
+
+	pop	ar1	; Pop return address
+
+; Initial estimate r[0] = 1.0 * 2^(-e - 1)
+; where v = m * 2^e
+
+; r0 = estimate of r, r1 = tmp, r2 = dividend, r3 = divisor
+
+; Calculate initial estimate r[0]
+	pushf	r3
+	pop	r0
+	not	r0	; r0 = -e
+	; complement exponent = -e -1
+	; complement sign (side effect)
+	; complement mantissa (almost 3 bit accurate)
+	push	r0
+	popf	r0	; r0 = 1.0 * 2^(-e - 1) + inverted mantissa
+	ldf	-1.0, r1	; undo complement sign bit
+	xor	r1, r0
+
+	mpyf3	r0, r3, r1	; r1 = r[0] * v
+	subrf	2.0, r1	; r1 = 2.0 - r[0] * v
+	mpyf	r1, r0	; r0 = r[0] * (2.0 - r[0] * v) = r[1]
+; End of 1st iteration
+
+	mpyf3	r0, r3, r1	; r1 = r[1] * v
+	subrf	2.0, r1	; r1 = 2.0 - r[1] * v
+	mpyf	r1, r0	; r0 = r[1] * (2.0 - r[1] * v) = r[2]
+; End of 2nd iteration
+
+	mpyf3	r0, r3, r1	; r1 = r[2] * v
+	subrf	2.0, r1	; r1 = 2.0 - r[2] * v
+	mpyf	r1, r0	; r0 = r[2] * (2.0 - r[2] * v) = r[3]
+; End of 3rd iteration
+
+	or	080h, r0	; add 1 lsb to result. needed when complementing
+	; 1.0 / 2.0
+	rnd	r0
+
+; Use modified last iteration
+; r[4] = (r[3] * (1.0 - (v * r[3]))) + r[3]
+	mpyf3	r0, r3, r1	; r1 = r[3] * v
+	subrf	1.0, r1	; r1 = 1.0 - r[3] * v
+	mpyf	r0, r1	; r1 = r[3] * (1.0 - r[3] * v)
+
+	bud	ar1	; Delayed branch
+	addf	r1, r0	; r0 = r[3] * (1.0 - r[3] * v) + r[3] = r[4]
+	.if .REGPARM == 0
+	mpyf	*-ar0(1), r0	; Multiply by the dividend
+	.else
+	mpyf	r2, r0	; Multiply by the dividend
+	.endif
+	rnd	r0
+	; Branch occurs here
+#endif
+
+#endif
+;
+; Integer signed division
+;
+; ar2 dividend, r2 divisor, r0 quotient
+; clobbers r1, r3, ar0, ar1, ir0, ir1, rc, rs, re
+#ifdef L_divqi3
+	.text
+	.global ___divqi3
+	.ref udivqi3n
+___divqi3:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldi	*-ar0(1), ar2
+	ldi	*-ar0(2), r2
+	.endif
+
+	xor3	ar2, r2, r3	; Get the sign
+	absi	ar2, r0
+	bvd	divq32
+	ldi	r0, ar2
+	absi	r2, r2
+	cmpi	ar2, r2	; Divisor > dividend?
+
+	pop	ir1
+	bhid	zero	; If so, return 0
+
+;
+; Normalize operands. Use difference exponents as shift count
+; for divisor, and as repeat count for "subc"
+;
+	float	ar2, r1	; Normalize dividend
+	pushf	r1	; Get as integer
+	pop	ar0
+	lsh	-24, ar0	; Get exponent
+
+	float	r2, r1	; Normalize divisor
+	pushf	r1	; Get as integer
+	pop	ir0
+	lsh	-24, ir0	; Get exponent
+
+	subi	ir0, ar0	; Get difference of exponents
+	lsh	ar0, r2	; Align divisor with dividend
+
+;
+; Do count + 1 subtracts and shifts
+;
+	rpts	ar0
+	subc	r2, ar2
+
+;
+; Mask off the lower count+1 bits of ar2
+;
+	subri	31, ar0	; Shift count is (32 - (ar0 + 1))
+	lsh	ar0, ar2	; Shift left
+	negi	ar0, ar0
+	lsh3	ar0, ar2, r0	; Shift right and put result in r0
+
+;
+; Check sign and negate result if necessary
+;
+	bud	ir1	; Delayed return
+	negi	r0, r1	; Negate result
+	ash	-31, r3	; Check sign
+	ldinz	r1, r0	; If set, use negative result
+	; Branch occurs here
+
+zero:	bud	ir1	; Delayed branch
+	ldi	0, r0
+	nop
+	nop
+	; Branch occurs here
+;
+; special case where ar2 = abs(ar2) = 0x80000000. We handle this by
+; calling unsigned divide and negating the result if necessary.
+;
+divq32:
+	push	r3	; Save sign
+	call	udivqi3n
+	pop	r3
+	pop	ir1
+	bd	ir1
+	negi	r0, r1	; Negate result
+	ash	-31, r3	; Check sign
+	ldinz	r1, r0	; If set, use negative result
+	; Branch occurs here
+#endif
+;
+; Integer unsigned division
+; ar2 dividend, r2 divisor, r0 quotient,
+; clobbers r1, r3, ar0, ar1, ir0, ir1, rc, rs, re
+#ifdef L_udivqi3
+	.text
+	.global ___udivqi3
+	.global udivqi3n
+___udivqi3:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldi	*-ar0(1), ar2
+	ldi	*-ar0(2), r2
+	.endif
+
+udivqi3n:
+	pop	ir1
+
+	cmpi	ar2, r2	; If divisor > dividend
+	bhi	qzero	; return zero
+	ldi	r2, ar1	; Store divisor in ar1
+
+	tstb	ar2, ar2	; Check top bit, jump if set to special handler
+	bld	div_32	; Delayed branch
+
+;
+; Get divisor exponent
+;
+	float	ar1, r1	; Normalize the divisor
+	pushf	r1	; Get into int register
+	pop	rc
+	; branch occurs here
+
+	bzd	qzero	; if (float) divisor zero, return zero
+
+	float	ar2, r1	; Normalize the dividend
+	pushf	r1	; Get into int register
+	pop	ar0
+	lsh	-24, ar0	; Get both the exponents
+	lsh	-24, rc
+
+	subi	rc, ar0	; Get the difference between the exponents
+	lsh	ar0, ar1	; Normalize the divisor with the dividend
+
+;
+; Do count + 1 subtracts and shifts
+;
+	rpts	ar0
+	subc	ar1, ar2
+
+;
+; mask off the lower count+1 bits
+;
+	subri	31, ar0	; Shift count (32 - (ar0+1))
+	bud	ir1	; Delayed return
+	lsh3	ar0, ar2, r0
+	negi	ar0, ar0
+	lsh	ar0, r0
+	; Branch occurs here
+
+;
+; Handle a full 32-bit dividend
+;
+div_32:	tstb	ar1, ar1
+	bld	qone	; if divisor high bit is one, the result is one
+	lsh	-24, rc
+	subri	31, rc
+	lsh	rc, ar1	; Line up the divisor
+
+;
+; Now divisor and dividend are aligned. Do first SUBC by hand, save
+; of the first quotient digit. Then, shift divisor right rather
+; than shifting dividend left. This leaves a zero in the top bit of
+; the dividend
+;
+	ldi	1, ar0	; Initialize MSB of quotient
+	lsh	rc, ar0	; create a mask for MSBs
+	subi	1, ar0	; mask is (2 << count) - 1
+
+	subi3	ar1, ar2, r1
+	ldihs	r1, ar2
+	ldihs	1, r1
+	ldilo	0, r1
+	lsh	rc, r1
+
+	lsh	-1, ar1
+	subi	1, rc
+;
+; do the rest of the shifts and subtracts
+;
+	rpts	rc
+	subc	ar1, ar2
+
+	bud	ir1
+	and	ar0, ar2
+	or3	r1, ar2, r0
+	nop
+
+qone:
+	bud	ir1
+	ldi	1, r0
+	nop
+	nop
+
+qzero:
+	bud	ir1
+	ldi	0, r0
+	nop
+	nop
+#endif
+; Integer unsigned modulo: ar2 dividend, r2 divisor, r0 remainder
+#ifdef L_umodqi3
+	.text
+	.global ___umodqi3
+	.global umodqi3n
+___umodqi3:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldi	*-ar0(1), ar2
+	ldi	*-ar0(2), r2
+	.endif
+
+umodqi3n:
+	pop	ir1	; return address
+	cmpi	ar2, r2	; divisor > dividend ?
+	bhi	uzero	; if so, return dividend
+	ldi	r2, ar1	; load divisor
+;
+; If top bit of dividend is set, handle specially.
+;
+	tstb	ar2, ar2	; check top bit
+	bld	umod_32	; get divisor exponent, then jump.
+;
+; Get divisor exponent by converting to float.
+;
+	float	ar1, r1	; normalize divisor
+	pushf	r1	; push as float
+	pop	rc	; pop as int to get exponent
+	bzd	uzero	; if (float)divisor was zero, return
+;
+; 31 or less bits in dividend. Get dividend exponent.
+;
+	float	ar2, r1	; normalize dividend
+	pushf	r1	; push as float
+	pop	ar0	; pop as int to get exponent
+;
+; Use difference in exponents as shift count to line up MSBs.
+;
+	lsh	-24, rc	; divisor exponent
+	lsh	-24, ar0	; dividend exponent
+	subi	rc, ar0	; difference
+	lsh	ar0, ar1	; shift divisor up
+;
+; Do COUNT+1 subtract & shifts.
+;
+	rpts	ar0
+	subc	ar1, ar2
+;
+; Remainder is in upper 31-COUNT bits.
+;
+	bud	ir1	; delayed branch to return
+	addi	1, ar0	; shift count is COUNT+1
+	negi	ar0, ar0	; negate for right shift
+	lsh3	ar0, ar2, r0	; shift to get result
+	; Return occurs here
+
+;
+; The following code handles cases of a full 32-bit dividend. Before
+; SUBC can be used, the top bit must be cleared (otherwise SUBC can
+; possibly shift a significant 1 out the top of the dividend). This
+; is accomplished by first doing a normal subtraction, then proceeding
+; with SUBCs.
+;
+umod_32:
+;
+; If the top bit of the divisor is set too, the remainder is simply
+; the difference between the dividend and divisor. Otherwise, shift
+; the divisor up to line up the MSBs.
+;
+	tstb	ar1, ar1	; check divisor
+	bld	uone	; if negative, remainder is diff
+
+	lsh	-24, rc	; divisor exponent
+	subri	31, rc	; shift count = 31 - exp
+	negi	rc, ar0	; used later as shift count
+	lsh	rc, ar1	; shift up to line up MSBs
+;
+; Now MSBs are aligned. Do first SUBC by hand using a plain subtraction.
+; Then, shift divisor right rather than shifting dividend left. This leaves
+; a 0 in the top bit of the dividend.
+;
+	subi3	ar1, ar2, r1	; subtract
+	ldihs	r1, ar2	; if positive, replace dividend
+	subi	1, rc	; first iteration is done
+	lsh	-1, ar1	; shift divisor down
+;
+; Do EXP subtract & shifts.
+;
+	rpts	rc
+	subc	ar1, ar2
+;
+; Quotient is in EXP+1 LSBs; shift remainder (in MSBs) down.
+;
+	bud	ir1
+	lsh3	ar0, ar2, r0	; COUNT contains -(EXP+1)
+	nop
+	nop
+;
+; Return (dividend - divisor).
+;
+uone:	bud	ir1
+	subi3	r2, ar2, r0
+	nop
+	nop
+;
+; Return dividend.
+;
+uzero:	bud	ir1
+	ldi	ar2, r0	; set status from result
+	nop
+	nop
+#endif
+; Integer signed modulo: ar2 dividend, r2 divisor, r0 remainder (sign of dividend)
+#ifdef L_modqi3
+	.text
+	.global ___modqi3
+	.ref umodqi3n
+___modqi3:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldi	*-ar0(1), ar2
+	ldi	*-ar0(2), r2
+	.endif
+
+;
+; Determine sign of result. Get absolute value of operands.
+;
+	ldi	ar2, ar0	; sign of result same as dividend
+	absi	ar2, r0	; make dividend positive
+	bvd	mod_32	; if still negative, escape
+	absi	r2, r1	; make divisor positive
+	ldi	r1, ar1	; save in ar1
+	cmpi	r0, ar1	; divisor > dividend ?
+
+	pop	ir1	; return address
+	bhid	return	; if so, return dividend
+;
+; Normalize operands. Use difference in exponents as shift count
+; for divisor, and as repeat count for SUBC.
+;
+	float	r1, r1	; normalize divisor
+	pushf	r1	; push as float
+	pop	rc	; pop as int
+	bzd	return	; if (float)divisor was zero, return
+
+	float	r0, r1	; normalize dividend
+	pushf	r1	; push as float
+	pop	r1	; pop as int
+
+	lsh	-24, rc	; get divisor exponent
+	lsh	-24, r1	; get dividend exponent
+	subi	rc, r1	; get difference in exponents
+	lsh	r1, ar1	; align divisor with dividend
+;
+; Do COUNT+1 subtract & shifts.
+;
+	rpts	r1
+	subc	ar1, r0
+;
+; Remainder is in upper bits of R0
+;
+	addi	1, r1	; shift count is -(r1+1)
+	negi	r1, r1
+	lsh	r1, r0	; shift right
+;
+; Check sign and negate result if necessary.
+;
+return:
+	bud	ir1	; delayed branch to return
+	negi	r0, r1	; negate result
+	cmpi	0, ar0	; check sign
+	ldin	r1, r0	; if set, use negative result
+	; Return occurs here
+;
+; The following code handles cases of a full 32-bit dividend. This occurs
+; when R0 = abs(R0) = 080000000h. Handle this by calling the unsigned mod
+; function, then negating the result if necessary.
+;
+mod_32:
+	ldi	r1, r2	; umodqi3n reads the divisor from r2: pass |divisor|
+	push	ar0	; remember sign
+	call	umodqi3n	; do divide
+	brd	return	; return
+	pop	ar0	; restore sign
+	pop	ir1	; return address
+	nop
+#endif
+
+#ifdef L_unsfltconst
+	.section .const
+	.global ___unsfltconst
+___unsfltconst:	.float 4294967296.0
+#endif
+
+#ifdef L_unsfltcompare
+	.section .const
+	.global ___unsfltcompare
+___unsfltcompare:	.float 2147483648.0
+#endif
+
+; Integer 32-bit signed multiplication
+;
+; The TMS320C3x MPYI instruction takes two 24-bit signed integers
+; and produces a 48-bit signed result which is truncated to 32-bits.
+;
+; A 32-bit by 32-bit multiplication thus requires a number of steps.
+;
+; Consider the product of two 32-bit signed integers,
+;
+;   z = x * y
+;
+; where x = (b << 16) + a, y = (d << 16) + c
+;
+; This can be expressed as
+;
+;   z = ((b << 16) + a) * ((d << 16) + c)
+;
+;     = ((b * d) << 32) + ((b * c + a * d) << 16) + a * c
+;
+; Let z = (f << 16) + e where f < (1 << 16).
+;
+; Since we are only interested in a 32-bit result, we can ignore the
+; (b * d) << 32 term, and thus
+;
+;   f = b * c + a * d, e = a * c
+;
+; We can simplify things if we have some a priori knowledge of the
+; operands, for example, if -32768 <= y <= 32767, then y = c and d = 0 and thus
+;
+;   f = b * c, e = a * c
+;
+; ar2 multiplier, r2 multiplicand, r0 product
+; clobbers r2, r3, ar2, ir1 (and ar0 when .REGPARM == 0)
+#ifdef L_mulqi3
+	.text
+	.global ___mulqi3
+___mulqi3:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldi	*-ar0(1), ar2
+	ldi	*-ar0(2), r2
+	.endif
+
+	pop	ir1	; return address
+	ldi	ar2, r0	;
+	and	0ffffh, r0	; a
+	lsh	-16, ar2	; b
+	ldi	r2, r3	;
+	and	0ffffh, r3	; c
+	mpyi	r3, ar2	; c * b
+	lsh	-16, r2	; d
+	mpyi	r0, r2	; a * d
+	addi	ar2, r2	; c * b + a * d
+	bd	ir1	; delayed branch to return
+	lsh	16, r2	; (c * b + a * d) << 16
+	mpyi	r3, r0	; a * c
+	addi	r2, r0	; a * c + (c * b + a * d) << 16
+; branch occurs here
+
+#endif
+
+;
+; Integer 64 by 64 multiply
+; long1 and long2 on stack
+; result in r0,r1
+;
+#ifdef L_mulhi3
+	.text
+	.global ___mulhi3
+#ifdef _TMS320C4x
+___mulhi3:
+	pop	ar0
+	ldi	sp,ar2
+	ldi	*-ar2(1),r2
+	ldi	*-ar2(3),r3
+	mpyi3	r2,r3,r0
+	mpyuhi3	r2,r3,r1
+	mpyi	*-ar2(2),r2
+	bd	ar0
+	mpyi	*-ar2(0),r3
+	addi	r2,r1
+	addi	r3,r1
+#else
+___mulhi3:
+	ldi	sp,ar2
+	ldi	-16,rs
+	ldi	*-ar2(2),ar0
+	ldi	*-ar2(4),ar1
+	ldi	ar0,r2
+	and	0ffffh,r2
+	ldi	ar1,r3
+	and	0ffffh,r3
+	lsh	rs,ar0
+	lsh	rs,ar1
+
+	mpyi	r2,r3,r0
+	mpyi	ar0,ar1,r1
+	mpyi	r2,ar1,rc
+	lsh	rs,rc,re
+	addi	re,r1
+	lsh	16,rc
+	addi	rc,r0
+	addc	0,r1
+	mpyi	r3,ar0,rc
+	lsh	rs,rc,re
+	addi	re,r1
+	lsh	16,rc
+	addi	rc,r0
+	addc	0,r1
+
+	ldi	*-ar2(1),ar0
+	ldi	ar0,r2
+	and	0ffffh,r2
+	lsh	rs,ar0
+	mpyi	r2,r3,rc
+	addi	rc,r1
+	mpyi	r2,ar1,rc
+	mpyi	r3,ar0,re
+	addi	re,rc
+	lsh	16,rc
+	addi	rc,r1
+
+	ldi	*-ar2(2),ar0
+	ldi	*-ar2(3),ar1
+	ldi	ar0,r2
+	and	0ffffh,r2
+	ldi	ar1,r3
+	and	0ffffh,r3
+	lsh	rs,ar0
+	lsh	rs,ar1
+	mpyi	r2,r3,rc
+	addi	rc,r1
+	mpyi	r2,ar1,rc
+	mpyi	r3,ar0,re
+	pop	ar0
+	bd	ar0
+	addi	re,rc
+	lsh	16,rc
+	addi	rc,r1
+#endif
+#endif
+
+;
+; Integer 32 by 32 multiply highpart unsigned
+; src1 in ar2
+; src2 in r2
+; result in r0
+;
+#ifdef L_umulhi3_high
+	.text
+	.global ___umulhi3_high
+___umulhi3_high:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldi	*-ar0(1), ar2
+	ldi	*-ar0(2), r2
+	.endif
+
+	ldi	-16,rs
+	ldi	r2,r3
+	and	0ffffh,r2
+	ldi	ar2,ar1
+	and	0ffffh,ar2
+	lsh	rs,r3
+	lsh	rs,ar1
+
+	mpyi	ar2,r2,r1
+	mpyi	ar1,r3,r0
+	mpyi	ar2,r3,rc
+	lsh	rs,rc,re
+	addi	re,r0
+	lsh	16,rc
+	addi	rc,r1
+	addc	0,r0
+	mpyi	r2,ar1,rc
+	lsh	rs,rc,re
+	addi	re,r0
+	pop	ar0
+	bd	ar0
+	lsh	16,rc
+	addi	rc,r1
+	addc	0,r0
+#endif
+
+;
+; Integer 32 by 32 multiply highpart signed
+; src1 in ar2
+; src2 in r2
+; result in r0
+;
+#ifdef L_smulhi3_high
+	.text
+	.global ___smulhi3_high
+___smulhi3_high:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldi	*-ar0(1), ar2
+	ldi	*-ar0(2), r2
+	.endif
+
+	ldi	-16,rs
+	ldi	0,rc
+	subi3	ar2,rc,r0
+	ldi	r2,r3
+	ldilt	r0,rc
+	subi3	r2,rc,r0
+	ldi	ar2,ar1
+	tstb	ar1,ar1
+	ldilt	r0,rc
+	and	0ffffh,r2
+	and	0ffffh,ar2
+	lsh	rs,r3
+	lsh	rs,ar1
+
+	mpyi	ar2,r2,r1
+	mpyi	ar1,r3,r0
+	addi	rc,r0
+	mpyi	ar2,r3,rc
+	lsh	rs,rc,re
+	addi	re,r0
+	lsh	16,rc
+	addi	rc,r1
+	addc	0,r0
+	mpyi	r2,ar1,rc
+	lsh	rs,rc,re
+	addi	re,r0
+	pop	ar0
+	bd	ar0
+	lsh	16,rc
+	addi	rc,r1
+	addc	0,r0
+#endif
+
+;
+; Integer 64 by 64 unsigned divide
+; long1 and long2 on stack
+; divide in r0,r1
+; modulo in r2,r3
+; routine takes a maximum of 64*9+21=597 cycles = 24 us @ 50Mhz
+;
+#ifdef L_udivhi3
+	.text
+	.global ___udivhi3
+	.global ___udivide
+	.global ___umodulo
+	.ref udivqi3n
+	.ref umodqi3n
+___udivhi3:
+	ldi	sp,ar2
+	ldi	*-ar2(4),ar0
+	ldi	*-ar2(3),ar1
+	ldi	*-ar2(2),r0
+	ldi	*-ar2(1),r1
+
+___udivide:
+	or	r1,ar1,r2
+	bne	udiv0
+	ldi	ar0,r2
+	ldi	r0,ar2
+	call	udivqi3n
+	ldiu	0,r1
+	rets
+
+___umodulo:
+	or	r1,ar1,r2
+	bne	udiv0
+	ldi	ar0,r2
+	ldi	r0,ar2
+	call	umodqi3n
+	ldi	r0,r2
+	ldiu	0,r3
+	rets
+
+udiv0:
+	tstb	ar1,ar1
+	bne	udiv1
+	tstb	ar0,ar0
+	bn	udiv1
+
+	ldiu	63,rc
+#ifdef _TMS320C4x
+	rptbd	udivend0
+	ldiu	0,r2
+	addi	r0,r0
+	rolc	r1
+#else
+	ldiu	0,r2
+	addi	r0,r0
+	rolc	r1
+	rptb	udivend0
+#endif
+
+	rolc	r2
+	subi3	ar0,r2,r3
+	xor	1,st
+	ldic	r3,r2
+	rolc	r0
+udivend0:
+	rolc	r1
+
+	ldiu	0,r3
+	rets
+udiv1:
+	push	r4
+	push	r5
+	ldiu	63,rc
+	ldiu	0,r2
+#ifdef _TMS320C4x
+	rptbd	udivend1
+	ldiu	0,r3
+	addi	r0,r0
+	rolc	r1
+#else
+	ldiu	0,r3
+	addi	r0,r0
+	rolc	r1
+	rptb	udivend1
+#endif
+
+	rolc	r2
+	rolc	r3
+	subi3	ar0,r2,r4
+	subb3	ar1,r3,r5
+	xor	1,st
+	ldic	r4,r2
+	ldic	r5,r3
+	rolc	r0
+udivend1:
+	rolc	r1
+
+	pop	r5
+	pop	r4
+	rets
+#endif
+
+;
+; Integer 64 by 64 unsigned modulo
+; long1 and long2 on stack
+; result in r0,r1
+;
+#ifdef L_umodhi3
+	.text
+	.global ___umodhi3
+	.ref ___umodulo
+___umodhi3:
+	ldi	sp,ar2
+	ldi	*-ar2(4),ar0
+	ldi	*-ar2(3),ar1
+	ldi	*-ar2(2),r0
+	ldi	*-ar2(1),r1
+	call	___umodulo
+	pop	ar0
+	bd	ar0
+	ldi	r2,r0
+	ldi	r3,r1
+	nop
+#endif
+
+;
+; Integer 64 by 64 signed divide
+; long1 and long2 on stack
+; result in r0,r1
+;
+#ifdef L_divhi3
+	.text
+	.global ___divhi3
+	.ref ___udivide
+___divhi3:
+	ldi	0,ir0	; ir0 = sign flag: 0 positive, -1 negative
+	ldi	sp,ar2
+	ldi	*-ar2(4),r0
+	ldi	*-ar2(3),r1
+	bge	div1
+	not	ir0	; toggle sign flag (negating 0 would leave it 0)
+	negi	r0
+	negb	r1
+div1:
+	ldi	r0,ar0
+	ldi	r1,ar1
+	ldi	*-ar2(2),r0
+	ldi	*-ar2(1),r1
+	bge	div2
+	not	ir0	; toggle sign flag
+	negi	r0
+	negb	r1
+div2:
+	call	___udivide
+	tstb	ir0,ir0
+	bge	div3
+	negi	r0
+	negb	r1
+div3:
+	rets
+#endif
+
+;
+; Integer 64 by 64 signed modulo
+; long1 and long2 on stack
+; result in r0,r1
+;
+#ifdef L_modhi3
+	.text
+	.global ___modhi3
+	.ref ___umodulo
+___modhi3:
+	ldi	0,ir0	; ir0 = sign flag: 0 positive, -1 negative
+	ldi	sp,ar2
+	ldi	*-ar2(4),r0
+	ldi	*-ar2(3),r1
+	bge	mod1
+; divisor is negative: use its magnitude; it does not affect the result sign
+	negi	r0
+	negb	r1
+mod1:
+	ldi	r0,ar0
+	ldi	r1,ar1
+	ldi	*-ar2(2),r0
+	ldi	*-ar2(1),r1
+	bge	mod2
+	not	ir0	; negative dividend => negative remainder
+	negi	r0
+	negb	r1
+mod2:
+	call	___umodulo
+	ldi	r2,r0
+	ldi	r3,r1
+	tstb	ir0,ir0
+	bge	mod3
+	negi	r0
+	negb	r1
+mod3:
+	rets
+#endif
+
+;
+; double to signed long long conversion
+; input in r2
+; result in r0,r1
+;
+#ifdef L_fix_truncqfhi2
+	.text
+	.global ___fix_truncqfhi2
+	.ref ufix_truncqfhi2n
+___fix_truncqfhi2:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldf	*-ar0(1), r2
+	.endif
+
+	cmpf	0.0,r2
+	bge	ufix_truncqfhi2n
+	negf	r2
+	call	ufix_truncqfhi2n
+	negi	r0
+	negb	r1
+	rets
+#endif
+
+;
+; double to unsigned long long conversion
+; input in r2
+; result in r0,r1
+;
+#ifdef L_ufix_truncqfhi2
+	.text
+	.global ___ufix_truncqfhi2
+	.global ufix_truncqfhi2n
+___ufix_truncqfhi2:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldf	*-ar0(1), r2
+	.endif
+
+ufix_truncqfhi2n:
+	cmpf	0.0,r2
+	ble	ufix1
+	pushf	r2
+	pop	r3
+	ash	-24,r3
+	subi	31,r3
+	cmpi	32,r3
+	bge	ufix1
+	cmpi	-32,r3
+	ble	ufix1
+	ldi	1,r0
+	ash	31,r0
+	or3	r0,r2,r0
+	ldi	r0,r1
+	lsh3	r3,r0,r0
+	subi	32,r3
+	cmpi	-32,r3
+	ldile	0,r1
+	lsh3	r3,r1,r1
+	rets
+ufix1:
+	ldi	0,r0
+	ldi	0,r1
+	rets
+#endif
+
+;
+; signed long long to double conversion
+; input on stack
+; result in r0
+;
+#ifdef L_floathiqf2
+	.text
+	.global ___floathiqf2
+	.ref ufloathiqf2n
+___floathiqf2:
+	ldi	sp,ar2
+	ldi	*-ar2(2),r0
+	ldi	*-ar2(1),r1
+	bge	ufloathiqf2n
+	negi	r0
+	negb	r1
+	call	ufloathiqf2n
+	negf	r0
+	rets
+#endif
+
+;
+; unsigned long long to double conversion
+; input on stack
+; result in r0
+;
+#ifdef L_ufloathiqf2
+	.text
+	.global ___ufloathiqf2
+	.global ufloathiqf2n
+	.ref ___unsfltconst
+___ufloathiqf2:
+	ldi	sp,ar2
+	ldi	*-ar2(2),r0
+	ldi	*-ar2(1),r1
+ufloathiqf2n:
+	.if .BIGMODEL
+#ifdef _TMS320C4x
+	ldpk	@___unsfltconst
+#else
+	ldp	@___unsfltconst
+#endif
+	.endif
+	ldf	@___unsfltconst,r2
+	float	r0
+	bge	uflt1
+	addf	r2,r0
+uflt1:
+	float	r1
+	bge	uflt2
+	addf	r2,r1
+uflt2:
+#ifdef _TMS320C4x
+	pop	r3
+	bd	r3
+	mpyf	r2,r1
+	addf	r1,r0
+	nop
+#else
+	ldf	r1,r3
+	and	0ffh,r3
+	norm	r3,r3
+	mpyf	r2,r3
+	pop	ar2
+	bd	ar2
+	addf	r3,r0
+	mpyf	r2,r1
+	addf	r1,r0
+#endif
+#endif
+
+;
+; long double to signed long long conversion
+; input in r2
+; result in r0,r1
+;
+#ifdef L_fix_trunchfhi2
+	.text
+	.global ___fix_trunchfhi2
+	.ref ufix_trunchfhi2n
+___fix_trunchfhi2:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldf	*-ar0(2), r2
+	ldi	*-ar0(1), r2
+	.endif
+
+	cmpf	0.0,r2
+	bge	ufix_trunchfhi2n
+	negf	r2
+	call	ufix_trunchfhi2n
+	negi	r0
+	negb	r1
+	rets
+#endif
+
+;
+; long double to unsigned long long conversion
+; input in r2
+; result in r0,r1
+;
+#ifdef L_ufix_trunchfhi2
+	.text
+	.global ___ufix_trunchfhi2
+	.global ufix_trunchfhi2n
+___ufix_trunchfhi2:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldf	*-ar0(2), r2
+	ldi	*-ar0(1), r2
+	.endif
+
+ufix_trunchfhi2n:
+	cmpf	0.0,r2
+	ble	ufixh1
+	pushf	r2
+	pop	r3
+	ash	-24,r3
+	subi	31,r3
+	cmpi	32,r3
+	bge	ufixh1
+	cmpi	-32,r3
+	ble	ufixh1
+	ldi	1,r0
+	ash	31,r0
+	or3	r0,r2,r0
+	ldi	r0,r1
+	lsh3	r3,r0,r0
+	subi	32,r3
+	cmpi	-32,r3
+	ldile	0,r1
+	lsh3	r3,r1,r1
+	rets
+ufixh1:
+	ldi	0,r0
+	ldi	0,r1
+	rets
+#endif
+
+;
+; signed long long to long double conversion
+; input on stack
+; result in r0
+;
+#ifdef L_floathihf2
+	.text
+	.global ___floathihf2
+	.ref ufloathihf2n
+___floathihf2:
+	ldi	sp,ar2
+	ldi	*-ar2(2),r0
+	ldi	*-ar2(1),r1
+	bge	ufloathihf2n
+	negi	r0
+	negb	r1
+	call	ufloathihf2n
+	negf	r0
+	rets
+#endif
+
+;
+; unsigned long long to long double conversion
+; input on stack
+; result in r0
+;
+#ifdef L_ufloathihf2
+	.text
+	.global ___ufloathihf2
+	.global ufloathihf2n
+	.ref ___unsfltconst
+___ufloathihf2:
+	ldi	sp,ar2
+	ldi	*-ar2(2),r0
+	ldi	*-ar2(1),r1
+ufloathihf2n:
+	.if .BIGMODEL
+#ifdef _TMS320C4x
+	ldpk	@___unsfltconst
+#else
+	ldp	@___unsfltconst
+#endif
+	.endif
+	ldf	@___unsfltconst,r2
+	float	r0
+	bge	uflth1
+	addf	r2,r0
+uflth1:
+	float	r1
+	bge	uflth2
+	addf	r2,r1
+uflth2:
+#ifdef _TMS320C4x
+	pop	r3
+	bd	r3
+	mpyf	r2,r1
+	addf	r1,r0
+	nop
+#else
+	ldf	r1,r3
+	and	0ffh,r3
+	norm	r3,r3
+	mpyf	r2,r3
+	pop	ar2
+	bd	ar2
+	addf	r3,r0
+	mpyf	r2,r1
+	addf	r1,r0
+#endif
+#endif
+
+;
+; calculate ffs
+; input in ar2
+; result in r0
+;
+#ifdef L_ffs
+	.global ___ffs
+	.ref ___unsfltconst
+	.text
+___ffs:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldi	*-ar0(1), ar2
+	.endif
+
+	negi	ar2,r0
+	and	ar2,r0
+	float	r0,r0
+	ldfu	0.0,r1
+	.if .BIGMODEL
+#ifdef _TMS320C4x
+	ldpk	@___unsfltconst
+#else
+	ldp	@___unsfltconst
+#endif
+	.endif
+	ldflt	@___unsfltconst,r1
+	addf	r1,r0
+	pushf	r0
+	pop	r0
+	pop	ar0
+	bd	ar0
+	ash	-24,r0
+	ldilt	-1,r0
+	addi	1,r0
+#endif
+
+;
+; calculate long double * long double
+; input in r2, r3
+; output in r0
+;
+#ifdef L_mulhf3
+	.global ___mulhf3
+	.text
+___mulhf3:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldf	*-ar0(2), r2
+	ldi	*-ar0(1), r2
+	ldf	*-ar0(4), r3
+	ldi	*-ar0(3), r3
+	.endif
+
+	pop	ar2	; return ad
+	ldf	r2,r0	; copy lsb0
+	ldf	r3,r1	; copy lsb1
+	and	0ffh,r0	; mask lsb0
+	and	0ffh,r1	; mask lsb1
+	norm	r0,r0	; correct lsb0
+	norm	r1,r1	; correct lsb1
+	mpyf	r2,r1	; arg0*lsb1
+	mpyf	r3,r0	; arg1*lsb0
+	bd	ar2	; return (delayed)
+	addf	r0,r1	; arg0*lsb1 + arg1*lsb0
+	mpyf	r2,r3,r0	; msb0*msb1
+	addf	r1,r0	; msb0*msb1 + arg0*lsb1 + arg1*lsb0
+#endif
+
+;
+; calculate long double / long double
+; r2 dividend, r3 divisor, r0 quotient
+;
+#ifdef L_divhf3
+	.global ___divhf3
+	.text
+___divhf3:
+	.if .REGPARM == 0
+#ifdef _TMS320C4x
+	lda	sp,ar0
+#else
+	ldiu	sp,ar0
+#endif
+	ldf	*-ar0(2), r2
+	ldi	*-ar0(1), r2
+	ldf	*-ar0(4), r3
+	ldi	*-ar0(3), r3
+	.endif
+
+#ifdef _TMS320C4x
+	pop	ar1
+	rcpf	r3, r0
+	mpyf3	r0, r3, r1
+	subrf	2.0, r1
+	mpyf	r1, r0
+	mpyf3	r0, r3, r1
+	bud	ar1
+	subrf	2.0, r1
+	mpyf	r1, r0
+	mpyf	r2, r0
+#else
+	pop	ar1
+	pushf	r3
+	pop	r0
+	not	r0
+	push	r0
+	popf	r0
+	ldf	-1.0, r1
+	xor	r1, r0
+
+	mpyf3	r0, r3, r1	; r1 = r[0] * v
+	subrf	2.0, r1	; r1 = 2.0 - r[0] * v
+	mpyf	r1, r0	; r0 = r[0] * (2.0 - r[0] * v) = r[1]
+; End of 1st iteration
+
+	mpyf3	r0, r3, r1	; r1 = r[1] * v
+	subrf	2.0, r1	; r1 = 2.0 - r[1] * v
+	mpyf	r1, r0	; r0 = r[1] * (2.0 - r[1] * v) = r[2]
+; End of 2nd iteration
+
+	mpyf3	r0, r3, r1	; r1 = r[2] * v
+	subrf	2.0, r1	; r1 = 2.0 - r[2] * v
+	mpyf	r1, r0	; r0 = r[2] * (2.0 - r[2] * v) = r[3]
+; End of 3rd iteration
+
+	or	080h, r0
+	rnd	r0
+
+; mpyf3 r0, r3, r1 ; r1 = r[3] * v
+	push	r4
+	pushf	r4
+	mpyf	r0, r3, r1
+
+	ldf	r0, r4
+	and	0ffh, r4
+	norm	r4, r4
+	mpyf	r3, r4
+	addf	r4, r1
+
+	ldf	r3, r4
+	and	0ffh, r4
+	norm	r4, r4
+	mpyf	r0, r4
+	addf	r4, r1
+
+	subrf	2.0, r1	; r1 = 2.0 - r[3] * v
+
+	mpyf	r1, r0, r3	; r3 = r[3] * (2.0 - r[3] * v) = r[4]
+
+	ldf	r1, r4
+	and	0ffh, r4
+	norm	r4, r4
+	mpyf	r0, r4
+	addf	r4, r3
+
+	ldf	r0, r4
+	and	0ffh, r4
+	norm	r4, r4
+	mpyf	r1, r4
+	addf	r4, r3
+
+	mpyf	r2, r3, r0	; Multiply by the dividend
+
+	ldf	r2, r4
+	and	0ffh, r4
+	norm	r4, r4
+	mpyf	r3, r4
+	addf	r4, r0
+
+	ldf	r3, r4
+	and	0ffh, r4
+	norm	r4, r4
+	mpyf	r2, r4
+	bd	ar1
+	addf	r4, r0
+
+	popf	r4
+	pop	r4
+#endif
+#endif