On 8/12/2024 5:35 PM, MitchAlsup1 wrote:
On Mon, 12 Aug 2024 20:58:45 +0000, BGB wrote:
On 8/12/2024 3:12 PM, MitchAlsup1 wrote:
See polpak:: r8_erf()
>
>
;-----------------------------------------------------------------------
; r8_erf -- compiler-generated My 66000 code for polpak's r8_erf().
; In:   r1 = x
; Out:  r1 = result (every path ends with "mov r1,r2 / ret")
; Uses: r2..r5 as scratch; r2 = fabs(x) for the range tests.
;
; Range dispatch on r2 = fabs(x):
;   .LBB141_5   taken when r2 is not greater than 0x3EF00000
;               (the 32-bit float pattern for 0.46875)
;   .LBB141_6   taken when r2 is not greater than 4
;   %bb.3       reached when r2 is less than 0x403A8B020C49BA5E
;   .LBB141_7   taken otherwise
; .LBB141_4 is the tail shared by %bb.3 and .LBB141_6.
;
; Every polynomial coefficient is an instruction immediate; nothing is
; loaded from memory.
;
; NOTE(review): the comments below read "fmac rd,ra,rb,#c" as
; rd = ra*rb + c and "fadd rd,ra,-rb" as rd = ra - rb -- confirm against
; the My 66000 ISA manual.
;
; The block labels are written with the leading '.' that the branch
; operands use, so that every branch target resolves to a defined label.
;-----------------------------------------------------------------------
r8_erf: ; @r8_erf
; %bb.0:
fabs r2,r1
fcmp r3,r2,#0x3EF00000
bngt r3,.LBB141_5
; %bb.1:
fcmp r3,r2,#4
bngt r3,.LBB141_6
; %bb.2:
fcmp r3,r2,#0x403A8B020C49BA5E
bnlt r3,.LBB141_7
; %bb.3:
; r3 = 1 / (x*x); r4 and r5 are two interleaved multiply-add chains in r3
; (numerator in r4, denominator in r5).
fmul r3,r1,r1
fdiv r3,#1,r3
mov r4,#0x3F90B4FB18B485C7
fmac r4,r3,r4,#0x3FD38A78B9F065F6
fadd r5,r3,#0x40048C54508800DB
fmac r4,r3,r4,#0x3FD70FE40E2425B8
fmac r5,r3,r5,#0x3FFDF79D6855F0AD
fmac r4,r3,r4,#0x3FC0199D980A842F
fmac r5,r3,r5,#0x3FE0E4993E122C39
fmac r4,r3,r4,#0x3F9078448CD6C5B5
fmac r5,r3,r5,#0x3FAEFC42917D7DE7
fmac r4,r3,r4,#0x3F4595FD0D71E33C
fmul r4,r3,r4
fmac r3,r3,r5,#0x3F632147A014BAD1
; r3 = (0x3FE20DD750429B6D - numerator/denominator) / fabs(x)
fdiv r3,r4,r3
fadd r3,#0x3FE20DD750429B6D,-r3
fdiv r3,r3,r2
br .LBB141_4
.LBB141_5:
; Smallest range. r3 = x*x; the fcmp/sra/mux sequence then picks either
; r3 or the 0 produced by cvtsd as the polynomial argument r2.
; NOTE(review): the sra presumably extracts one condition bit from the
; fcmp result for use as the mux selector -- confirm bit meaning and mux
; polarity.
fmul r3,r1,r1
fcmp r2,r2,#0x3C9FFE5AB7E8AD5E
sra r2,r2,#8,#1
cvtsd r4,#0
mux r2,r2,r3,r4
; Numerator chain in r3, denominator chain in r4.
mov r3,#0x3FC7C7905A31C322
fmac r3,r2,r3,#0x400949FB3ED443E9
fadd r4,r2,#0x403799EE342FB2DE
fmac r3,r2,r3,#0x405C774E4D365DA3
fmac r4,r2,r4,#0x406E80C9D57E55B8
fmac r3,r2,r3,#0x407797C38897528B
fmac r4,r2,r4,#0x40940A77529CADC8
fmac r3,r2,r3,#0x40A912C1535D121A
; result = (x * numerator) / denominator; this path returns directly.
fmul r1,r3,r1
fmac r2,r2,r4,#0x40A63879423B87AD
fdiv r2,r1,r2
mov r1,r2
ret
.LBB141_6:
; Middle range. Both chains are polynomials in r2 = fabs(x); the quotient
; is left in r3 and control falls through into .LBB141_4.
mov r3,#0x3E571E703C5F5815
fmac r3,r2,r3,#0x3FE20DD508EB103E
fadd r4,r2,#0x402F7D66F486DED5
fmac r3,r2,r3,#0x4021C42C35B8BC02
fmac r4,r2,r4,#0x405D6C69B0FFCDE7
fmac r3,r2,r3,#0x405087A0D1C420D0
fmac r4,r2,r4,#0x4080C972E588749E
fmac r3,r2,r3,#0x4072AA2986ABA462
fmac r4,r2,r4,#0x4099558EECA29D27
fmac r3,r2,r3,#0x408B8F9E262B9FA3
fmac r4,r2,r4,#0x40A9B599356D1202
fmac r3,r2,r3,#0x409AC030C15DC8D7
fmac r4,r2,r4,#0x40B10A9E7CB10E86
fmac r3,r2,r3,#0x40A0062821236F6B
fmac r4,r2,r4,#0x40AADEBC3FC90DBD
fmac r3,r2,r3,#0x4093395B7FD2FC8E
fmac r4,r2,r4,#0x4093395B7FD35F61
fdiv r3,r3,r4
.LBB141_4:
; Shared tail. In: r2 = fabs(x), r3 = rational approximation, r1 = x.
; r4 = rnd(fabs(x) * 16 * 0x3D800000); 0x3D800000 is the 32-bit float
; pattern for 0.0625.
; NOTE(review): rnd mode #5 is presumably truncation -- confirm.
fmul r4,r2,#16
fmul r4,r4,#0x3D800000
rnd r4,r4,#5
; r5 = fabs(x) - r4, r2 = fabs(x) + r4
fadd r5,r2,-r4
fadd r2,r2,r4
; r4 = exp(-(r4*r4)), r2 = exp(-(r2*r5))
fmul r4,r4,-r4
fexp r4,r4
fmul r2,r2,-r5
fexp r2,r2
; r2 = (0.5 - r4*r2*r3) + 0.5; 0x3F000000 is the float pattern for 0.5.
fmul r2,r4,r2
fadd r2,#0,-r2
fmac r2,r2,r3,#0x3F000000
fadd r2,r2,#0x3F000000
; NOTE(review): pdlt r1,T presumably predicates the following negation on
; r1 (= x) being less than zero -- confirm.
pdlt r1,T
fadd r2,#0,-r2
mov r1,r2
ret
.LBB141_7:
; Largest range: select between the constants produced by cvtsd #1 and
; cvtsd #-1 based on a comparison of x with 0.
fcmp r1,r1,#0
sra r1,r1,#8,#1
cvtsd r2,#-1
cvtsd r3,#1
mux r2,r1,r3,r2
mov r1,r2
ret
>
All of the constants are used once!
>
RISC-V takes 240 instructions and uses 342 words of
memory {.text, .data, .rodata}
>
My 66000 takes 85 instructions and uses 169 words of
memory {.text, .data, .rodata}
>
>
FWIW:
FADD Rm, Imm64f, Rn //XG2 Only
FADD Rm, Imm56f, Rn //
>
And:
FMUL Rm, Imm64f, Rn //XG2 Only
FMUL Rm, Imm56f, Rn //
>
Why don't you download polpak, compile it, and state how many
instructions it takes and how many words of storage it takes?
Found what I assume you are talking about.
Needed to add "polpak_test.c" as otherwise BGBCC lacks a main and prunes everything;
Also needed to hack over some compiler holes related to "complex _Double" to get it to build;
Also needed to stub over some library functions that were added in C99 but missing in my C library.
So, initial result (BJX2 XG2):
128K ".text";
21K ".strtab" (strings)
4K ".data"
Instruction Count:
~ 28K
This is with a static linked C library, but used one of the lighter-weight ones (that does not include the whole OS kernel linked in).
Seems to result in around 41K lines of ASM output, but this includes blank lines, comments, and labels.
This seems to be 117K with an x86-64 build (dynamically linked C library).
RV64G (riscv64-unknown-elf-gcc -O3):
231K of ".text"
Did try "riscv64-unknown-linux-gnu-gcc", but it broke due to apparently lacking "complex.h" and similar.
As for "r8_erf()":
<===
// r8_erf -- BGBCC output for polpak's r8_erf(), BJX2 (XG2).
//
// The argument x arrives in RQ4 and is spilled to (SP, 928); the result
// (erfx, kept in RQ30) is moved to RQ2 before the epilog.
// The coefficient arrays a, b, c, d, p and q are rebuilt on the stack on
// every call from 64-bit immediates, then read back by the loops below.
// Division is done by calling __fpu_fdiv (dividend in RQ4, divisor in RQ5,
// quotient in RQ2); exp() takes its argument in RQ4 and returns in RQ2.
// Register roles seen in the body: RQ31 = xabs, RQ10 = xsq, RQ9 = xnum,
// RQ8 = xden, RD12 = loop index i, RQ30 = erfx.
//
// NOTE(review): the 64-bit immediates here differ in their low-order bits
// from the constants in the My 66000 listing earlier in this post (e.g.
// 0x400949FB3ED443E8 here vs 0x400949FB3ED443E9 there) -- presumably an
// effect of the immediate encoding; confirm whether that is intended.
r8_erf:
// Prolog: LR is copied to R1 before calling the prolog helper, then an
// 800-byte frame is reserved.
MOV LR, R1
BSR __prolog_0005_00000200FFFF
ADD -800, SP
MOV.Q RQ4, (SP, 928)
// Store the address of each local array into a stack slot. None of these
// slots (SP+184, 176, 168, 160, 144, 136) is read again in this listing.
LEA.Q (SP, 192), RQ3
MOV.Q RQ3, (SP, 184)
LEA.Q (SP, 240), RQ3
MOV.Q RQ3, (SP, 176)
LEA.Q (SP, 288), RQ3
MOV.Q RQ3, (SP, 168)
LEA.Q (SP, 368), RQ3
MOV.Q RQ3, (SP, 160)
LEA.Q (SP, 448), RQ3
MOV.Q RQ3, (SP, 144)
LEA.Q (SP, 512), RQ3
MOV.Q RQ3, (SP, 136)
// polpak.c:15867 {
// Array at SP+192 (indexed as a[] further down): five elements.
LEA.Q (SP, 192), RQ7
MOV 0x400949FB3ED443E8, RQ6
MOV.Q RQ6, (RQ7)
MOV 0x405C774E4D365DA0, RQ5
MOV.Q RQ5, (RQ7, 8)
MOV 0x407797C388975288, RQ4
MOV.Q RQ4, (RQ7, 16)
MOV 0x40A912C1535D1218, RQ3
MOV.Q RQ3, (RQ7, 24)
MOV 0x3FC7C7905A31C320, RQ2
MOV.Q RQ2, (RQ7, 32)
// polpak.c:15874 double b[4] = {
LEA.Q (SP, 240), RQ23
MOV 0x403799EE342FB2E0, RQ22
MOV.Q RQ22, (RQ23)
MOV 0x406E80C9D57E55B8, RQ21
MOV.Q RQ21, (RQ23, 8)
MOV 0x40940A77529CADC8, RQ20
MOV.Q RQ20, (RQ23, 16)
MOV 0x40A63879423B87B0, RQ19
MOV.Q RQ19, (RQ23, 24)
// polpak.c:15879 double c[9] = {
LEA.Q (SP, 288), RQ18
MOV 0x3FE20DD508EB1040, RQ39
MOV.Q RQ39, (RQ18)
MOV 0x4021C42C35B8BC00, RQ38
MOV.Q RQ38, (RQ18, 8)
MOV 0x405087A0D1C420D0, RQ37
MOV.Q RQ37, (RQ18, 16)
MOV 0x4072AA2986ABA460, RQ36
MOV.Q RQ36, (RQ18, 24)
MOV 0x408B8F9E262B9FA0, RQ35
MOV.Q RQ35, (RQ18, 32)
MOV 0x409AC030C15DC8D8, RQ34
MOV.Q RQ34, (RQ18, 40)
MOV 0x40A0062821236F68, RQ33
MOV.Q RQ33, (RQ18, 48)
MOV 0x4093395B7FD2FC90, RQ32
MOV.Q RQ32, (RQ18, 56)
MOV 0x3E571E703C5F5818, RQ55
MOV.Q RQ55, (RQ18, 64)
// polpak.c:15889 double d[8] = {
LEA.Q (SP, 368), RQ54
MOV 0x402F7D66F486DED8, RQ53
MOV.Q RQ53, (RQ54)
MOV 0x405D6C69B0FFCDE8, RQ52
MOV.Q RQ52, (RQ54, 8)
MOV 0x4080C972E58874A0, RQ51
MOV.Q RQ51, (RQ54, 16)
MOV 0x4099558EECA29D28, RQ50
MOV.Q RQ50, (RQ54, 24)
MOV 0x40A9B599356D1200, RQ49
MOV.Q RQ49, (RQ54, 32)
MOV 0x40B10A9E7CB10E88, RQ48
MOV.Q RQ48, (RQ54, 40)
MOV 0x40AADEBC3FC90DC0, RQ29
MOV.Q RQ29, (RQ54, 48)
MOV 0x4093395B7FD35F60, RQ28
MOV.Q RQ28, (RQ54, 56)
// polpak.c:15901 double p[6] = {
LEA.Q (SP, 448), RQ27
MOV 0x3FD38A78B9F065F8, RQ26
MOV.Q RQ26, (RQ27)
MOV 0x3FD70FE40E2425B8, RQ25
MOV.Q RQ25, (RQ27, 8)
MOV 0x3FC0199D980A8430, RQ24
MOV.Q RQ24, (RQ27, 16)
MOV 0x3F9078448CD6C5B8, RQ7
MOV.Q RQ7, (RQ27, 24)
MOV 0x3F4595FD0D71E338, RQ7
MOV.Q RQ7, (RQ27, 32)
MOV 0x3F90B4FB18B485C8, RQ7
MOV.Q RQ7, (RQ27, 40)
// polpak.c:15908 double q[5] = {
LEA.Q (SP, 512), RQ6
MOV 0x40048C54508800D8, RQ7
MOV.Q RQ7, (RQ6)
MOV 0x3FFDF79D6855F0B0, RQ7
MOV.Q RQ7, (RQ6, 8)
MOV 0x3FE0E4993E122C38, RQ7
MOV.Q RQ7, (RQ6, 16)
MOV 0x3FAEFC42917D7DE8, RQ7
MOV.Q RQ7, (RQ6, 24)
MOV 0x3F632147A014BAD0, RQ7
MOV.Q RQ7, (RQ6, 32)
// polpak.c:15914 double sqrpi = 0.56418958354775628695;
MOV 0x3FE20DD750429B70, RQ5
MOV.Q RQ5, (SP, 128)
// polpak.c:15915 double thresh = 0.46875;
MOV 0x3FDE000000000000, RQ4
MOV.Q RQ4, (SP, 120)
// polpak.c:15916 double xbig = 26.543;
MOV 0x403A8B020C49BA60, RQ3
MOV.Q RQ3, (SP, 112)
// polpak.c:15920 double xsmall = 1.11E-16;
MOV 0x3C9FFE5AB7E8AD60, RQ2
MOV.Q RQ2, (SP, 104)
// polpak.c:15923 xabs = fabs ( x );
MOV.Q (SP, 928), RQ22
FABS RQ22, RQ31
// polpak.c:15927 if ( xabs <= thresh )
// RQ4 still holds thresh from the store above. Given the source condition,
// T must be set here when xabs (second operand) is greater than thresh
// (first operand); the taken branch skips the "xabs <= thresh" body.
FCMP/GT RQ4, RQ31
BT .L0080087C
// (SP, 104) holds xsmall.
// NOTE(review): by the same reading as the thresh test above (T when the
// second operand is greater than the first), this BT jumps to the
// "MOV 0, RQ10" block when xabs > xsmall and falls into the FMUL
// otherwise. Confirm against polpak.c that this polarity is intended.
MOV.Q (SP, 104), RQ24
FCMP/GT RQ24, RQ31
BT .L0080087D
// polpak.c:15930 {
FMUL RQ31, RQ31, RQ10
BRA .L0080087E
.L0080087D:
// polpak.c:15934 {
MOV 0, RQ10
.L0080087E:
// polpak.c:15938 xnum = a[4] * xsq;
LEA.Q (SP, 192), RQ24
MOV.Q (RQ24, 32), RQ13
FMUL RQ13, RQ10, RQ9
// polpak.c:15939 xden = xsq;
MOV RQ10, RQ8
// polpak.c:15940 for ( i = 0; i < 3; i++ )
MOV 0, RD12
.L0080087F:
CMPGE.L 3, RD12
BT .L00800881
// polpak.c:15941 {
// Loop body: xnum = (xnum + a[i]) * xsq. The array base is recomputed
// with LEA.Q on every iteration.
// NOTE(review): (RQ24, RD12) is presumably a scaled-index load -- confirm.
LEA.Q (SP, 192), RQ24
MOV.Q (RQ24, RD12), RQ13
FADD RQ9, RQ13, RQ14
FMUL RQ14, RQ10, RQ9
// polpak.c:15943 xden = ( xden + b[i] ) * xsq;
LEA.Q (SP, 240), RQ25
MOV.Q (RQ25, RD12), RQ13
FADD RQ8, RQ13, RQ14
FMUL RQ14, RQ10, RQ8
ADDS.L RD12, 1, RD12
BRA .L0080087F
.L00800881:
// polpak.c:15946 erfx = x * ( xnum + a[3] ) / ( xden + b[3] );
LEA.Q (SP, 192), RQ24
MOV.Q (RQ24, 24), RQ13
FADD RQ9, RQ13, RQ14
MOV.Q (SP, 928), RQ25
FMUL RQ25, RQ14, RQ13
LEA.Q (SP, 240), RQ26
MOV.Q (RQ26, 24), RQ14
FADD RQ8, RQ14, RQ11
// Dividend goes RQ13 -> RQ30 -> RQ4, divisor RQ11 -> RQ5.
MOV RQ13, RQ30
MOV RQ11, RQ5
MOV RQ30, RQ4
BSR __fpu_fdiv
MOV RQ2, RQ30
BRA .L00800882
.L0080087C:
// Second range test. 17408 = 0x4400, which is the IEEE binary16 pattern
// for 4.0; the My 66000 listing earlier in this post tests against 4 at
// the same point.
// NOTE(review): that the immediate is interpreted as binary16 is an
// assumption -- confirm.
FCMP/GT 17408, RQ31
BT .L00800883
// polpak.c:15952 {
LEA.Q (SP, 288), RQ24
MOV.Q (RQ24, 64), RQ13
FMUL RQ13, RQ31, RQ9
// polpak.c:15954 xden = xabs;
MOV RQ31, RQ8
// polpak.c:15955 for ( i = 0; i < 7; i++ )
MOV 0, RD12
.L00800884:
CMPGE.L 7, RD12
BT .L00800886
// polpak.c:15956 {
LEA.Q (SP, 288), RQ24
MOV.Q (RQ24, RD12), RQ14
FADD RQ9, RQ14, RQ11
FMUL RQ11, RQ31, RQ9
// polpak.c:15958 xden = ( xden + d[i] ) * xabs;
LEA.Q (SP, 368), RQ25
MOV.Q (RQ25, RD12), RQ13
FADD RQ8, RQ13, RQ14
FMUL RQ14, RQ31, RQ8
ADDS.L RD12, 1, RD12
BRA .L00800884
.L00800886:
// polpak.c:15961 erfx = ( xnum + c[7] ) / ( xden + d[7] );
LEA.Q (SP, 288), RQ24
MOV.Q (RQ24, 56), RQ11
FADD RQ9, RQ11, RQ13
LEA.Q (SP, 368), RQ25
MOV.Q (RQ25, 56), RQ14
FADD RQ8, RQ14, RQ11
MOV RQ13, RQ30
MOV RQ11, RQ5
MOV RQ30, RQ4
BSR __fpu_fdiv
MOV RQ2, RQ30
// polpak.c:15962 xsq = ( double ) ( ( int ) ( ( xabs * 16.0 ) / 16.0 ) );
// The division by 16.0 is emitted as a multiply by 0x3FB0000000000000
// (0.0625).
// NOTE(review): the immediate 28 presumably encodes 16.0 in a compact
// FP-immediate form, and FSTCI/FLDCI presumably convert to and from
// integer -- confirm.
FMUL RQ31, 28, RQ13
FMUL RQ13, 0x3FB0000000000000, RQ14
FSTCI RQ14, RQ26
FLDCI RQ26, RQ10
// polpak.c:15963 del = ( xabs - xsq ) * ( xabs + xsq );
// del is also stored to (SP, 152); that slot is not read back in this
// listing.
FSUB RQ31, RQ10, RQ13
FADD RQ31, RQ10, RQ14
FMUL RQ13, RQ14, RQ27
MOV.Q RQ27, (SP, 152)
// polpak.c:15964 erfx = exp ( - xsq * xsq ) * exp ( - del ) * erfx;
FNEG RQ10, RQ11
FMUL RQ11, RQ10, RQ13
MOV RQ13, RQ4
BSR exp
MOV RQ2, RQ14
FNEG RQ27, RQ11
MOV RQ11, RQ4
BSR exp
MOV RQ2, RQ13
FMUL RQ14, RQ13, RQ11
FMUL RQ11, RQ30, RQ13
MOV RQ13, RQ30
// polpak.c:15966 erfx = ( 0.5 - erfx ) + 0.5;
// NOTE(review): the immediate 8 in the FADD presumably encodes 0.5 --
// confirm.
MOV 0x3FE0000000000000, RQ28
FSUB RQ28, RQ30, RQ14
FADD RQ14, 8, RQ30
// Reload x and skip the negation below when the FCMPGE test sets T.
MOV.Q (SP, 928), RQ29
FCMPGE 0, RQ29
BT .L00800887
// polpak.c:15969 {
FNEG RQ30, RQ30
.L00800887:
BRA .L00800888
.L00800883:
// Third range test: (SP, 112) holds xbig. The taken branch goes to the
// block that starts by computing 1.0 / (xabs * xabs).
MOV.Q (SP, 112), RQ24
FCMP/GT RQ31, RQ24
BT .L00800889
// NOTE(review): read the same way as the thresh test near the top (T when
// the second operand is greater than the first), this BT is taken when
// x > 0 and lands on the 0xBFF0000000000000 (-1.0) load, while the
// fall-through loads 0x3FF0000000000000 (+1.0). Confirm the polarity
// against polpak.c.
MOV.Q (SP, 928), RQ24
FCMP/GT 0, RQ24
BT .L0080088A
// polpak.c:15981 {
MOV 0x3FF0000000000000, RQ30
BRA .L0080088B
.L0080088A:
// polpak.c:15985 {
MOV 0xBFF0000000000000, RQ30
.L0080088B:
BRA .L0080088C
.L00800889:
// polpak.c:15990 {
// xsq = 1.0 / (xabs * xabs), via __fpu_fdiv.
FMUL RQ31, RQ31, RQ11
MOV 0x3FF0000000000000, RQ10
MOV RQ11, RQ5
MOV RQ10, RQ4
BSR __fpu_fdiv
MOV RQ2, RQ10
// polpak.c:15993 xnum = p[5] * xsq;
LEA.Q (SP, 448), RQ24
MOV.Q (RQ24, 40), RQ13
FMUL RQ13, RQ10, RQ9
// polpak.c:15994 xden = xsq;
MOV RQ10, RQ8
// polpak.c:15995 for ( i = 0; i < 4; i++ )
MOV 0, RD12
.L0080088D:
CMPGE.L 4, RD12
BT .L0080088F
// polpak.c:15996 {
LEA.Q (SP, 448), RQ24
MOV.Q (RQ24, RD12), RQ14
FADD RQ9, RQ14, RQ11
FMUL RQ11, RQ10, RQ9
// polpak.c:15998 xden = ( xden + q[i] ) * xsq;
LEA.Q (SP, 512), RQ25
MOV.Q (RQ25, RD12), RQ13
FADD RQ8, RQ13, RQ14
FMUL RQ14, RQ10, RQ8
ADDS.L RD12, 1, RD12
BRA .L0080088D
.L0080088F:
// polpak.c:16001 erfx = xsq * ( xnum + p[4] ) / ( xden + q[4] );
LEA.Q (SP, 448), RQ24
MOV.Q (RQ24, 32), RQ11
FADD RQ9, RQ11, RQ13
FMUL RQ10, RQ13, RQ14
LEA.Q (SP, 512), RQ25
MOV.Q (RQ25, 32), RQ11
FADD RQ8, RQ11, RQ13
MOV RQ14, RQ30
MOV RQ13, RQ5
MOV RQ30, RQ4
BSR __fpu_fdiv
MOV RQ2, RQ30
// polpak.c:16002 erfx = ( sqrpi - erfx ) / xabs;
MOV.Q (SP, 128), RQ26
FSUB RQ26, RQ30, RQ14
MOV RQ14, RQ30
MOV RQ31, RQ5
MOV RQ30, RQ4
BSR __fpu_fdiv
MOV RQ2, RQ30
// polpak.c:16003 xsq = ( double ) ( ( int ) ( ( xabs * 16.0 ) / 16.0 ) );
// Same instruction sequence as the polpak.c:15962..15969 code above, with
// different temporaries.
FMUL RQ31, 28, RQ11
FMUL RQ11, 0x3FB0000000000000, RQ13
FSTCI RQ13, RQ27
FLDCI RQ27, RQ10
// polpak.c:16004 del = ( xabs - xsq ) * ( xabs + xsq );
FSUB RQ31, RQ10, RQ13
FADD RQ31, RQ10, RQ14
FMUL RQ13, RQ14, RQ28
MOV.Q RQ28, (SP, 152)
// polpak.c:16005 erfx = exp ( - xsq * xsq ) * exp ( - del ) * erfx;
FNEG RQ10, RQ11
FMUL RQ11, RQ10, RQ13
MOV RQ13, RQ4
BSR exp
MOV RQ2, RQ14
FNEG RQ28, RQ11
MOV RQ11, RQ4
BSR exp
MOV RQ2, RQ13
FMUL RQ14, RQ13, RQ11
FMUL RQ11, RQ30, RQ13
MOV RQ13, RQ30
// polpak.c:16007 erfx = ( 0.5 - erfx ) + 0.5;
MOV 0x3FE0000000000000, RQ29
FSUB RQ29, RQ30, RQ14
FADD RQ14, 8, RQ30
MOV.Q (SP, 928), RQ24
FCMPGE 0, RQ24
BT .L00800890
// polpak.c:16009 {
FNEG RQ30, RQ30
// All four paths join here with erfx in RQ30.
.L00800890:
.L0080088C:
.L00800888:
.L00800882:
// polpak.c:16017 return erfx;
MOV RQ30, RQ2
.L00C00ABD:
// Release the 800-byte frame and leave through the shared epilog helper.
ADD 800, SP
BRA __epilog_0005_00000200FFFF
===>
Looks like in this case, whatever compiler you had used, had optimized away the arrays...
BGBCC had instead created local stack arrays and then initialized them, mostly because they were not marked as "static" or "const".
Note that BGBCC isn't smart enough to propagate constants from arrays, ...
But, yeah, even with a lot of this, still often beating RISC-V in terms of performance...