Liste des Groupes | Revenir à c arch |
BGB wrote:
Some stats I have (for GLQuake):
On 4/11/2024 8:40 PM, MitchAlsup1 wrote:
> BGB wrote:
>> On 4/11/2024 6:06 PM, MitchAlsup1 wrote:
>>
>
While I admit that <basically> anything bigger than 50-bits will be fine
as displacements, they are not fine for constants and especially FP
constants and many bit twiddling constants.
> The number of cases where this comes up is not statistically significant enough to have a meaningful impact on performance.
>
> Fraction of a percent edge-cases are not deal-breakers, as I see it.
>
Idle speculation::
>
;-----------------------------------------------------------------------
; r8_erf
;
; Purpose: presumably an error-function evaluation (going by the symbol
;          name and the a[]/b[]/c[]/d[]/p[]/q[] coefficient comments).
; Syntax:  the target ISA is not stated in this listing.
;          NOTE(review): mnemonics (std/ldd, fmac, vec/loop, pflt, mux)
;          look like My 66000 -- confirm.
; In:      r1 = argument x (read before it is ever written).
; Out:     r1 = result (each exit path ends with `mov r1,r2`).
; Stack:   128 bytes reserved on entry, released before every `ret`.
; Regs:    r2..r11 appear as operands; nothing is saved or restored.
; Data:    .L__const.r8_erf.c and .L__const.r8_erf.d are referenced
;          here but defined outside this listing.
;
; Paths (selected by comparing r2 = fabs(x) against thresh, 4.0, xbig):
;   .LBB141_6   a[]/b[] coefficients, returns directly
;   .LBB141_7   c[]/d[] coefficients, joins the common tail
;   %bb.3       p[]/q[] coefficients, branches to the common tail
;   .LBB141_11  returns (double)+1 or (double)-1 chosen from x vs 0
; NOTE(review): the exact polarity of bnlt/bngt is not derivable from
; this listing alone -- confirm against the ISA manual.
;
; Local labels carry the `.L` prefix so that they match the names used
; by the branch instructions and by the `.size` expression.
;-----------------------------------------------------------------------
        .globl  r8_erf                          ; -- Begin function r8_erf
        .type   r8_erf,@function
r8_erf:                                         ; @r8_erf
; %bb.0:
        ; Reserve the frame, then store coefficient bit patterns (given
        ; as decimal integers) into it: q[] from sp+0, p[] from sp+40,
        ; a[] from sp+88, 8 bytes per element.  The index-1 slots
        ; (sp+8, sp+48, sp+96) are not written here.
        add     sp,sp,#-128
        std     #4614300636657501161,[sp,88]    // a[0]
        std     #4645348406721991307,[sp,104]   // a[2]
        std     #4659275911028085274,[sp,112]   // a[3]
        std     #4595861367557309218,[sp,120]   // a[4]
        std     #4599171895595656694,[sp,40]    // p[0]
        std     #4593699784569291823,[sp,56]    // p[2]
        std     #4580293056851789237,[sp,64]    // p[3]
        std     #4559215111867327292,[sp,72]    // p[4]
        std     #4580359811580069319,[sp,80]    // p[5]
        std     #4612966212090462427,[sp]       // q[0]
        std     #4602930165995154489,[sp,16]    // q[2]
        std     #4588882433176075751,[sp,24]    // q[3]
        std     #4567531038595922641,[sp,32]    // q[4]
        fabs    r2,r1                           ; r2 = |x|, kept for all range tests
        fcmp    r3,r2,#0x3EF00000               // thresh
        bnlt    r3,.LBB141_6
; %bb.1:
        fcmp    r3,r2,#4                        // xabs <= 4.0
        bnlt    r3,.LBB141_7
; %bb.2:
        fcmp    r3,r2,#0x403A8B020C49BA5E       // xbig
        bngt    r3,.LBB141_11
; %bb.3:
        ; p[]/q[] path: r3 = 1 / (x*x) is the polynomial variable.
        fmul    r3,r1,r1
        fdiv    r3,#1,r3
        mov     r4,#0x3F90B4FB18B485C7          // p[5]
        fmac    r4,r3,r4,#0x3FD38A78B9F065F6    // p[0]
        fadd    r5,r3,#0x40048C54508800DB       // q[0]
        fmac    r6,r3,r4,#0x3FD70FE40E2425B8    // p[1]
        fmac    r4,r3,r5,#0x3FFDF79D6855F0AD    // q[1]
        fmul    r4,r3,r4
        fmul    r6,r3,r6
        mov     r5,#2                           ; loop index starts at element 2
        add     r7,sp,#40                       // p[*]
        add     r8,sp,#0                        // q[*]
.LBB141_4:                                      ; %._crit_edge11
                                                ; =>This Inner Loop Header: Depth=1
        ; vec ... loop bracket the loop body; r5 indexes the stacked
        ; p[]/q[] tables, scaled by 8.
        ; NOTE(review): presumably `loop ne,r5,#4,#1` steps r5 by 1
        ; until it reaches 4 -- confirm.
        vec     r9,{r4,r6}
        ldd     r10,[r7,r5<<3,0]                // p[*]
        ldd     r11,[r8,r5<<3,0]                // q[*]
        fadd    r6,r6,r10
        fadd    r4,r4,r11
        fmul    r4,r3,r4
        fmul    r6,r3,r6
        loop    ne,r5,#4,#1
; %bb.5:
        fadd    r5,r6,#0x3F4595FD0D71E33C       // p[4]
        fmul    r3,r3,r5
        fadd    r4,r4,#0x3F632147A014BAD1       // q[4]
        fdiv    r3,r3,r4
        fadd    r3,#0x3FE20DD750429B6D,-r3      // c[0]
        fdiv    r3,r3,r2
        br      .LBB141_10                      // common tail
.LBB141_6:                                      ; %._crit_edge
        ; a[]/b[] path.  r3 = x*x; the fcmp/sra/mux sequence picks
        ; between r3 and (double)0 based on the xsmall comparison.
        ; NOTE(review): which mux input wins for which outcome is not
        ; derivable from this listing -- confirm.
        fmul    r3,r1,r1
        fcmp    r2,r2,#0x3C9FFE5AB7E8AD5E       // xsmall
        sra     r2,r2,<1:13>
        cvtsd   r4,#0
        mux     r2,r2,r3,r4
        mov     r3,#0x3FC7C7905A31C322          // a[4]
        fmac    r3,r2,r3,#0x400949FB3ED443E9    // a[0]
        fmac    r3,r2,r3,#0x405C774E4D365DA3    // a[1]
        ldd     r4,[sp,104]                     // a[2]
        fmac    r3,r2,r3,r4
        fadd    r4,r2,#0x403799EE342FB2DE       // b[0]
        fmac    r4,r2,r4,#0x406E80C9D57E55B8    // b[1]
        fmac    r4,r2,r4,#0x40940A77529CADC8    // b[2]
        fmac    r3,r2,r3,#0x40A912C1535D121A    // a[3]
        fmul    r1,r3,r1                        ; numerator scaled by x
        fmac    r2,r2,r4,#0x40A63879423B87AD    // b[3]
        fdiv    r2,r1,r2
        mov     r1,r2                           ; result -> r1
        add     sp,sp,#128                      ; release frame
        ret                                     // 68
.LBB141_7:
        ; c[]/d[] path: r3 and r4 are the two accumulators, r5 the
        ; table index.
        fmul    r3,r2,#0x3E571E703C5F5815       // c[8]
        mov     r5,#0
        mov     r4,r2
.LBB141_8:                                      ; =>This Inner Loop Header: Depth=1
        ; Coefficients come from the external .L__const.r8_erf.c/.d
        ; tables, addressed ip-relative and indexed by r5 scaled by 8.
        vec     r6,{r3,r4}
        ldd     r7,[ip,r5<<3,.L__const.r8_erf.c] // c[*]
        fadd    r3,r3,r7
        fmul    r3,r2,r3
        ldd     r7,[ip,r5<<3,.L__const.r8_erf.d] // d[*]
        fadd    r4,r4,r7
        fmul    r4,r2,r4
        loop    ne,r5,#7,#1
; %bb.9:
        fadd    r3,r3,#0x4093395B7FD2FC8E       // c[7]
        fadd    r4,r4,#0x4093395B7FD35F61       // d[7]
        fdiv    r3,r3,r4
.LBB141_10:                                     // common tail
        ; Shared by the c[]/d[] and p[]/q[] paths: r2 = |x|, r3 = the
        ; rational-function value computed by the path taken.
        ; NOTE(review): both scalings (16.0, then 1/16.0) happen before
        ; the double->signed->double round trip -- confirm that this
        ; ordering is intended.
        fmul    r4,r2,#0x41800000               // 16.0
        fmul    r4,r4,#0x3D800000               // 1/16.0
        cvtds   r4,r4                           // (signed)double
        cvtsd   r4,r4                           // (double)signed
        fadd    r5,r2,-r4
        fadd    r2,r2,r4
        fmul    r4,r4,-r4
        fexp    r4,r4                           // exp()
        fmul    r2,r2,-r5
        fexp    r2,r2                           // exp()
        fmul    r2,r4,r2
        fadd    r2,#0,-r2
        fmac    r2,r2,r3,#0x3F000000            // 0.5
        fadd    r2,r2,#0x3F000000               // 0.5
        ; NOTE(review): presumably pflt predicates the following negate
        ; on x (r1) being negative -- confirm.
        pflt    r1,0,T
        fadd    r2,#0,-r2
        mov     r1,r2                           ; result -> r1
        add     sp,sp,#128                      ; release frame
        ret
.LBB141_11:
        ; Out-of-range path: select between (double)+1 and (double)-1
        ; from the comparison of x with 0.
        fcmp    r1,r1,#0
        sra     r1,r1,<1:13>
        cvtsd   r2,#-1                          // (double)-1
        cvtsd   r3,#1                           // (double)+1
        mux     r2,r1,r3,r2
        mov     r1,r2                           ; result -> r1
        add     sp,sp,#128                      ; release frame
        ret
.Lfunc_end141:
        .size   r8_erf, .Lfunc_end141-r8_erf
; -- End function

These patterns seem rather unusual...
Don't really know the ABI.

Patterns don't really fit observations for typical compiler output though (mostly in the FP constants, and particular ones that fall outside the scope of what can be exactly represented as Binary16 or similar, are rare).

You are N E V E R going to find the coefficients of a Chebyshev
polynomial to fit in a small FP container; excepting the very
occasional C0 or C1 term {which are mostly 1.0 and 0.0}.
Les messages affichés proviennent d'usenet.