Liste des Groupes | Revenir à c arch |
On 4/11/2024 8:40 PM, MitchAlsup1 wrote:BGB wrote:
On 4/11/2024 6:06 PM, MitchAlsup1 wrote:>
>
While I admit that <basically> anything bigger than 50-bits will be fine
as displacements, they are not fine for constants and especially FP
constants and many bit twiddling constants.
>The number of cases where this comes up is not statistically significant enough to have a meaningful impact on performance.
Fraction of a percent edge-cases are not deal-breakers, as I see it.
Idle speculation::
; ---------------------------------------------------------------------------
; r8_erf
;
; Listing as posted; the target ISA and ABI are not stated in this file.
; NOTE(review): r1 appears to carry both the argument and the result (it is
;   read by fabs on entry and written from r2 before every ret) -- confirm
;   against the ABI.
;
; Stack frame: 128 bytes, released on each of the three return paths.
;   [sp,0] [sp,16] [sp,24] [sp,32]              q[0], q[2], q[3], q[4]
;   [sp,40] [sp,56] [sp,64] [sp,72] [sp,80]     p[0], p[2], p[3], p[4], p[5]
;   [sp,88] [sp,104] [sp,112] [sp,120]          a[0], a[2], a[3], a[4]
;   Slots [sp,8], [sp,48] and [sp,96] are not written in this listing.
;
; Control flow: r2 = fabs(r1) is compared against thresh, 4.0 and xbig; the
; three conditional branches select .LBB141_6, .LBB141_7 or .LBB141_11, and
; the fall-through (%bb.3) ends by branching to the tail at .LBB141_10, which
; is also reached by falling out of the .LBB141_8 loop.
;
; Local labels carry the ".L" prefix so that their definitions match the
; branch targets (.LBB141_6, .LBB141_7, .LBB141_10, .LBB141_11) and the
; .size expression (.Lfunc_end141).
; ---------------------------------------------------------------------------
        .globl  r8_erf                          ; -- Begin function r8_erf
        .type   r8_erf,@function
r8_erf:                                         ; @r8_erf
; %bb.0:
        add     sp,sp,#-128                     ; allocate the 128-byte frame

        ; Spill table constants to the frame (decimal images of 64-bit values).
        std     #4614300636657501161,[sp,88]    // a[0]
        std     #4645348406721991307,[sp,104]   // a[2]
        std     #4659275911028085274,[sp,112]   // a[3]
        std     #4595861367557309218,[sp,120]   // a[4]
        std     #4599171895595656694,[sp,40]    // p[0]
        std     #4593699784569291823,[sp,56]    // p[2]
        std     #4580293056851789237,[sp,64]    // p[3]
        std     #4559215111867327292,[sp,72]    // p[4]
        std     #4580359811580069319,[sp,80]    // p[5]
        std     #4612966212090462427,[sp]       // q[0]
        std     #4602930165995154489,[sp,16]    // q[2]
        std     #4588882433176075751,[sp,24]    // q[3]
        std     #4567531038595922641,[sp,32]    // q[4]

        ; Range dispatch on r2 = fabs(r1).
        fabs    r2,r1
        fcmp    r3,r2,#0x3EF00000               // thresh
        bnlt    r3,.LBB141_6
; %bb.1:
        fcmp    r3,r2,#4                        // xabs <= 4.0
        bnlt    r3,.LBB141_7
; %bb.2:
        fcmp    r3,r2,#0x403A8B020C49BA5E       // xbig
        bngt    r3,.LBB141_11
; %bb.3:
        ; r3 = 1 / (r1 * r1); the p[]/q[] terms below are accumulated in r3.
        fmul    r3,r1,r1
        fdiv    r3,#1,r3
        mov     r4,#0x3F90B4FB18B485C7          // p[5]
        fmac    r4,r3,r4,#0x3FD38A78B9F065F6    // p[0]
        fadd    r5,r3,#0x40048C54508800DB       // q[0]
        fmac    r6,r3,r4,#0x3FD70FE40E2425B8    // p[1]
        fmac    r4,r3,r5,#0x3FFDF79D6855F0AD    // q[1]
        fmul    r4,r3,r4
        fmul    r6,r3,r6
        mov     r5,#2                           ; loop index starts at element 2
        add     r7,sp,#40                       // p[*]
        add     r8,sp,#0                        // q[*]
.LBB141_4:                                      ; %._crit_edge11
                                                ; =>This Inner Loop Header: Depth=1
        vec     r9,{r4,r6}
        ldd     r10,[r7,r5<<3,0]                // p[*]
        ldd     r11,[r8,r5<<3,0]                // q[*]
        fadd    r6,r6,r10
        fadd    r4,r4,r11
        fmul    r4,r3,r4
        fmul    r6,r3,r6
        loop    ne,r5,#4,#1
; %bb.5:
        fadd    r5,r6,#0x3F4595FD0D71E33C       // p[4]
        fmul    r3,r3,r5
        fadd    r4,r4,#0x3F632147A014BAD1       // q[4]
        fdiv    r3,r3,r4
        fadd    r3,#0x3FE20DD750429B6D,-r3      // c[0]
        fdiv    r3,r3,r2
        br      .LBB141_10                      // common tail

.LBB141_6:                                      ; %._crit_edge
        ; Path using the a[]/b[] constants; returns directly.
        fmul    r3,r1,r1
        fcmp    r2,r2,#0x3C9FFE5AB7E8AD5E       // xsmall
        sra     r2,r2,<1:13>
        cvtsd   r4,#0
        mux     r2,r2,r3,r4
        mov     r3,#0x3FC7C7905A31C322          // a[4]
        fmac    r3,r2,r3,#0x400949FB3ED443E9    // a[0]
        fmac    r3,r2,r3,#0x405C774E4D365DA3    // a[1]
        ldd     r4,[sp,104]                     // a[2]
        fmac    r3,r2,r3,r4
        fadd    r4,r2,#0x403799EE342FB2DE       // b[0]
        fmac    r4,r2,r4,#0x406E80C9D57E55B8    // b[1]
        fmac    r4,r2,r4,#0x40940A77529CADC8    // b[2]
        fmac    r3,r2,r3,#0x40A912C1535D121A    // a[3]
        fmul    r1,r3,r1
        fmac    r2,r2,r4,#0x40A63879423B87AD    // b[3]
        fdiv    r2,r1,r2
        mov     r1,r2
        add     sp,sp,#128                      ; release the frame
        ret                                     // 68

.LBB141_7:
        ; Path using the c[]/d[] tables in .L__const.r8_erf.*; falls into the tail.
        fmul    r3,r2,#0x3E571E703C5F5815       // c[8]
        mov     r5,#0
        mov     r4,r2
.LBB141_8:                                      ; =>This Inner Loop Header: Depth=1
        vec     r6,{r3,r4}
        ldd     r7,[ip,r5<<3,.L__const.r8_erf.c] // c[*]
        fadd    r3,r3,r7
        fmul    r3,r2,r3
        ldd     r7,[ip,r5<<3,.L__const.r8_erf.d] // d[*]
        fadd    r4,r4,r7
        fmul    r4,r2,r4
        loop    ne,r5,#7,#1
; %bb.9:
        fadd    r3,r3,#0x4093395B7FD2FC8E       // c[7]
        fadd    r4,r4,#0x4093395B7FD35F61       // d[7]
        fdiv    r3,r3,r4

.LBB141_10:                                     // common tail
        ; Entered from %bb.5 (br) and from %bb.9 (fall-through); r3 holds the
        ; quotient computed by whichever path arrived here.
        fmul    r4,r2,#0x41800000               // 16.0
        fmul    r4,r4,#0x3D800000               // 1/16.0
        cvtds   r4,r4                           // (signed)double
        cvtsd   r4,r4                           // (double)signed
        fadd    r5,r2,-r4
        fadd    r2,r2,r4
        fmul    r4,r4,-r4
        fexp    r4,r4                           // exp()
        fmul    r2,r2,-r5
        fexp    r2,r2                           // exp()
        fmul    r2,r4,r2
        fadd    r2,#0,-r2
        fmac    r2,r2,r3,#0x3F000000            // 0.5
        fadd    r2,r2,#0x3F000000               // 0.5
        ; NOTE(review): pflt presumably predicates the next instruction on
        ; r1 < 0, i.e. negates the result for negative input -- confirm.
        pflt    r1,0,T
        fadd    r2,#0,-r2
        mov     r1,r2
        add     sp,sp,#128                      ; release the frame
        ret

.LBB141_11:
        ; Selects between the constants +1 and -1 based on a compare of r1 with 0.
        fcmp    r1,r1,#0
        sra     r1,r1,<1:13>
        cvtsd   r2,#-1                          // (double)-1
        cvtsd   r3,#1                           // (double)+1
        mux     r2,r1,r3,r2
        mov     r1,r2
        add     sp,sp,#128                      ; release the frame
        ret
.Lfunc_end141:
        .size   r8_erf, .Lfunc_end141-r8_erf
                                                ; -- End function
These patterns seem rather unusual...
Don't really know the ABI.
The patterns don't really fit my observations of typical compiler output, though (mostly regarding the FP constants: ones that fall outside the scope of what can be exactly represented as Binary16 or similar are rare).
> .globl r8_erf ; -- Begin function r8_erf
But 2 instructions instead of 1 and 16 bytes instead of 12.
> .type r8_erf,@function
> r8_erf: ; @r8_erf
> ; %bb.0:
> add sp,sp,#-128
ADD -128, SP
> std #4614300636657501161,[sp,88] // a[0]
MOV 0x400949FB3ED443E9, R3
MOV.Q R3, (SP, 88)
> std #4645348406721991307,[sp,104] // a[2]
MOV 0x407797C38897528B, R3
MOV.Q R3, (SP, 104)
> std #4659275911028085274,[sp,112] // a[3]
> std #4595861367557309218,[sp,120] // a[4]
> std #4599171895595656694,[sp,40] // p[0]
> std #4593699784569291823,[sp,56] // p[2]
> std #4580293056851789237,[sp,64] // p[3]
> std #4559215111867327292,[sp,72] // p[4]
> std #4580359811580069319,[sp,80] // p[4]
> std #4612966212090462427,[sp] // q[0]
> std #4602930165995154489,[sp,16] // q[2]
> std #4588882433176075751,[sp,24] // q[3]
> std #4567531038595922641,[sp,32] // q[4]
.... pattern is obvious enough.
Each constant needs 12 bytes, so 16 bytes/store.
> fabs r2,r1
> fcmp r3,r2,#0x3EF00000 // thresh
> bnlt r3,.LBB141_6
FABS R5, R6
FLDH 0x3780, R3 //A
FCMPGT R3, R6 //A
BT .LBB141_6 //A
Or (FP-IMM extension):
FABS R5, R6
FCMPGE 0x0DE, R6 //B (FP-IMM)
BF .LBB141_6 //B
> ; %bb.1:
> fcmp r3,r2,#4 // xabs <= 4.0
> bnlt r3,.LBB141_7
FCMPGE 0x110, R6
BF .LBB141_7
> ; %bb.2:
> fcmp r3,r2,#0x403A8B020C49BA5E // xbig
> bngt r3,.LBB141_11
MOV 0x403A8B020C49BA5E, R3
FCMPGT R3, R6
BT .LBB141_11
Where FP-IMM won't work with that value.
Value came from source code.
> ; %bb.3:
It is a reciprocal: R3 = #1.0/R3
> fmul r3,r1,r1
FMUL R5, R5, R7
> fdiv r3,#1,r3
Skip, operation gives identity?...
> mov r4,#0x3F90B4FB18B485C7 // p[5]
Similar.
> fmac r4,r3,r4,#0x3FD38A78B9F065F6 // p[0]
> fadd r5,r3,#0x40048C54508800DB // q[0]
> fmac r6,r3,r4,#0x3FD70FE40E2425B8 // p[1]
> fmac r4,r3,r5,#0x3FFDF79D6855F0AD // q[1]
Turns into 4 constants, 7 FPU instructions (if no FMAC extension, 4 with FMAC). Though, at present, FMAC is slower than separate FMUL+FADD.
So, between 8 and 11 instructions.
Instead of 4.....
> fmul r4,r3,r4
> fmul r6,r3,r6
> mov r5,#2
> add r7,sp,#40 // p[*]
> add r8,sp,#0 // q[*]
These can map 1:1.
> LBB141_4: ; %._crit_edge11
> ; =>This Inner Loop Header: Depth=1
> vec r9,{r4,r6}
> ldd r10,[r7,r5<<3,0] // p[*]
> ldd r11,[r8,r5<<3,0] // q[*]
> fadd r6,r6,r10
> fadd r4,r4,r11
> fmul r4,r3,r4
> fmul r6,r3,r6
> loop ne,r5,#4,#1
Could be mapped to a scalar loop, pretty close to 1:1.
I have 7 instructions in the loop, you would have 9.
Could possibly also be mapped over to 2x Binary64 SIMD ops, I am guessing 2 copies for a 4-element vector?...
> ; %bb.5:
> fadd r5,r6,#0x3F4595FD0D71E33C // p[4]
> fmul r3,r3,r5
> fadd r4,r4,#0x3F632147A014BAD1 // q[4]
> fdiv r3,r3,r4
> fadd r3,#0x3FE20DD750429B6D,-r3 // c[0]
> fdiv r3,r3,r2
> br .LBB141_10 // common tail
Same patterns as before.
Would need ~ 10 ops.
Well, could be expressed with fewer ops via jumbo-prefixed FP-IMM ops, but this would only give "Binary32 truncated to 29 bits" precision for the immediate values.
Theoretically, could allow an FE-FE-F0 encoding for FP-IMM, which could give ~ 53 bits of precision. But, if one needs full Binary64, this will not gain much in this case.
> LBB141_6: ; %._crit_edge
> fmul r3,r1,r1
> fcmp r2,r2,#0x3C9FFE5AB7E8AD5E // xsmall
> sra r2,r2,<1:13>
> cvtsd r4,#0
> mux r2,r2,r3,r4
> mov r3,#0x3FC7C7905A31C322 // a[4]
> fmac r3,r2,r3,#0x400949FB3ED443E9 // a[0]
> fmac r3,r2,r3,#0x405C774E4D365DA3 // a[1]
> ldd r4,[sp,104] // a[2]
> fmac r3,r2,r3,r4
> fadd r4,r2,#0x403799EE342FB2DE // b[0]
> fmac r4,r2,r4,#0x406E80C9D57E55B8 // b[1]
> fmac r4,r2,r4,#0x40940A77529CADC8 // b[2]
> fmac r3,r2,r3,#0x40A912C1535D121A // a[3]
> fmul r1,r3,r1
> fmac r2,r2,r4,#0x40A63879423B87AD // b[3]
> fdiv r2,r1,r2
> mov r1,r2
> add sp,sp,#128
> ret // 68
> LBB141_7:
> fmul r3,r2,#0x3E571E703C5F5815 // c[8]
> mov r5,#0
> mov r4,r2
> LBB141_8: ; =>This Inner Loop Header: Depth=1
> vec r6,{r3,r4}
> ldd r7,[ip,r5<<3,.L__const.r8_erf.c]// c[*]
> fadd r3,r3,r7
> fmul r3,r2,r3
> ldd r7,[ip,r5<<3,.L__const.r8_erf.d]// d[*]
> fadd r4,r4,r7
> fmul r4,r2,r4
> loop ne,r5,#7,#1
> ; %bb.9:
> fadd r3,r3,#0x4093395B7FD2FC8E // c[7]
> fadd r4,r4,#0x4093395B7FD35F61 // d[7]
> fdiv r3,r3,r4
> LBB141_10: // common tail
> fmul r4,r2,#0x41800000 // 16.0
> fmul r4,r4,#0x3D800000 // 1/16.0
> cvtds r4,r4 // (signed)double
> cvtsd r4,r4 // (double)signed
> fadd r5,r2,-r4
> fadd r2,r2,r4
> fmul r4,r4,-r4
> fexp r4,r4 // exp()
> fmul r2,r2,-r5
> fexp r2,r2 // exp()
> fmul r2,r4,r2
> fadd r2,#0,-r2
> fmac r2,r2,r3,#0x3F000000 // 0.5
> fadd r2,r2,#0x3F000000 // 0.5
> pflt r1,0,T
> fadd r2,#0,-r2
> mov r1,r2
> add sp,sp,#128
> ret
> LBB141_11:
> fcmp r1,r1,#0
> sra r1,r1,<1:13>
> cvtsd r2,#-1 // (double)-1
> cvtsd r3,#1 // (double)+1
> mux r2,r1,r3,r2
> mov r1,r2
> add sp,sp,#128
> ret
> Lfunc_end141:
> .size r8_erf, .Lfunc_end141-r8_erf
> ; -- End function
Don't really have time at the moment to comment on the rest of this...
In other news, found a bug in the function dependency-walking code.
Fixing this bug got things a little closer to break-even with RV64G GCC output regarding ".text" size (though it was still not sufficient to entirely close the gap).
This was mostly based on noting that the compiler output had included some things that were not reachable from within the program being compiled (namely, noticing that the Doom build had included a copy of the MS-CRAM video decoder and similar, which was not reachable from anywhere within Doom).
Some more analysis may be needed.
....
Les messages affichés proviennent d'usenet.