Re: "Mini" tags to reduce the number of op codes

Liste des GroupesRevenir à c arch 
Sujet : Re: "Mini" tags to reduce the number of op codes
De : cr88192 (at) *nospam* gmail.com (BGB)
Groupes : comp.arch
Date : 12. Apr 2024, 19:12:28
Autres entêtes
Organisation : A noiseless patient Spider
Message-ID : <uvbtif$2gat0$1@dont-email.me>
References : 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
User-Agent : Mozilla Thunderbird
On 4/11/2024 8:40 PM, MitchAlsup1 wrote:
BGB wrote:
 
On 4/11/2024 6:06 PM, MitchAlsup1 wrote:
>
>
While I admit that <basically> anything bigger than 50-bits will be fine
as displacements, they are not fine for constants and especially FP
constants and many bit twiddling constants.
>
 
The number of cases where this comes up is not statistically significant enough to have a meaningful impact on performance.
 
Fraction of a percent edge-cases are not deal-breakers, as I see it.
 Idle speculation::
      .globl    r8_erf                          ; -- Begin function r8_erf
     .type    r8_erf,@function
r8_erf:                                 ; @r8_erf
; %bb.0:
     add    sp,sp,#-128
     std    #4614300636657501161,[sp,88]    // a[0]
     std    #4645348406721991307,[sp,104]    // a[2]
     std    #4659275911028085274,[sp,112]    // a[3]
     std    #4595861367557309218,[sp,120]    // a[4]
     std    #4599171895595656694,[sp,40]    // p[0]
     std    #4593699784569291823,[sp,56]    // p[2]
     std    #4580293056851789237,[sp,64]    // p[3]
     std    #4559215111867327292,[sp,72]    // p[4]
     std    #4580359811580069319,[sp,80]    // p[4]
     std    #4612966212090462427,[sp]    // q[0]
     std    #4602930165995154489,[sp,16]    // q[2]
     std    #4588882433176075751,[sp,24]    // q[3]
     std    #4567531038595922641,[sp,32]    // q[4]
     fabs    r2,r1
     fcmp    r3,r2,#0x3EF00000        // thresh
     bnlt    r3,.LBB141_6
; %bb.1:
     fcmp    r3,r2,#4            // xabs <= 4.0
     bnlt    r3,.LBB141_7
; %bb.2:
     fcmp    r3,r2,#0x403A8B020C49BA5E    // xbig
     bngt    r3,.LBB141_11
; %bb.3:
     fmul    r3,r1,r1
     fdiv    r3,#1,r3
     mov    r4,#0x3F90B4FB18B485C7        // p[5]
     fmac    r4,r3,r4,#0x3FD38A78B9F065F6    // p[0]
     fadd    r5,r3,#0x40048C54508800DB    // q[0]
     fmac    r6,r3,r4,#0x3FD70FE40E2425B8    // p[1]
     fmac    r4,r3,r5,#0x3FFDF79D6855F0AD    // q[1]
     fmul    r4,r3,r4
     fmul    r6,r3,r6
     mov    r5,#2
     add    r7,sp,#40            // p[*]
     add    r8,sp,#0            // q[*]
LBB141_4:                              ; %._crit_edge11
                                        ; =>This Inner Loop Header: Depth=1
     vec    r9,{r4,r6}
     ldd    r10,[r7,r5<<3,0]        // p[*]
     ldd    r11,[r8,r5<<3,0]        // q[*]
     fadd    r6,r6,r10
     fadd    r4,r4,r11
     fmul    r4,r3,r4
     fmul    r6,r3,r6
     loop    ne,r5,#4,#1
; %bb.5:
     fadd    r5,r6,#0x3F4595FD0D71E33C    // p[4]
     fmul    r3,r3,r5
     fadd    r4,r4,#0x3F632147A014BAD1    // q[4]
     fdiv    r3,r3,r4
     fadd    r3,#0x3FE20DD750429B6D,-r3    // c[0]
     fdiv    r3,r3,r2
     br    .LBB141_10            // common tail
LBB141_6:                              ; %._crit_edge
     fmul    r3,r1,r1
     fcmp    r2,r2,#0x3C9FFE5AB7E8AD5E    // xsmall
     sra    r2,r2,<1:13>
     cvtsd    r4,#0
     mux    r2,r2,r3,r4
     mov    r3,#0x3FC7C7905A31C322        // a[4]
     fmac    r3,r2,r3,#0x400949FB3ED443E9    // a[0]
     fmac    r3,r2,r3,#0x405C774E4D365DA3    // a[1]
     ldd    r4,[sp,104]            // a[2]
     fmac    r3,r2,r3,r4
     fadd    r4,r2,#0x403799EE342FB2DE    // b[0]
     fmac    r4,r2,r4,#0x406E80C9D57E55B8    // b[1]
     fmac    r4,r2,r4,#0x40940A77529CADC8    // b[2]
     fmac    r3,r2,r3,#0x40A912C1535D121A    // a[3]
     fmul    r1,r3,r1
     fmac    r2,r2,r4,#0x40A63879423B87AD    // b[3]
     fdiv    r2,r1,r2
     mov    r1,r2
     add    sp,sp,#128
     ret                // 68
LBB141_7:
     fmul    r3,r2,#0x3E571E703C5F5815    // c[8]
     mov    r5,#0
     mov    r4,r2
LBB141_8:                              ; =>This Inner Loop Header: Depth=1
     vec    r6,{r3,r4}
     ldd    r7,[ip,r5<<3,.L__const.r8_erf.c]// c[*]
     fadd    r3,r3,r7
     fmul    r3,r2,r3
     ldd    r7,[ip,r5<<3,.L__const.r8_erf.d]// d[*]
     fadd    r4,r4,r7
     fmul    r4,r2,r4
     loop    ne,r5,#7,#1
; %bb.9:
     fadd    r3,r3,#0x4093395B7FD2FC8E    // c[7]
     fadd    r4,r4,#0x4093395B7FD35F61    // d[7]
     fdiv    r3,r3,r4
LBB141_10:                // common tail
     fmul    r4,r2,#0x41800000        // 16.0
     fmul    r4,r4,#0x3D800000        // 1/16.0
     cvtds    r4,r4                // (signed)double
     cvtsd    r4,r4                // (double)signed
     fadd    r5,r2,-r4
     fadd    r2,r2,r4
     fmul    r4,r4,-r4
     fexp    r4,r4                // exp()
     fmul    r2,r2,-r5
     fexp    r2,r2                // exp()
     fmul    r2,r4,r2
     fadd    r2,#0,-r2
     fmac    r2,r2,r3,#0x3F000000        // 0.5
     fadd    r2,r2,#0x3F000000        // 0.5
     pflt    r1,0,T
     fadd    r2,#0,-r2
     mov    r1,r2
     add    sp,sp,#128
     ret
LBB141_11:
     fcmp    r1,r1,#0
     sra    r1,r1,<1:13>
     cvtsd    r2,#-1                // (double)-1
     cvtsd    r3,#1                // (double)+1
     mux    r2,r1,r3,r2
     mov    r1,r2
     add    sp,sp,#128
     ret
Lfunc_end141:
     .size    r8_erf, .Lfunc_end141-r8_erf
                                        ; -- End function
These patterns seem rather unusual...
Don't really know the ABI.
The patterns don't really fit my observations of typical compiler output, though (mostly regarding the FP constants: in particular, ones that fall outside the scope of what can be exactly represented as Binary16 or similar are rare).
 >      .globl    r8_erf                          ; -- Begin function r8_erf
 >      .type    r8_erf,@function
 > r8_erf:                                 ; @r8_erf
 > ; %bb.0:
 >      add    sp,sp,#-128
ADD -128, SP
 >      std    #4614300636657501161,[sp,88]    // a[0]
   MOV 0x400949FB3ED443E9, R3
   MOV.Q R3, (SP, 88)
 >      std    #4645348406721991307,[sp,104]    // a[2]
   MOV 0x407797C38897528B, R3
   MOV.Q R3, (SP, 104)
 >      std    #4659275911028085274,[sp,112]    // a[3]
 >      std    #4595861367557309218,[sp,120]    // a[4]
 >      std    #4599171895595656694,[sp,40]    // p[0]
 >      std    #4593699784569291823,[sp,56]    // p[2]
 >      std    #4580293056851789237,[sp,64]    // p[3]
 >      std    #4559215111867327292,[sp,72]    // p[4]
 >      std    #4580359811580069319,[sp,80]    // p[4]
 >      std    #4612966212090462427,[sp]    // q[0]
 >      std    #4602930165995154489,[sp,16]    // q[2]
 >      std    #4588882433176075751,[sp,24]    // q[3]
 >      std    #4567531038595922641,[sp,32]    // q[4]
... pattern is obvious enough.
Each constant needs 12 bytes, so 16 bytes/store.
 >      fabs    r2,r1
 >      fcmp    r3,r2,#0x3EF00000        // thresh
 >      bnlt    r3,.LBB141_6
   FABS   R5, R6
   FLDH   0x3780, R3  //A
   FCMPGT R3, R6      //A
   BT     .LBB141_6   //A
Or (FP-IMM extension):
   FABS   R5, R6
   FCMPGE 0x0DE, R6    //B (FP-IMM)
   BF     .LBB141_6   //B
 > ; %bb.1:
 >      fcmp    r3,r2,#4            // xabs <= 4.0
 >      bnlt    r3,.LBB141_7
    FCMPGE  0x110, R6
    BF      .LBB141_7
 > ; %bb.2:
 >      fcmp    r3,r2,#0x403A8B020C49BA5E    // xbig
 >      bngt    r3,.LBB141_11
   MOV    0x403A8B020C49BA5E, R3
   FCMPGT R3, R6
   BT     .LBB141_11
Where, FP-IMM won't work with that value.
 > ; %bb.3:
 >      fmul    r3,r1,r1
   FMUL  R5, R5, R7
 >      fdiv    r3,#1,r3
   Skip, operation gives identity?...
 >      mov    r4,#0x3F90B4FB18B485C7        // p[5]
Similar.
 >      fmac    r4,r3,r4,#0x3FD38A78B9F065F6    // p[0]
 >      fadd    r5,r3,#0x40048C54508800DB    // q[0]
 >      fmac    r6,r3,r4,#0x3FD70FE40E2425B8    // p[1]
 >      fmac    r4,r3,r5,#0x3FFDF79D6855F0AD    // q[1]
Turns into 4 constants, 7 FPU instructions (if no FMAC extension, 4 with FMAC). Though, at present, FMAC is slower than separate FMUL+FADD.
So, between 8 and 11 instructions.
 >      fmul    r4,r3,r4
 >      fmul    r6,r3,r6
 >      mov    r5,#2
 >      add    r7,sp,#40            // p[*]
 >      add    r8,sp,#0            // q[*]
These can map 1:1.
 > LBB141_4:                              ; %._crit_edge11
 >                                         ; =>This Inner Loop Header: Depth=1
 >      vec    r9,{r4,r6}
 >      ldd    r10,[r7,r5<<3,0]        // p[*]
 >      ldd    r11,[r8,r5<<3,0]        // q[*]
 >      fadd    r6,r6,r10
 >      fadd    r4,r4,r11
 >      fmul    r4,r3,r4
 >      fmul    r6,r3,r6
 >      loop    ne,r5,#4,#1
Could be mapped to a scalar loop, pretty close to 1:1.
Could possibly also be mapped over to 2x Binary64 SIMD ops, I am guessing 2 copies for a 4-element vector?...
 > ; %bb.5:
 >      fadd    r5,r6,#0x3F4595FD0D71E33C    // p[4]
 >      fmul    r3,r3,r5
 >      fadd    r4,r4,#0x3F632147A014BAD1    // q[4]
 >      fdiv    r3,r3,r4
 >      fadd    r3,#0x3FE20DD750429B6D,-r3    // c[0]
 >      fdiv    r3,r3,r2
 >      br    .LBB141_10            // common tail
Same patterns as before.
Would need ~ 10 ops.
Well, could be expressed with fewer ops via jumbo-prefixed FP-IMM ops, but this would only give "Binary32 truncated to 29 bits" precision for the immediate values.
Theoretically, could allow an FE-FE-F0 encoding for FP-IMM, which could give ~ 53 bits of precision. But, if one needs full Binary64, this will not gain much in this case.
 > LBB141_6:                              ; %._crit_edge
 >      fmul    r3,r1,r1
 >      fcmp    r2,r2,#0x3C9FFE5AB7E8AD5E    // xsmall
 >      sra    r2,r2,<1:13>
 >      cvtsd    r4,#0
 >      mux    r2,r2,r3,r4
 >      mov    r3,#0x3FC7C7905A31C322        // a[4]
 >      fmac    r3,r2,r3,#0x400949FB3ED443E9    // a[0]
 >      fmac    r3,r2,r3,#0x405C774E4D365DA3    // a[1]
 >      ldd    r4,[sp,104]            // a[2]
 >      fmac    r3,r2,r3,r4
 >      fadd    r4,r2,#0x403799EE342FB2DE    // b[0]
 >      fmac    r4,r2,r4,#0x406E80C9D57E55B8    // b[1]
 >      fmac    r4,r2,r4,#0x40940A77529CADC8    // b[2]
 >      fmac    r3,r2,r3,#0x40A912C1535D121A    // a[3]
 >      fmul    r1,r3,r1
 >      fmac    r2,r2,r4,#0x40A63879423B87AD    // b[3]
 >      fdiv    r2,r1,r2
 >      mov    r1,r2
 >      add    sp,sp,#128
 >      ret                // 68
 > LBB141_7:
 >      fmul    r3,r2,#0x3E571E703C5F5815    // c[8]
 >      mov    r5,#0
 >      mov    r4,r2
 > LBB141_8:                              ; =>This Inner Loop Header: Depth=1
 >      vec    r6,{r3,r4}
 >      ldd    r7,[ip,r5<<3,.L__const.r8_erf.c]// c[*]
 >      fadd    r3,r3,r7
 >      fmul    r3,r2,r3
 >      ldd    r7,[ip,r5<<3,.L__const.r8_erf.d]// d[*]
 >      fadd    r4,r4,r7
 >      fmul    r4,r2,r4
 >      loop    ne,r5,#7,#1
 > ; %bb.9:
 >      fadd    r3,r3,#0x4093395B7FD2FC8E    // c[7]
 >      fadd    r4,r4,#0x4093395B7FD35F61    // d[7]
 >      fdiv    r3,r3,r4
 > LBB141_10:                // common tail
 >      fmul    r4,r2,#0x41800000        // 16.0
 >      fmul    r4,r4,#0x3D800000        // 1/16.0
 >      cvtds    r4,r4                // (signed)double
 >      cvtsd    r4,r4                // (double)signed
 >      fadd    r5,r2,-r4
 >      fadd    r2,r2,r4
 >      fmul    r4,r4,-r4
 >      fexp    r4,r4                // exp()
 >      fmul    r2,r2,-r5
 >      fexp    r2,r2                // exp()
 >      fmul    r2,r4,r2
 >      fadd    r2,#0,-r2
 >      fmac    r2,r2,r3,#0x3F000000        // 0.5
 >      fadd    r2,r2,#0x3F000000        // 0.5
 >      pflt    r1,0,T
 >      fadd    r2,#0,-r2
 >      mov    r1,r2
 >      add    sp,sp,#128
 >      ret
 > LBB141_11:
 >      fcmp    r1,r1,#0
 >      sra    r1,r1,<1:13>
 >      cvtsd    r2,#-1                // (double)-1
 >      cvtsd    r3,#1                // (double)+1
 >      mux    r2,r1,r3,r2
 >      mov    r1,r2
 >      add    sp,sp,#128
 >      ret
 > Lfunc_end141:
 >      .size    r8_erf, .Lfunc_end141-r8_erf
 >                                         ; -- End function
Don't really have time at the moment to comment on the rest of this...
In other news, found a bug in the function dependency-walking code.
Fixing this bug got things a little closer to break-even with RV64G GCC output regarding ".text" size (though, was still not sufficient to entirely close the gap).
This was mostly based on noting that the compiler output had included some things that were not reachable from within the program being compiled (namely, noticing that the Doom build had included a copy of the MS-CRAM video decoder and similar, which was not reachable from anywhere within Doom).
Some more analysis may be needed.
...

Date Sujet#  Auteur
3 Apr 24 * "Mini" tags to reduce the number of op codes81Stephen Fuld
3 Apr 24 +* Re: "Mini" tags to reduce the number of op codes8Anton Ertl
15 Apr 24 i+* Re: "Mini" tags to reduce the number of op codes6MitchAlsup1
15 Apr 24 ii`* Re: "Mini" tags to reduce the number of op codes5Terje Mathisen
15 Apr 24 ii +- Re: "Mini" tags to reduce the number of op codes1Terje Mathisen
15 Apr 24 ii `* Re: "Mini" tags to reduce the number of op codes3MitchAlsup1
16 Apr 24 ii  `* Re: "Mini" tags to reduce the number of op codes2Terje Mathisen
16 Apr 24 ii   `- Re: "Mini" tags to reduce the number of op codes1MitchAlsup1
17 Apr 24 i`- Re: "Mini" tags to reduce the number of op codes1Stephen Fuld
3 Apr 24 +* Re: "Mini" tags to reduce the number of op codes3Thomas Koenig
17 Apr 24 i`* Re: "Mini" tags to reduce the number of op codes2Stephen Fuld
17 Apr 24 i `- Re: "Mini" tags to reduce the number of op codes1BGB-Alt
3 Apr 24 +* Re: "Mini" tags to reduce the number of op codes12BGB-Alt
3 Apr 24 i+* Re: "Mini" tags to reduce the number of op codes9MitchAlsup1
4 Apr 24 ii+* Re: "Mini" tags to reduce the number of op codes7Terje Mathisen
4 Apr 24 iii+* Re: "Mini" tags to reduce the number of op codes3Michael S
4 Apr 24 iiii`* Re: "Mini" tags to reduce the number of op codes2Terje Mathisen
4 Apr 24 iiii `- Re: "Mini" tags to reduce the number of op codes1Michael S
5 Apr 24 iii`* Re: "Mini" tags to reduce the number of op codes3BGB-Alt
5 Apr 24 iii `* Re: "Mini" tags to reduce the number of op codes2MitchAlsup1
5 Apr 24 iii  `- Re: "Mini" tags to reduce the number of op codes1BGB
17 Apr 24 ii`- Re: "Mini" tags to reduce the number of op codes1Stephen Fuld
3 Apr 24 i`* Re: "Mini" tags to reduce the number of op codes2MitchAlsup1
4 Apr 24 i `- Re: "Mini" tags to reduce the number of op codes1BGB
5 Apr 24 +* Re: "Mini" tags to reduce the number of op codes54John Savard
5 Apr 24 i+- Re: "Mini" tags to reduce the number of op codes1BGB-Alt
5 Apr 24 i`* Re: "Mini" tags to reduce the number of op codes52MitchAlsup1
7 Apr 24 i `* Re: "Mini" tags to reduce the number of op codes51John Savard
7 Apr 24 i  +* Re: "Mini" tags to reduce the number of op codes6MitchAlsup1
8 Apr 24 i  i`* Re: "Mini" tags to reduce the number of op codes5John Savard
8 Apr 24 i  i +* Re: "Mini" tags to reduce the number of op codes2Thomas Koenig
17 Apr 24 i  i i`- Re: "Mini" tags to reduce the number of op codes1John Savard
8 Apr 24 i  i `* Re: "Mini" tags to reduce the number of op codes2MitchAlsup1
17 Apr 24 i  i  `- Re: "Mini" tags to reduce the number of op codes1John Savard
7 Apr 24 i  `* Re: "Mini" tags to reduce the number of op codes44Thomas Koenig
7 Apr 24 i   `* Re: "Mini" tags to reduce the number of op codes43MitchAlsup1
8 Apr 24 i    `* Re: "Mini" tags to reduce the number of op codes42Thomas Koenig
8 Apr 24 i     +- Re: "Mini" tags to reduce the number of op codes1Anton Ertl
9 Apr 24 i     `* Re: "Mini" tags to reduce the number of op codes40Thomas Koenig
9 Apr 24 i      +* Re: "Mini" tags to reduce the number of op codes38BGB
9 Apr 24 i      i`* Re: "Mini" tags to reduce the number of op codes37MitchAlsup1
10 Apr 24 i      i `* Re: "Mini" tags to reduce the number of op codes36BGB-Alt
10 Apr 24 i      i  +* Re: "Mini" tags to reduce the number of op codes31MitchAlsup1
10 Apr 24 i      i  i+* Re: "Mini" tags to reduce the number of op codes23BGB
10 Apr 24 i      i  ii`* Re: "Mini" tags to reduce the number of op codes22MitchAlsup1
10 Apr 24 i      i  ii +* Re: "Mini" tags to reduce the number of op codes3BGB-Alt
10 Apr 24 i      i  ii i`* Re: "Mini" tags to reduce the number of op codes2MitchAlsup1
11 Apr 24 i      i  ii i `- Re: "Mini" tags to reduce the number of op codes1BGB
10 Apr 24 i      i  ii +- Re: "Mini" tags to reduce the number of op codes1BGB-Alt
11 Apr 24 i      i  ii +* Re: "Mini" tags to reduce the number of op codes16MitchAlsup1
11 Apr 24 i      i  ii i`* Re: "Mini" tags to reduce the number of op codes15Michael S
11 Apr 24 i      i  ii i `* Re: "Mini" tags to reduce the number of op codes14BGB
11 Apr 24 i      i  ii i  `* Re: "Mini" tags to reduce the number of op codes13MitchAlsup1
11 Apr 24 i      i  ii i   +* Re: "Mini" tags to reduce the number of op codes9BGB-Alt
12 Apr 24 i      i  ii i   i`* Re: "Mini" tags to reduce the number of op codes8MitchAlsup1
12 Apr 24 i      i  ii i   i `* Re: "Mini" tags to reduce the number of op codes7BGB
12 Apr 24 i      i  ii i   i  `* Re: "Mini" tags to reduce the number of op codes6MitchAlsup1
12 Apr 24 i      i  ii i   i   `* Re: "Mini" tags to reduce the number of op codes5BGB
13 Apr 24 i      i  ii i   i    +- Re: "Mini" tags to reduce the number of op codes1MitchAlsup1
13 Apr 24 i      i  ii i   i    `* Re: "Mini" tags to reduce the number of op codes3MitchAlsup1
13 Apr 24 i      i  ii i   i     +- Re: "Mini" tags to reduce the number of op codes1BGB
15 Apr 24 i      i  ii i   i     `- Re: "Mini" tags to reduce the number of op codes1BGB-Alt
12 Apr 24 i      i  ii i   `* Re: "Mini" tags to reduce the number of op codes3Michael S
12 Apr 24 i      i  ii i    +- Re: "Mini" tags to reduce the number of op codes1Michael S
15 Apr 24 i      i  ii i    `- Re: "Mini" tags to reduce the number of op codes1MitchAlsup1
11 Apr 24 i      i  ii `- Re: "Mini" tags to reduce the number of op codes1Terje Mathisen
11 Apr 24 i      i  i`* Re: "Mini" tags to reduce the number of op codes7Paul A. Clayton
11 Apr 24 i      i  i +- Re: "Mini" tags to reduce the number of op codes1BGB
11 Apr 24 i      i  i +* Re: "Mini" tags to reduce the number of op codes2BGB-Alt
12 Apr 24 i      i  i i`- Re: "Mini" tags to reduce the number of op codes1MitchAlsup1
12 Apr 24 i      i  i +* Re: "Mini" tags to reduce the number of op codes2MitchAlsup1
21 Apr 24 i      i  i i`- Re: "Mini" tags to reduce the number of op codes1Paul A. Clayton
21 Apr 24 i      i  i `- Re: "Mini" tags to reduce the number of op codes1Paul A. Clayton
10 Apr 24 i      i  `* Re: "Mini" tags to reduce the number of op codes4Chris M. Thomasson
10 Apr 24 i      i   `* Re: "Mini" tags to reduce the number of op codes3BGB
10 Apr 24 i      i    `* Re: "Mini" tags to reduce the number of op codes2Chris M. Thomasson
10 Apr 24 i      i     `- Re: "Mini" tags to reduce the number of op codes1BGB-Alt
13 Apr 24 i      `- Re: "Mini" tags to reduce the number of op codes1Brian G. Lucas
15 Apr 24 +- Re: "Mini" tags to reduce the number of op codes1MitchAlsup1
17 Apr 24 `* Re: "Mini" tags to reduce the number of op codes2Stephen Fuld
17 Apr 24  `- Re: "Mini" tags to reduce the number of op codes1MitchAlsup1

Haut de la page

Les messages affichés proviennent d'usenet.

NewsPortal