
#16243: Improve fregs-graph.
-------------------------------------+-------------------------------------
        Reporter:  AndreasK          |                Owner:  (none)
            Type:  task              |               Status:  new
        Priority:  normal            |            Milestone:
       Component:  Compiler          |              Version:  8.6.3
  (CodeGen)                          |
      Resolution:                    |             Keywords:  fregs-graph
Operating System:  Unknown/Multiple  |         Architecture:
                                     |  Unknown/Multiple
 Type of failure:  None/Unknown      |            Test Case:
      Blocked By:                    |             Blocking:
 Related Tickets:  #7679             |  Differential Rev(s):
       Wiki Page:                    |
-------------------------------------+-------------------------------------

Comment (by AndreasK):

== The case for live range splitting ==

I've looked a bit at loops recently. Below is an inner loop from
nofib/fannkuch-redux. It's not some random code either: the loop makes up
~30% of the program's execution time.

First off, the code we generate isn't great: we spill/reload variables
across the call that are not used in the loop. Even so, the final result
is code that uses fewer vregs than we have registers available:

{{{
#!C
ca12: // global
    _s9D0::I64 = %MO_UU_Conv_W8_W64(I8[_s9zN::I64]);
    if (_s9D0::I64 != 0) goto ca1f; else goto ca1i;
ca1f: // global
    I64[Sp - 16] = block_ca1d_info;
    R3 = _s9zN::I64 + _s9D0::I64;
    R2 = _s9zN::I64;
    I64[Sp - 8] = _s9CV::I64;
    I64[Sp] = _s9Cd::I64;
    I64[Sp + 40] = _s9Cc::I64;
    I64[Sp + 48] = _s9Cb::I64;
    I64[Sp + 56] = _s9zN::I64;
    Sp = Sp - 16;
    call $wflopp_r9x0_info(R3, R2) returns to ca1d, args: 8, res: 8,
         upd: 8;
ca1d: // global
    _s9z5::I64 = I64[Sp + 24];
    _s9zv::I64 = I64[Sp + 32];
    _s9zx::I64 = I64[Sp + 40];
    _s9zz::I64 = I64[Sp + 48];
    _s9zN::I64 = I64[Sp + 72];
    _s9Cb::I64 = I64[Sp + 64];
    _s9Cc::I64 = I64[Sp + 56];
    _s9Cd::I64 = I64[Sp + 16];
    _s9CV::I64 = I64[Sp + 8] + 1;
    Sp = Sp + 16;
    goto ca12;
}}}

We get this liveness:

{{{
#!asm
ca12:
    movzbl (%vI_s9zN),%vI_s9D0      # born:    %vI_s9D0
    testq %vI_s9D0,%vI_s9D0
    jne _ca1f                       # r_dying: %vI_s9D0
    jmp _ca1i                       # r_dying: %vI_s9z5 %vI_s9zv %vI_s9zx
                                    #          %vI_s9zz %vI_s9zN %vI_s9Cb
                                    #          %vI_s9Cc %vI_s9Cd %vI_s9CV
NONREC
ca1f:
    movq $block_ca1d_info,-16(%rbp)
    movq %vI_s9zN,%rsi              # born:    %r4
    addq %vI_s9D0,%rsi              # r_dying: %vI_s9D0
    movq %vI_s9zN,%r14              # born:    %r14
    movq %vI_s9CV,-8(%rbp)          # r_dying: %vI_s9CV
    movq %vI_s9Cd,(%rbp)            # r_dying: %vI_s9Cd
    movq %vI_s9Cc,40(%rbp)          # r_dying: %vI_s9Cc
    movq %vI_s9Cb,48(%rbp)          # r_dying: %vI_s9Cb
    movq %vI_s9zN,56(%rbp)          # r_dying: %vI_s9zN
    addq $-16,%rbp
    jmp $wflopp_r9x0_info           # r_dying: %r4 %r14
NONREC
ca1d:
    movq 24(%rbp),%vI_s9z5          # born:    %vI_s9z5
    movq 32(%rbp),%vI_s9zv          # born:    %vI_s9zv
    movq 40(%rbp),%vI_s9zx          # born:    %vI_s9zx
    movq 48(%rbp),%vI_s9zz          # born:    %vI_s9zz
    movq 72(%rbp),%vI_s9zN          # born:    %vI_s9zN
    movq 64(%rbp),%vI_s9Cb          # born:    %vI_s9Cb
    movq 56(%rbp),%vI_s9Cc          # born:    %vI_s9Cc
    movq 16(%rbp),%vI_s9Cd          # born:    %vI_s9Cd
    movq 8(%rbp),%vI_nakO           # born:    %vI_nakO
    leaq 1(%vI_nakO),%vI_s9CV       # born:    %vI_s9CV
                                    # r_dying: %vI_nakO
    addq $16,%rbp
    jmp _ca12                       # r_dying: %vI_s9z5 %vI_s9zv %vI_s9zx
                                    #          %vI_s9zz %vI_s9zN %vI_s9Cb
                                    #          %vI_s9Cc %vI_s9Cd %vI_s9CV
}}}

However, the graph register allocator spills many of these:

 * vI_s9zN - 5 uses
 * vI_s9Cd - 2 uses
 * a few more, but the details don't matter that much.

With live range splitting, ideally we would just split each range
overlapping the loop into three ranges: one before, one inside, and one
after the loop. Ranges that are live across the loop but not used inside
it could then be spilled/reloaded on loop entry/exit, without affecting
the performance of the loop itself.
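
To make the idea concrete, here is a minimal sketch of such a loop-aware
split in Haskell. This is not GHC's allocator code: `Range`, `Split` and
`splitAroundLoop` are hypothetical names, a live range is simplified to a
single inclusive interval of instruction indices, and "used in the loop"
is a plain membership test over the vreg's use sites.

{{{
#!hs
module SplitSketch where

-- A virtual register's live range, simplified to one inclusive
-- interval of instruction indices.
data Range = Range { rFrom :: Int, rTo :: Int } deriving Show

-- The three sub-ranges produced by splitting around a loop.
data Split = Split
  { beforeLoop :: Range  -- held in a register up to the loop entry
  , acrossLoop :: Range  -- spilled to memory over the loop body
  , afterLoop  :: Range  -- reloaded into a register at the loop exit
  } deriving Show

-- Split a range around a loop when the range covers the whole loop but
-- none of the vreg's uses fall inside it; otherwise keep the range
-- whole. 'uses' are the instruction indices that read the vreg.
splitAroundLoop :: Range -> Range -> [Int] -> Either Range Split
splitAroundLoop r loop uses
  | coversLoop && not usedInLoop =
      Right (Split
        { beforeLoop = Range (rFrom r) (rFrom loop - 1)
        , acrossLoop = loop  -- the spill store goes at the loop entry
        , afterLoop  = Range (rTo loop + 1) (rTo r)  -- reload at exit
        })
  | otherwise = Left r
  where
    coversLoop = rFrom r < rFrom loop && rTo r > rTo loop
    usedInLoop = any (\u -> rFrom loop <= u && u <= rTo loop) uses
}}}

In the loop above, a split along these lines could move the spill/reload
traffic for the variables that are merely live across the loop to its
entry and exit, leaving registers free inside the loop for heavily used
vregs such as vI_s9zN.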