
#13094: Poor register allocation and redundant moves when using `foreign import prim`
----------------------------------------+---------------------------------
            Reporter:  jberryman        |             Owner:
                Type:  bug              |            Status:  new
            Priority:  normal           |         Milestone:
           Component:  Compiler         |           Version:  8.0.1
            Keywords:                   |  Operating System:  Linux
        Architecture:  Unknown/Multiple |   Type of failure:  None/Unknown
           Test Case:                   |        Blocked By:
            Blocking:                   |   Related Tickets:  #12232
Differential Rev(s):                    |         Wiki Page:
----------------------------------------+---------------------------------
This may well be the same as #12232, but I didn't want to risk hijacking
that one.

I've been looking at the dumped asm from a library where I use `foreign
import prim` to call a simple assembly routine (that may be irrelevant). I
see many `movq` instructions, and when I traced them by hand many or most
seemed to be superfluous saving and restoring of registers.

Here is a contiguous and mostly self-contained snippet of the assembly,
with notes. I have four values I'm interested in, V0, V1, V2, and V3,
which I started tracing at the note "registers good for sipRound_s_x2 at
this point". Below I specifically follow V3 through the assembly, marking
lines with {{{"*** V3 ***"}}}; the moves don't seem sensible to my
(untrained) eye:

{{{
_c7Ns:
        movq %r9,%rbx
        movq %rax,%r9
        movq %r8,%rax        ; *** V3 ***
        movq %rbx,%r8
        movq %rdi,%rbx
        movq %rax,%rdi       ; *** V3 (back to rdi!) ***
        movq %rsi,%rax
        movq %rbx,%rsi
        movq %r14,%rbx
        movq %rax,%r14
        movq %rcx,16(%rbp)
        addq $16,%rbp
        jmp *8(%rbp)
        .align 8
        .quad 836
        .quad 32
block_info:
_c7Nk:
        movq 16(%rbp),%rax
        movq $8,16(%rbp)
        movq 24(%rbp),%rcx
        incq %rcx
        movq %rcx,24(%rbp)
        movq 32(%rbp),%rdx
        incq %rdx
        movq %rdx,32(%rbp)
        xorq 8(%rbp),%rbx
        addq $16,%rbp
        movl $8,%r8d
        xorl %r9d,%r9d
_n7T1:
        movq %rax,64(%rsp)
        movq %r8,%rax
        ; --- registers good for sipRound_s_x2 at this point ------
        movq %rdi,%r8        ; moving V3 *OUT* of rdi  *** V3 ***
        movq %rsi,%rdi       ; moving V2 *OUT* of rdi
        movq %r14,%rsi       ; moving V1 *OUT* of r14
        movq %rbx,%r14       ; moving V0 *OUT* of rbx
        movq 64(%rsp),%rbx
_c7My:
        ; (NOTE: there are a few jumps to here from other sections not included here)
        cmpq 24(%rbx),%rdx
        je _c7Ns
_c7Nr:
        movq 16(%rbx),%r10
        movzbl (%r10,%rdx,1),%r10d
        cmpq $1,%rax
        jne _c7N9
_c7Nl:
        movq $block_info,-16(%rbp)
        shlq $8,%r9
        orq %r10,%r9
        ;----- at this point (name: register): V0: r14, V1: rsi, V2: rdi, V3: r8
        movq %rdi,%rax       ; save rdi, trying to be rsi
        movq %r8,%rdi        ; prepared rdi(V3)  *** V3 ***
        xorq %r9,%rdi
        movq %rsi,%rcx
        movq %rax,%rsi       ; prepared rsi(V2)
        movq %r14,%rax
        movq %rcx,%r14       ; prepared r14(V1)
        movq %rbx,%rcx
        movq %rax,%rbx       ; prepared rbx(V0)
        movq %r9,-8(%rbp)
        movq %rcx,(%rbp)
        addq $-16,%rbp
        jmp sipRound_s_x2
        .align 8
        .quad 1733
        .quad 32
block_info:
_c7N2:
        movq 24(%rbp),%rax
        movq $7,24(%rbp)
        movq 32(%rbp),%rcx
        incq %rcx
        movq %rcx,32(%rbp)
        movq 40(%rbp),%rdx
        incq %rdx
        movq %rdx,40(%rbp)
        movq 16(%rbp),%r8
        xorq 8(%rbp),%rbx
        addq $24,%rbp
        movl $7,%r9d
_n7T2:
        movq %rax,64(%rsp)
        movq %r9,%rax
        movq %r8,%r9
        movq %rdi,%r8
        movq %rsi,%rdi
        movq %r14,%rsi
        movq %rbx,%r14
        movq 64(%rsp),%rbx
        jmp _c7My
_c7N9:
        cmpq $1,%rax
        jbe _c7N3
_c7MW:
        decq %rax
        movq %rax,(%rbp)
        incq %rcx
        movq %rcx,8(%rbp)
        incq %rdx
        movq %rdx,16(%rbp)
        shlq $8,%r9
        orq %r10,%r9
        jmp _c7My
_c7N3:
        movq $block_info,-24(%rbp)
        movq %rdi,%rax
        movq %r8,%rdi        ; *** V3 (back to rdi!) ***
        xorq %r9,%rdi
        movq %rsi,%rcx
        movq %rax,%rsi
        movq %r14,%rax
        movq %rcx,%r14
        movq %rbx,%rcx
        movq %rax,%rbx
        movq %r9,-16(%rbp)
        movq %r10,-8(%rbp)
        movq %rcx,(%rbp)
        addq $-24,%rbp
        jmp sipRound_s_x2
.size $whashRemainingBytes_info, .-$whashRemainingBytes_info
}}}

This is my first time looking closely at assembly, so maybe this is normal
or no big deal performance-wise (I haven't gotten around to trying to
correlate the number of moves with the performance of my variations yet),
or I'm missing something obvious. I wasn't able to make sense of an
objdump-ed version of the llvm-compiled program.

The code for the version where the same {{{sipRound}}} stuff is
implemented in normal Haskell also has a lot of moves (even more, in
fact), but they are interspersed throughout and are less easy to sort
through, e.g. a random snippet:

{{{
xorq %rdi,%r8
movq %r14,%rdi
shrq $32,%rdi
shlq $32,%r14
orq %rdi,%r14
addq %r8,%r14
movq %r14,%rdi
addq %rsi,%rdi
movq %rsi,%r10
shrq $51,%r10
shlq $13,%rsi
orq %r10,%rsi
xorq %rdi,%rsi
movq %r8,%r10
shrq $43,%r10
shlq $21,%r8
orq %r10,%r8
xorq %r14,%r8
movq %rax,%r10
shrq $32,%r10
shlq $32,%rax
orq %r10,%rax
addq %r8,%rax
movq %rax,%r10
}}}

Intuitively this also looks bad: after all, the Haskell we're compiling
maps almost trivially onto the good assembly we'd like/expect (although
perhaps {{{rotl}}} is shaking things up here, as we have to implement it
with two shifts and an OR):

{{{#!hs
-- in the Identity monad
do v0 <- return $ v0 + v1
   v1 <- return $ rotl v1 13
   v1 <- return $ v1 `xor` v0
   v0 <- return $ rotl v0 32
   v2 <- return $ v2 + v3
   v3 <- return $ rotl v3 16
   v3 <- return $ v3 `xor` v2
   v0 <- return $ v0 + v3
   v3 <- return $ rotl v3 21
   v3 <- return $ v3 `xor` v0
}}}
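For reference, the {{{rotl}}} above is the usual rotate-left built from
two shifts and an OR; a minimal sketch (the exact definition in the branch
may differ) is:

{{{#!hs
import Data.Bits (shiftL, shiftR, (.|.))
import Data.Word (Word64)

-- Rotate left by k bits via two shifts and an OR; total for 0 <= k <= 64.
rotl :: Word64 -> Int -> Word64
rotl x k = (x `shiftL` k) .|. (x `shiftR` (64 - k))
}}}

You can see that pattern directly in the second dump above:
{{{movq %rsi,%r10 / shrq $51,%r10 / shlq $13,%rsi / orq %r10,%rsi}}} is
exactly {{{rotl v1 13}}} (51 = 64 - 13), followed by the
{{{xorq %rdi,%rsi}}} for {{{v1 `xor` v0}}}.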
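And for context, the {{{foreign import prim}}} binding behind the first
dump's {{{jmp sipRound_s_x2}}} would be declared along these lines — the
symbol name comes from the dump, but the exact arity and type here are my
sketch:

{{{#!hs
{-# LANGUAGE GHCForeignImportPrim, MagicHash, UnboxedTuples,
             UnliftedFFITypes #-}

import GHC.Prim (Word#)

-- Hypothetical binding to the out-of-line assembly routine: four Word#
-- state words (V0..V3) in, four out. GHC passes the first arguments in
-- its own virtual registers (R1=%rbx, R2=%r14, R3=%rsi, R4=%rdi on
-- x86-64), which is what all the shuffling before each
-- "jmp sipRound_s_x2" above is setting up.
foreign import prim "sipRound_s_x2"
  sipRound_s_x2# :: Word# -> Word# -> Word# -> Word#
                 -> (# Word#, Word#, Word#, Word# #)
}}}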
You can check out the branch here, which I'll keep at 838b27a:
https://github.com/jberryman/hashabler/tree/saved-branch-exhibiting-seemingly-bad-register-allocation

You can build and observe the assembly with:

{{{
$ cabal configure -fdev --enable-benchmarks && cabal build core
$ gvim ./dist/build/core/core-tmp/core.dump-asm
}}}

--
Ticket URL: <http://ghc.haskell.org/trac/ghc/ticket/13094>
GHC <http://www.haskell.org/ghc/>
The Glasgow Haskell Compiler