Re: [GHC] #13629: sqrt should use machine instruction on x86_64

30 Apr 2017

      #13629: sqrt should use machine instruction on x86_64
-------------------------------------+-------------------------------------
        Reporter:  bgamari           |                Owner:  (none)
            Type:  bug               |               Status:  new
        Priority:  normal            |            Milestone:  8.4.1
       Component:  Compiler (NCG)    |              Version:  8.0.1
      Resolution:                    |             Keywords:
Operating System:  Unknown/Multiple  |         Architecture:
                                     |  Unknown/Multiple
 Type of failure:  Runtime           |            Test Case:
  performance bug                    |  numeric/num009
      Blocked By:                    |             Blocking:
 Related Tickets:  #13570            |  Differential Rev(s):  Phab:D3508
       Wiki Page:                    |
-------------------------------------+-------------------------------------

Comment (by bgamari):

 I've reproduced the regression locally. While it's hard to measure the
 runtime difference with the default test configuration, if you increase
 the argument from 3000 to 10000 it becomes quite clear. Before the patch
 the runtime is 1.14 seconds; after the patch it is 1.30 seconds.

 This is quite surprising since there are a few bits of the generated code
 that get rather significantly shorter. Namely, before the patch we get,
 {{{#!asm
         ...
         subq $8,%rsp
         movsd %xmm0,%xmm4
         movss %xmm6,%xmm0
         mulss %xmm6,%xmm0
         mulss %xmm6,%xmm0
         movq %rax,%rbx
         movl $1,%eax
         movq %rcx,%r14
         movq %rdx,72(%rsp)
         movq %rsi,80(%rsp)
         movq %rdi,88(%rsp)
         movsd %xmm4,96(%rsp)
         movsd %xmm1,104(%rsp)
         movsd %xmm2,112(%rsp)
         movsd %xmm3,120(%rsp)
         call sqrtf
         addq $8,%rsp
         movss _n8uc(%rip),%xmm1
         divss %xmm0,%xmm1
         movss %xmm1,%xmm0
         movsd 112(%rsp),%xmm2
         mulss %xmm2,%xmm0
         movss %xmm1,%xmm2
         movsd 104(%rsp),%xmm3
         mulss %xmm3,%xmm2
         movsd 96(%rsp),%xmm3
         mulss %xmm3,%xmm1
         movq 64(%rsp),%rax
         leaq 1(%rax),%rcx
 }}}
 Whereas after the patch we get,
 {{{#!asm
         ...
         movss %xmm6,%xmm4
         mulss %xmm6,%xmm4
         mulss %xmm6,%xmm4
         sqrtss %xmm4,%xmm4
         movss _n8ud(%rip),%xmm5
         divss %xmm4,%xmm5
         movss %xmm5,%xmm4
         mulss %xmm3,%xmm4
         movss %xmm5,%xmm3
         mulss %xmm2,%xmm3
         mulss %xmm1,%xmm5
         leaq 1(%rdx),%rbx
 }}}

--
Ticket URL: http://ghc.haskell.org/trac/ghc/ticket/13629#comment:16
GHC http://www.haskell.org/ghc/
The Glasgow Haskell Compiler