[2/16] SBM: Inner loops of the hand-tweaked assembly benchmarks

I've taken the two benchmarks byte-bs----acc and space-bs-c8-acc-1 and tweaked their inner loops step by step, from something that used memory all the time to something that uses registers more and more efficiently. I did this pretty much one register at a time. Along the way, I also applied a simple common-subexpression elimination/loop-invariant hoisting step in which I combined the pointer to the start of the string and the index into the string into a single pointer. Doing this in real life may cause serious problems with the garbage collector, because the combined pointer no longer points to the start of the string. At the end, I go a bit mad and start doing heroic optimizations (reading four bytes at a time, using MMX registers to read 8 bytes at a time, twisted MMX math to keep 8 space counters in an MMX register, plus a bit of loop unrolling). Here follow first the two original inner loops and then the 23 hand-tweaked versions. I used the following shell code to isolate the inner loops: (for F in hs/byte-bs----acc.s hs/space-bs-c8-acc-1.s hand/*.s ; \ do echo "------------------------------"; \ echo "$F:"; \ echo ; \ cat "$F" | perl -e 'while(<>){ if (/Main_zdwcnt_info:/ .. 
/.section .data/) { print; }}' | head -n-1; \ done; \ echo "=============================="; \ ) > xx.txt -Peter ------------------------------ hs/byte-bs----acc.s: Main_zdwcnt_info: .LcYL: cmpl $0,16(%ebp) jle .LcYO movl 12(%ebp),%eax incl %eax movl (%ebp),%ecx incl %ecx subl $1,16(%ebp) movl %eax,12(%ebp) movl %ecx,(%ebp) jmp Main_zdwcnt_info .LcYO: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) ------------------------------ hs/space-bs-c8-acc-1.s: Main_zdwcnt_info: .Lc16u: cmpl $0,16(%ebp) jle .Lc16x movl 4(%ebp),%eax movl 12(%ebp),%ecx movzbl (%eax,%ecx,1),%eax cmpl $32,%eax jne .Lc16F movl 12(%ebp),%eax incl %eax movl (%ebp),%ecx incl %ecx subl $1,16(%ebp) movl %eax,12(%ebp) movl %ecx,(%ebp) jmp Main_zdwcnt_info .Lc16x: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) .Lc16F: movl 12(%ebp),%eax incl %eax subl $1,16(%ebp) movl %eax,12(%ebp) jmp Main_zdwcnt_info ------------------------------ hand/byte-bs----acc-a.s: Main_zdwcnt_info: .LcYN: cmpl $0,16(%ebp) jle .LcYQ movl 00(%ebp),%ecx movl 12(%ebp),%eax movl 16(%ebp),%edx incl %ecx incl %eax decl %edx movl %ecx,00(%ebp) movl %eax,12(%ebp) movl %edx,16(%ebp) jmp Main_zdwcnt_info .LcYQ: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) ------------------------------ hand/byte-bs----acc-b.s: Main_zdwcnt_info: .LcYN: cmpl $0,16(%ebp) jle .LcYQ movl 00(%ebp),%ecx movl 12(%ebp),%eax movl 16(%ebp),%edx .L_again: cmpl $0,%edx jle .L_out incl %ecx incl %eax decl %edx jmp .L_again .L_out: movl %ecx,00(%ebp) movl %eax,12(%ebp) movl %edx,16(%ebp) jmp Main_zdwcnt_info .LcYQ: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) ------------------------------ hand/byte-bs----acc-c.s: Main_zdwcnt_info: .LcYN: cmpl $0,16(%ebp) jle .LcYQ movl 00(%ebp),%ecx movl 12(%ebp),%eax movl 16(%ebp),%edx cmpl $0,%edx jle .L_out .L_again: incl %ecx incl %eax decl %edx cmpl $0,%edx jg .L_again .L_out: movl %ecx,00(%ebp) movl %eax,12(%ebp) movl %edx,16(%ebp) jmp Main_zdwcnt_info .LcYQ: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) ------------------------------ 
hand/byte-bs----acc-d.s: Main_zdwcnt_info: .LcYN: cmpl $0,16(%ebp) jle .LcYQ movl 00(%ebp),%ecx movl 12(%ebp),%eax movl 16(%ebp),%edx cmpl $0,%edx jle .L_out .align 16 .L_again: incl %ecx incl %eax decl %edx cmpl $0,%edx jg .L_again .L_out: movl %ecx,00(%ebp) movl %eax,12(%ebp) movl %edx,16(%ebp) jmp Main_zdwcnt_info .LcYQ: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) ------------------------------ hand/space-bs-c8-acc-1-a.s: Main_zdwcnt_info: .Lc16w: cmpl $0,16(%ebp) jle .Lc16z movl 4(%ebp),%eax movl 12(%ebp),%ecx movzbl (%eax,%ecx,1),%eax cmpl $32,%eax jne .Lc16H movl 12(%ebp),%eax incl %eax movl (%ebp),%ecx incl %ecx subl $1,16(%ebp) movl %eax,12(%ebp) movl %ecx,(%ebp) jmp Main_zdwcnt_info .Lc16H: movl 12(%ebp),%eax incl %eax subl $1,16(%ebp) movl %eax,12(%ebp) jmp Main_zdwcnt_info .Lc16z: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) ------------------------------ hand/space-bs-c8-acc-1-b.s: Main_zdwcnt_info: .Lc16w: cmpl $0,16(%ebp) jle .Lc16z movl 4(%ebp),%eax movl 12(%ebp),%ecx movzbl (%eax,%ecx,1),%eax cmpl $32,%eax je .Lc16H movl 12(%ebp),%eax incl %eax subl $1,16(%ebp) movl %eax,12(%ebp) jmp Main_zdwcnt_info .Lc16H: movl 12(%ebp),%eax incl %eax movl (%ebp),%ecx incl %ecx subl $1,16(%ebp) movl %eax,12(%ebp) movl %ecx,(%ebp) jmp Main_zdwcnt_info .Lc16z: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) ------------------------------ hand/space-bs-c8-acc-1-c.s: Main_zdwcnt_info: .Lc16w: cmpl $0,16(%ebp) jle .Lc16z movl 4(%ebp),%eax movl 12(%ebp),%ecx movzbl (%eax,%ecx,1),%eax cmpl $32,%eax jne .Lc16H movl (%ebp),%ecx incl %ecx movl 12(%ebp),%eax incl %eax movl %ecx,(%ebp) movl %eax,12(%ebp) subl $1,16(%ebp) jmp Main_zdwcnt_info .Lc16z: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) .Lc16H: movl 12(%ebp),%eax incl %eax movl %eax,12(%ebp) subl $1,16(%ebp) jmp Main_zdwcnt_info ------------------------------ hand/space-bs-c8-acc-1-d.s: Main_zdwcnt_info: .Lc16w: cmpl $0,16(%ebp) jle .Lc16z movl 4(%ebp),%eax movl 12(%ebp),%ecx movzbl (%eax,%ecx,1),%eax cmpl $32,%eax jne .Lc16H 
addl $1,(%ebp) addl $1,12(%ebp) subl $1,16(%ebp) jmp Main_zdwcnt_info .Lc16z: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) .Lc16H: addl $1,12(%ebp) subl $1,16(%ebp) jmp Main_zdwcnt_info ------------------------------ hand/space-bs-c8-acc-1-e.s: Main_zdwcnt_info: .Lc16w: cmpl $0,16(%ebp) jle .Lc16z movl 4(%ebp),%eax movl 12(%ebp),%ecx movzbl (%eax,%ecx,1),%eax cmpl $32,%eax jne .Lc16H movl 12(%ebp),%eax incl %eax incl %ecx movl (%ebp),%eax incl %eax subl $1,16(%ebp) movl %ecx,12(%ebp) movl %eax,(%ebp) jmp Main_zdwcnt_info .Lc16z: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) .Lc16H: incl %ecx subl $1,16(%ebp) movl %ecx,12(%ebp) jmp Main_zdwcnt_info ------------------------------ hand/space-bs-c8-acc-1-f.s: Main_zdwcnt_info: .Lc16w: cmpl $0,16(%ebp) jle .Lc16z movl 4(%ebp),%eax movl 12(%ebp),%ecx movzbl (%eax,%ecx,1),%eax cmpl $32,%eax jne .Lc16H incl %ecx subl $1,16(%ebp) addl $1,(%ebp) movl %ecx,12(%ebp) jmp Main_zdwcnt_info .Lc16z: movl (%ebp),%esi addl $20,%ebp jmp *(%ebp) .Lc16H: incl %ecx subl $1,16(%ebp) movl %ecx,12(%ebp) jmp Main_zdwcnt_info ------------------------------ hand/space-bs-c8-acc-1-g.s: Main_zdwcnt_info: movl (%ebp),%esi .Lc16w: cmpl $0,16(%ebp) jle .Lc16z movl 4(%ebp),%eax movl 12(%ebp),%ecx movzbl (%eax,%ecx,1),%eax cmpl $32,%eax jne .Lc16H incl %ecx subl $1,16(%ebp) inc %esi movl %ecx,12(%ebp) jmp .Lc16w .Lc16z: addl $20,%ebp jmp *(%ebp) .Lc16H: incl %ecx subl $1,16(%ebp) movl %ecx,12(%ebp) jmp .Lc16w ------------------------------ hand/space-bs-c8-acc-1-h.s: Main_zdwcnt_info: movl (%ebp),%esi movl 12(%ebp),%ecx .Lc16w: cmpl $0,16(%ebp) jle .Lc16z movl 4(%ebp),%eax movzbl (%eax,%ecx,1),%eax cmpl $32,%eax jne .Lc16H incl %ecx subl $1,16(%ebp) inc %esi jmp .Lc16w .Lc16z: addl $20,%ebp jmp *(%ebp) .Lc16H: incl %ecx subl $1,16(%ebp) jmp .Lc16w ------------------------------ hand/space-bs-c8-acc-1-i.s: Main_zdwcnt_info: movl (%ebp),%esi movl 12(%ebp),%ecx movl 16(%ebp),%edx .Lc16w: cmpl $0,%edx jle .Lc16z movl 4(%ebp),%eax movzbl (%eax,%ecx,1),%eax 
cmpl $32,%eax jne .Lc16H incl %ecx decl %edx inc %esi jmp .Lc16w .Lc16z: addl $20,%ebp jmp *(%ebp) .Lc16H: incl %ecx decl %edx jmp .Lc16w ------------------------------ hand/space-bs-c8-acc-1-j.s: Main_zdwcnt_info: movl (%ebp),%esi movl 4(%ebp),%ecx addl 12(%ebp),%ecx movl 16(%ebp),%edx .Lc16w: cmpl $0,%edx jle .Lc16z movzbl (%ecx),%eax cmpl $32,%eax jne .Lc16H incl %ecx decl %edx inc %esi jmp .Lc16w .Lc16z: addl $20,%ebp jmp *(%ebp) .Lc16H: incl %ecx decl %edx jmp .Lc16w ------------------------------ hand/space-bs-c8-acc-1-k.s: Main_zdwcnt_info: movl (%ebp),%esi movl 4(%ebp),%ecx addl 12(%ebp),%ecx movl 16(%ebp),%edx .Lc16w: cmpl $0,%edx jle .Lc16z movzbl (%ecx),%eax cmpl $32,%eax jne .Lc16H incl %ecx decl %edx inc %esi jmp .Lc16w .Lc16H: incl %ecx decl %edx jmp .Lc16w .Lc16z: addl $20,%ebp jmp *(%ebp) ------------------------------ hand/space-bs-c8-acc-1-l.s: Main_zdwcnt_info: movl (%ebp),%esi movl 4(%ebp),%ecx addl 12(%ebp),%ecx movl 16(%ebp),%edx .Lc16w: cmpl $0,%edx jle .Lc16z movzbl (%ecx),%eax incl %ecx decl %edx cmpl $32,%eax jne .Lc16H inc %esi jmp .Lc16w .Lc16H: jmp .Lc16w .Lc16z: addl $20,%ebp jmp *(%ebp) ------------------------------ hand/space-bs-c8-acc-1-m.s: Main_zdwcnt_info: movl (%ebp),%esi movl 4(%ebp),%ecx addl 12(%ebp),%ecx movl 16(%ebp),%edx .Lc16w: cmpl $0,%edx jle .Lc16z movzbl (%ecx),%eax incl %ecx decl %edx cmpl $32,%eax jne .Lc16w inc %esi jmp .Lc16w .Lc16z: addl $20,%ebp jmp *(%ebp) ------------------------------ hand/space-bs-c8-acc-1-n.s: Main_zdwcnt_info: movl (%ebp),%esi movl 4(%ebp),%ecx addl 12(%ebp),%ecx movl 16(%ebp),%edx .Lc16w: cmpl $0,%edx jle .Lc16z .Lc16xx: movzbl (%ecx),%eax incl %ecx decl %edx cmpl $32,%eax jne .Lc16w inc %esi cmpl $0,%edx jg .Lc16xx .Lc16z: addl $20,%ebp jmp *(%ebp) ------------------------------ hand/space-bs-c8-acc-1-o.s: Main_zdwcnt_info: movl (%ebp),%esi movl 4(%ebp),%ecx addl 12(%ebp),%ecx movl 16(%ebp),%edx .Lc16w: cmpl $0,%edx jle .Lc16z .Lc16xx: movzbl (%ecx),%eax incl %ecx decl %edx cmpl 
$32,%eax jne .Lc16w inc %esi cmpl $0,%edx jle .Lc16z movzbl (%ecx),%eax incl %ecx decl %edx cmpl $32,%eax jne .Lc16w inc %esi cmpl $0,%edx jg .Lc16xx .Lc16z: addl $20,%ebp jmp *(%ebp) ------------------------------ hand/space-bs-c8-acc-1-p.s: Main_zdwcnt_info: movl (%ebp),%esi movl 4(%ebp),%ecx addl 12(%ebp),%ecx movl 16(%ebp),%edx .Lc16w4: cmpl $4,%edx jl .Lc16wxx movl (%ecx),%eax addl $4,%ecx subl $4,%edx cmpb $32,%al jne .Lc16wa incl %esi .Lc16wa: cmpb $32,%ah jne .Lc16wb incl %esi .Lc16wb: shrl $16,%eax cmpb $32,%al jne .Lc16wc incl %esi .Lc16wc: cmpb $32,%ah jne .Lc16w4 incl %esi jmp .Lc16w4 .Lc16w1: cmpl $0,%edx jle .Lc16z .Lc16wxx: movzbl (%ecx),%eax incl %ecx decl %edx cmpl $32,%eax jne .Lc16w1 inc %esi jmp .Lc16w1 .Lc16z: addl $20,%ebp jmp *(%ebp) ------------------------------ hand/space-bs-c8-acc-1-q.s: Main_zdwcnt_info: movl (%ebp),%esi /* #spaces found */ movl 4(%ebp),%ecx /* ptr */ addl 12(%ebp),%ecx /* ... + idx */ movl 16(%ebp),%edx /* cnt of remaining bytes */ emms /* clear fp tags so we can use mmx instrs */ mov $0x20202020,%eax movd %eax,%mm1 /* mm1: 0000000020202020 */ movq %mm1,%mm0 /* mm0: 0000000020202020 */ psllq $32,%mm1 /* mm1: 2020202000000000 */ por %mm0,%mm1 /* mm1: 2020202020202020 */ mov $0x01010101,%eax movd %eax,%mm2 /* mm2: 0000000001010101 */ movq %mm2,%mm0 /* mm0: 0000000001010101 */ psllq $32,%mm2 /* mm2: 0101010100000000 */ por %mm0,%mm2 /* mm2: 0101010101010101 */ /* MMX loads can use any alignment (potentially at a speed-hit) */ /* this loop looks at 8 bytes at a time */ .Lc16w8: cmpl $8,%edx jl .Lc16w1 movq (%ecx),%mm0 /* mm0 holds 8 characters */ addl $8,%ecx subl $8,%edx pcmpeqb %mm1,%mm0 /* cmp byte for byte with ' ' */ /* the result flag is 00 or FF */ pand %mm2,%mm0 /* turn FF into 01, which is actually useful */ /* if we could just add the bytes up horizontally in %mm0, sigh.. 
.*/ movd %mm0,%eax push %eax add %ah, %al and $0x03,%eax add %eax,%esi pop %eax shr $16,%eax add %ah,%al and $0x03,%eax add %eax,%esi psrlq $32,%mm0 movd %mm0,%eax push %eax add %ah, %al and $0x03,%eax add %eax,%esi pop %eax shr $16,%eax add %ah,%al and $0x03,%eax add %eax,%esi jmp .Lc16w8 /* this loop looks at one byte at a time to handle the remainder */ .Lc16w1: cmpl $0,%edx jle .Lc16z movzbl (%ecx),%eax incl %ecx decl %edx cmpl $32,%eax jne .Lc16w1 inc %esi jmp .Lc16w1 /* done, remember to clear fp/mmx tags with emms */ .Lc16z: emms addl $20,%ebp jmp *(%ebp) ------------------------------ hand/space-bs-c8-acc-1-r.s: Main_zdwcnt_info: movl (%ebp),%esi /* #spaces found */ movl 4(%ebp),%ecx /* ptr */ addl 12(%ebp),%ecx /* ... + idx */ movl 16(%ebp),%edx /* cnt of remaining bytes */ emms /* clear fp tags so we can use mmx instrs */ mov $0x20202020,%eax movd %eax,%mm1 /* mm1: 0000000020202020 */ movq %mm1,%mm0 /* mm0: 0000000020202020 */ psllq $32,%mm1 /* mm1: 2020202000000000 */ por %mm0,%mm1 /* mm1: 2020202020202020 */ mov $0x01010101,%eax movd %eax,%mm2 /* mm2: 0000000001010101 */ movq %mm2,%mm0 /* mm0: 0000000001010101 */ psllq $32,%mm2 /* mm2: 0101010100000000 */ por %mm0,%mm2 /* mm2: 0101010101010101 */ /* MMX loads can use any alignment (potentially at a speed-hit) */ /* therefore we don't have to try to read 1-7 bytes one at a time */ /* first in order to end up with an aligned %ecx. */ .Lc16_mainloop: cmpl $8,%edx jl .Lc16w1 movl %edx,%eax shr $3,%eax cmpl $127,%eax jle .Lc16_127 movl $127,%eax .Lc16_127: shl $3,%eax sub %eax,%edx shr $3,%eax pxor %mm3,%mm3 /* clear block of space counters */ /* loop up to 127 times in a loop that looks at 8 bytes at a time. */ /* Going above 255 could overflow the 8 counters in mm3. */ /* Going above 127 could overflow the horizontal summation code. 
*/ .Lc16w8: cmpl $0,%eax jle .Lc16w8end movq (%ecx),%mm0 /* mm0 holds 8 characters */ addl $8,%ecx decl %eax pcmpeqb %mm1,%mm0 /* cmp byte for byte with ' ' */ /* the result flag is 00 or FF */ pand %mm2,%mm0 /* turn FF into 01, which is actually useful */ paddb %mm0,%mm3 /* add to the 8 space counters */ jmp .Lc16w8 .Lc16w8end: /* sum the 8 space counters in mm3 and add to %esi */ /* if only MMX had horizontal byte adds... */ movd %mm3,%eax push %eax add %ah, %al /* NOTE! potential overflow! */ and $0xFF,%eax add %eax,%esi pop %eax shr $16,%eax add %ah,%al /* NOTE! potential overflow! */ and $0xFF,%eax add %eax,%esi psrlq $32,%mm3 movd %mm3,%eax push %eax add %ah, %al /* NOTE! potential overflow! */ and $0xFF,%eax add %eax,%esi pop %eax shr $16,%eax add %ah,%al /* NOTE! potential overflow! */ and $0xFF,%eax add %eax,%esi jmp .Lc16_mainloop /* this loop looks at one byte at a time to handle the remainder */ .Lc16w1: cmpl $0,%edx jle .Lc16z movzbl (%ecx),%eax incl %ecx decl %edx cmpl $32,%eax jne .Lc16w1 inc %esi jmp .Lc16w1 /* done, remember to clear fp/mmx tags with emms */ .Lc16z: emms addl $20,%ebp jmp *(%ebp) ------------------------------ hand/space-bs-c8-acc-1-s.s: Main_zdwcnt_info: movl (%ebp),%esi /* #spaces found */ movl 4(%ebp),%ecx /* ptr */ addl 12(%ebp),%ecx /* ... + idx */ movl 16(%ebp),%edx /* cnt of remaining bytes */ emms /* clear fp tags so we can use mmx instrs */ mov $0x20202020,%eax movd %eax,%mm1 /* mm1: 0000000020202020 */ movq %mm1,%mm0 /* mm0: 0000000020202020 */ psllq $32,%mm1 /* mm1: 2020202000000000 */ por %mm0,%mm1 /* mm1: 2020202020202020 */ mov $0x01010101,%eax movd %eax,%mm2 /* mm2: 0000000001010101 */ movq %mm2,%mm0 /* mm0: 0000000001010101 */ psllq $32,%mm2 /* mm2: 0101010100000000 */ por %mm0,%mm2 /* mm2: 0101010101010101 */ /* MMX loads can use any alignment (potentially at a speed-hit) */ /* therefore we don't have to try to read 1-7 bytes one at a time */ /* first in order to end up with an aligned %ecx. 
*/ .Lc16_mainloop: cmpl $8,%edx jl .Lc16w1 movl %edx,%eax shr $3,%eax cmpl $127,%eax jle .Lc16_127 movl $127,%eax .Lc16_127: shl $3,%eax sub %eax,%edx shr $3,%eax pxor %mm3,%mm3 /* clear block of space counters */ /* loop up to 127 times in a loop that looks at 8 bytes at a time. */ /* Going above 255 could overflow the 8 counters in mm3. */ /* Going above 127 could overflow the horizontal summation code. */ cmpl $0,%eax jle .Lc16w8end /* this is an unspeakably ugly and sloppy loop unrolling. Doesn't */ /* seem to help much on an Athlon64 3000+. */ test $1,%eax jz .Lc16w8 incl %eax jmp .Lc16w8x .Lc16w8: movq (%ecx),%mm0 /* mm0 holds 8 characters */ addl $8,%ecx pcmpeqb %mm1,%mm0 /* cmp byte for byte with ' ' */ /* the result flag is 00 or FF */ pand %mm2,%mm0 /* turn FF into 01, which is actually useful */ paddb %mm0,%mm3 /* add to the 8 space counters */ .Lc16w8x: movq (%ecx),%mm0 /* mm0 holds 8 characters */ addl $8,%ecx pcmpeqb %mm1,%mm0 /* cmp byte for byte with ' ' */ /* the result flag is 00 or FF */ pand %mm2,%mm0 /* turn FF into 01, which is actually useful */ paddb %mm0,%mm3 /* add to the 8 space counters */ subl $2,%eax jnz .Lc16w8 .Lc16w8end: /* sum the 8 space counters in mm3 and add to %esi */ /* if only MMX had horizontal byte adds... */ movd %mm3,%eax push %eax add %ah, %al /* NOTE! potential overflow! */ and $0xFF,%eax add %eax,%esi pop %eax shr $16,%eax add %ah,%al /* NOTE! potential overflow! */ and $0xFF,%eax add %eax,%esi psrlq $32,%mm3 movd %mm3,%eax push %eax add %ah, %al /* NOTE! potential overflow! */ and $0xFF,%eax add %eax,%esi pop %eax shr $16,%eax add %ah,%al /* NOTE! potential overflow! 
*/ and $0xFF,%eax add %eax,%esi jmp .Lc16_mainloop /* this loop looks at one byte at a time to handle the remainder */ .Lc16w1: cmpl $0,%edx jle .Lc16z movzbl (%ecx),%eax incl %ecx decl %edx cmpl $32,%eax jne .Lc16w1 inc %esi jmp .Lc16w1 /* done, remember to clear fp/mmx tags with emms */ .Lc16z: emms addl $20,%ebp jmp *(%ebp) ==============================
participants (1)
-
Peter Firefly Brodersen Lund