{
  See https://stackoverflow.com/questions/47478010/sse2-8x8-byte-matrix-transpose-code-twice-as-slow-on-haswell-then-on-ivy-bridge

  (C) Marco van de Voort 2017, for Candela
  license: LGPL-with-static-linking exception

  FPC is unfortunately still incompatible with Delphi 64-bit assembler syntax.
  I added some provisional IFDEFS but haven't tested them yet.

  Probably needs another optimization to group 8x8 rotate blocks into 64x64
  blocks to improve loop tiling.

  Note that rotating JPG can be done (in some cases?) without decompressing,
  making it vastly superior to jpg -> bmp -> rotate -> jpg.
  Search for "nativejpg" by Nils Holm
}

{$ifdef fpc}
  {$mode delphi}
  {$asmmode intel}
{$endif}

type
  // Byte pointer used for the src/dest arguments of rot8x8. Declared for
  // every compiler: the routine's signature needs it under Delphi as well,
  // not only under FPC.
  pbbyte = ^byte;

const
  // Byte-shuffle control for pshufb: reverses the order of the 8 bytes inside
  // each 64-bit half of an xmm register (only used when "doshuf" is defined).
  inv8x8_4 : array[0..15] of byte = ( 7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8);

// rowpitch is the number of bytes to go to the next line, in delphi
//   rowpitch:= (scanline[n+1]-scanline[n])
// (this difference can be negative, e.g. for bottom-up bitmaps)

// {$define loadinverted} // used in experiments with 270 instead of 90 degrees.
{define iacamarker}       // markers for intel IACA analysis tool.
// rot8x8: processes nrystep rows of nrxstep 8x8-byte tiles.
//
// Every tile is read from src as 8 rows of 8 bytes and written to dest
// transposed (source column k becomes destination row k). When "doshuf" is
// defined, each stored 8-byte row is additionally byte-reversed through the
// inv8x8_4 mask.
//
// Parameters
//   src          : address of the first source tile.
//   dest         : address of the first destination tile; the routine steps
//                  8 bytes to the LEFT from here for every source tile row,
//                  so it must point at the right-most tile column.
//   rowpitchsrc  : bytes from one source line to the next (may be negative).
//   rowpitchdest : bytes from one destination line to the next (may be negative).
//   nrxstep      : tiles per source tile row (inner loop count).
//   nrystep      : number of source tile rows (outer loop count).
//                  Nothing is done when either count is <= 0.
//
// Register use (Win64 register parameters):
//   src rcx, dest rdx, rpsrc r8, rpdest r9
//   vol: rax,r10,r11
//   r10: nrxstep   r11: nrystep   r12: rpsrc*3   r13: rpdest*3
//   r14: src ptr of current tile row   r15: dest ptr of current tile column
//   rbx: -rpsrc
//
// Pointer walk:
//   init src  ptr at 0,0
//     outerloop: y += stepsize*rowpitch     innerloop: x += stepsize
//   init dest ptr at width-stepsize
//     outerloop: x := x-stepsize            innerloop: y := y+step*rowpitch
//
// NOTE(review): the FPC variant relies on rcx/rdx/r8/r9 still holding the
// incoming arguments when the asm block starts - confirm for the compiler
// version / optimisation settings in use.
procedure rot8x8(src,dest:pbbyte;rowpitchsrc,rowpitchdest:integer;nrxstep,nrystep:integer){$ifndef fpc} {$endif};
{$ifdef fpc}
begin
{$endif}
asm
{$ifndef fpc}
  .pushnv r12
  .pushnv r13
  .pushnv r14
  .pushnv r15
  .pushnv rbx
  .savenv xmm6
  .savenv xmm7
  .savenv xmm8
  .savenv xmm9
  .savenv xmm10
  .savenv xmm11
  .savenv xmm12
{$endif}
  mov r10d,nrxstep              // 32-bit load zero-extends into r10
  mov r11d,nrystep              // 32-bit load zero-extends into r11

  // A zero or negative tile count would make the dec/jne loops below wrap
  // around and run far past the buffers, so leave early instead.
  test r10d,r10d
  jle @done
  test r11d,r11d
  jle @done

  // The row pitches are 32-bit integers but are used as 64-bit address
  // offsets below. The upper halves of r8/r9 are not guaranteed by the
  // calling convention, and a pitch may be negative, so sign-extend first.
  movsxd r8,r8d
  movsxd r9,r9d

  mov r14,rcx
  mov r15,rdx
  mov r12,r8
  shl r12,1
  add r12,r8                    // r12 = 3*rpsrc
  mov r13,r9
  shl r13,1
  add r13,r9                    // r13 = 3*rpdest
  mov rbx,r8
  neg rbx                       // rbx = -rpsrc, used to rewind rcx after a tile
  movdqu xmm12,[rip+inv8x8_4]   // byte-reverse mask, only consumed when doshuf is defined
  // 8: rpsrc  9: rpdest  10: nrxstep  11: nrystep  12: rpsrc*3  13: rpdest*3,
  // r14 src ptr  r15 dest ptr

@outerloop:
  mov rcx,r14
  mov rdx,r15
  mov rax,r10

@innerloop:
{$ifdef iacamarker}
  // NOTE(review): this marker overwrites ebx, which holds -rpsrc here; only
  // suitable for static analysis builds, not for running the routine.
  mov ebx, 111                  // Start marker bytes
  db $64, $67, $90              // Start marker bytes
{$endif}

  // load 8x8 bytes into 4 registers
{$ifdef loadinverted}
  movq xmm6,[rcx]
  movq xmm4,[rcx+r8]
  movq xmm7,[rcx+2*r8]
  movq xmm1,[rcx+r12]
{$else}
  movq xmm1,[rcx]
  movq xmm7,[rcx+r8]
  movq xmm4,[rcx+2*r8]
  movq xmm6,[rcx+r12]
{$endif}

  PUNPCKLBW xmm1,xmm4           // 0 2 0 2 0 2
  PUNPCKLBW xmm7,xmm6           // 1 3 1 3 1 3
  movdqa xmm2,xmm1
  punpcklbw xmm1,xmm7           // 0 1 2 3 0 1 2 3  (columns 0..3)
  punpckhbw xmm2,xmm7           // same interleave for columns 4..7

  lea rcx,[rcx+4*r8]            // advance to source rows 4..7

  // same for 4..7
{$ifdef loadinverted}
  movq xmm3,[rcx+r12]
  movq xmm5,[rcx+r8]
{$else}
  movq xmm3,[rcx]
  movq xmm5,[rcx+2*r8]
{$endif}
  PUNPCKLBW xmm3,xmm5

{$ifdef loadinverted}
  movq xmm7,[rcx+2*r8]
  movq xmm8,[rcx]
{$else}
  movq xmm7,[rcx+r8]
  movq xmm8,[rcx+r12]
{$endif}
  PUNPCKLBW xmm7,xmm8

  movdqa xmm4,xmm3
  punpcklbw xmm3,xmm7
  punpckhbw xmm4,xmm7

  // destination rows 0 and 1
{$ifdef loadinverted}
  movdqa xmm5, xmm3
  punpckldq xmm5, xmm1          // xmm5 = [ a i b j ]
{$else}
  movdqa xmm5, xmm1
  punpckldq xmm5, xmm3          // xmm5 = [ a i b j ]
{$endif}
{$ifdef doshuf}
  pshufb xmm5,xmm12
{$endif}
  movq [rdx], xmm5
  movhps [rdx+r9], xmm5         // still a pure store, doesn't cost a shuffle

  // destination rows 2 and 3
{$ifdef loadinverted}
  movdqa xmm5,xmm3
  punpckhdq xmm5, xmm1          // xmm5 = [ a i b j ]
{$else}
  movdqa xmm5,xmm1
  punpckhdq xmm5, xmm3          // xmm5 = [ a i b j ]
{$endif}
{$ifdef doshuf}
  pshufb xmm5,xmm12
{$endif}
  movq [rdx+2*r9], xmm5
  movhps [rdx+r13], xmm5        // still a pure store, doesn't cost a shuffle

  lea rdx,[rdx+4*r9]            // advance to destination rows 4..7

  // destination rows 4 and 5
{$ifdef loadinverted}
  movdqa xmm5, xmm4
  punpckldq xmm5, xmm2          // xmm5 = [ a i b j ]
{$else}
  movdqa xmm5, xmm2
  punpckldq xmm5, xmm4          // xmm5 = [ a i b j ]
{$endif}
{$ifdef doshuf}
  pshufb xmm5,xmm12
{$endif}
  movq [rdx], xmm5
  movhps [rdx+r9], xmm5         // still a pure store, doesn't cost a shuffle

  // destination rows 6 and 7
{$ifdef loadinverted}
  movdqa xmm5,xmm4
  punpckhdq xmm5, xmm2          // xmm5 = [ a i b j ]
{$else}
  movdqa xmm5,xmm2
  punpckhdq xmm5, xmm4          // xmm5 = [ a i b j ]
{$endif}
{$ifdef doshuf}
  pshufb xmm5,xmm12
{$endif}
  movq [rdx+2*r9], xmm5
  movhps [rdx+r13], xmm5        // still a pure store, doesn't cost a shuffle

  // rcx undo add: rbx = -r8, so 4*rbx takes back the 4-row advance above
  lea rcx,[rcx+4*rbx+8]         // correct pointer and move 8 to right.
  lea rdx,[rdx+4*r9]            // also jump over second block of 4

{$ifdef iacamarker}
  mov ebx, 222                  // End marker bytes
  db $64, $67, $90              // End marker bytes
{$endif}

  dec rax
  jne @innerloop

  lea r14,[r14+r8*8]            // next source tile row: 8 lines down
  lea r15,[r15-8]               // next destination tile column: 8 bytes left
  dec r11
  jne @outerloop

@done:
{$ifdef fpc}
end['r12','r13','r14','r15','rbx','xmm6','xmm7','xmm8','xmm9','xmm10','xmm11','xmm12'];
{$endif}
end;

begin
end.