{See
https://stackoverflow.com/questions/47478010/sse2-8x8-byte-matrix-transpose-code-twice-as-slow-on-haswell-then-on-ivy-bridge

(C) Marco van de Voort 2017, for Candela
license: LGPL-with-static-linking exception

The base trick is to punpck three times (byte->word twice, then dword->qword) to
group pixels in 8-byte groups. This is better than a bunch of shuffles (SHUF).

FPC is unfortunately still incompatible with Delphi 64-bit assembler syntax. I
added some provisional IFDEFS but haven't tested them yet.

Probably needs another optimization: grouping the 8x8 rotate blocks into 64x64
blocks to improve loop tiling.

Note that rotating JPG can be done (in some cases?) without decompressing,
making it vastly superior to jpg ->bmp -> rotate ->  jpg.

Search for "nativejpg" by Nils Holm

}

{$ifdef fpc}
{ Free Pascal only: select the Delphi language dialect and Intel operand order
  for the inline assembler. }
{$mode delphi}
{$asmmode intel}
{ Pointer-to-byte type used for the src/dest parameters of rot8x8; it is
  declared here only when compiling with FPC. }
type pbbyte = ^byte;
{$endif}

// rowpitch is the size, in bytes, of the step from one scanline to the next.

// Byte-index table (7..0, 15..8) that rot8x8 loads into xmm12 and uses as the
// pshufb control operand when doshuf is defined: as a pshufb mask it reverses
// the byte order inside each 8-byte half of an XMM register.
const    inv8x8_4  :  array[0..15] of byte = ( 7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8);

// rowpitch is the number of bytes to go to the next line, in delphi
// rowpitch:= (scanline[n+1]-scanline[n])
//

{$define loadinverted} // used in experiments with 270 instead of 90 degrees.
{define iacamarker}  // disabled (plain comment, no '$'): markers for the Intel IACA analysis tool.

// rot8x8: copies a grid of 8x8-byte tiles from src to dest, transposing each
// tile with three rounds of punpck (byte, byte, dword).
//
// With loadinverted defined, destination row c of a tile receives source
// column c with the source rows in reverse order (row 7 first, row 0 last).
// Without it the tile is a plain transpose.
//
//   src, dest     : first byte of the first source / destination tile
//   rowpitchsrc   : signed byte distance between consecutive source rows
//   rowpitchdest  : signed byte distance between consecutive destination rows
//   nrxstep       : number of tiles processed per inner loop (source moves
//                   8 bytes right, destination moves 8 rows down per tile)
//   nrystep       : number of outer iterations (source moves 8 rows down,
//                   destination moves 8 bytes left per iteration)
//
// Nothing is written when nrxstep <= 0 or nrystep <= 0.
//
// Register use (Win64 register convention):
//   in : rcx = src, rdx = dest, r8d = rowpitchsrc, r9d = rowpitchdest,
//        nrxstep / nrystep on the stack
//   volatile scratch: rax, r10, r11
//   r8  = rowpitchsrc (sign-extended)    r9  = rowpitchdest (sign-extended)
//   r10 = nrxstep                        r11 = remaining outer iterations
//   r12 = 3*rowpitchsrc                  r13 = 3*rowpitchdest
//   r14 = src pointer of current tile row
//   r15 = dest pointer of current tile column
//   rbx = -rowpitchsrc                   xmm12 = inv8x8_4 (used only with doshuf)
//
// Traversal:
//   src  starts at 0,0;            outer loop: y := y + 8*rowpitch, inner loop: x := x + 8
//   dest starts at width-8 (set up by the caller);
//                                  outer loop: x := x - 8, inner loop: y := y + 8*rowpitch
procedure rot8x8(src,dest:pbbyte;rowpitchsrc,rowpitchdest:integer;nrxstep,nrystep:integer);
{$ifdef fpc}
begin
{$endif}
asm
 {$ifndef fpc}
  .pushnv r12
  .pushnv r13
  .pushnv r14
  .pushnv r15
  .pushnv rbx
  .savenv xmm6
  .savenv xmm7
  .savenv xmm8
  .savenv xmm9
  .savenv xmm10
  .savenv xmm11
  .savenv xmm12
 {$endif}
  mov r10d,nrxstep
  mov r11d,nrystep

  // A zero or negative count would make the dec/jne loops below wrap around
  // and run far past the buffers, so bail out before touching memory.
  test r10d,r10d
  jle @done
  test r11d,r11d
  jle @done

  // The pitches are 32-bit integers; the upper halves of r8/r9 are not
  // guaranteed by the calling convention. Sign-extend so that negative
  // pitches (bottom-up bitmaps) address correctly in 64-bit arithmetic.
  movsxd r8,r8d
  movsxd r9,r9d

  mov r14,rcx
  mov r15,rdx
  mov r12,r8
  shl r12,1
  add r12,r8   // r12 = 3*rpsrc
  mov r13,r9
  shl r13,1
  add r13,r9   // r13 = 3*rpdest
  mov rbx,r8
  neg rbx      // rbx = -rpsrc, used to rewind rcx after the second group of 4 rows
  movdqu xmm12,[rip+inv8x8_4]

// r8: rpsrc  r9: rpdest  r10: nrxstep  r11: nrystep  r12: rpsrc*3  r13: rpdest*3
// r14: src ptr  r15: dest ptr

@outerloop:
  mov  rcx,r14          // rcx = running src pointer for this row of tiles
  mov  rdx,r15          // rdx = running dest pointer for this column of tiles
  mov  rax,r10          // rax = tiles left in this row

@innerloop:

  // NOTE: the IACA markers overwrite ebx, which holds -rpsrc in this routine;
  // a build with iacamarker defined would compute wrong source addresses if
  // it were executed.
  {$ifdef iacamarker}
        mov ebx, 111          // Start marker bytes
       db $64, $67, $90   // Start marker bytes
{$endif}

  // load rows 0..3 (8 bytes each) into 4 registers
  {$ifdef loadinverted}
  movq xmm6,[rcx]
  movq xmm4,[rcx+r8]
  movq xmm7,[rcx+2*r8]
  movq xmm1,[rcx+r12]
  {$else}
  movq xmm1,[rcx]
  movq xmm7,[rcx+r8]
  movq xmm4,[rcx+2*r8]
  movq xmm6,[rcx+r12]
  {$endif}
  PUNPCKLBW xmm1,xmm4   // 0 2 0 2 0 2
  PUNPCKLBW xmm7,xmm6   // 1 3 1 3 1 3

  movdqa xmm2,xmm1
  punpcklbw xmm1,xmm7   // 0 1 2 3 0 1 2 3   (columns 0..3, one dword per column)
  punpckhbw xmm2,xmm7   //                   (columns 4..7)

  lea rcx,[rcx+4*r8]    // advance to row 4
  // same for rows 4..7
  {$ifdef loadinverted}
  movq xmm3,[rcx+r12]
  movq xmm5,[rcx+r8]
  {$else}
  movq xmm3,[rcx]
  movq xmm5,[rcx+2*r8]
  {$endif}
  PUNPCKLBW xmm3,xmm5
  {$ifdef loadinverted}
  movq xmm7,[rcx+2*r8]
  movq xmm8,[rcx]
  {$else}
  movq xmm7,[rcx+r8]
  movq xmm8,[rcx+r12]
  {$endif}
  PUNPCKLBW xmm7,xmm8

  movdqa xmm4,xmm3
  punpcklbw xmm3,xmm7   // columns 0..3 of rows 4..7
  punpckhbw xmm4,xmm7   // columns 4..7 of rows 4..7

  // Combine the two 4-row halves: each qword is one complete output row.
  // Output rows 0 and 1:
  {$ifdef loadinverted}
  movdqa    xmm5, xmm3
  punpckldq xmm5, xmm1     // xmm5 = [ a i b j ]
  {$else}
  movdqa    xmm5, xmm1
  punpckldq xmm5, xmm3     // xmm5 = [ a i b j ]
  {$endif}
  {$ifdef doshuf}
    pshufb   xmm5,xmm12
  {$endif}

  movq     [rdx], xmm5
  movhps   [rdx+r9], xmm5  // still a pure store, doesn't cost a shuffle

  // Output rows 2 and 3:
  {$ifdef loadinverted}
  movdqa   xmm5,xmm3
  punpckhdq xmm5, xmm1     // xmm5 = [ a i b j ]
  {$else}
    movdqa   xmm5,xmm1
  punpckhdq xmm5, xmm3     // xmm5 = [ a i b j ]
  {$endif}
  {$ifdef doshuf}
    pshufb   xmm5,xmm12
  {$endif}

  movq     [rdx+2*r9], xmm5
  movhps   [rdx+r13], xmm5  // still a pure store, doesn't cost a shuffle

  lea     rdx,[rdx+4*r9]    // advance dest to output row 4

  // Output rows 4 and 5:
  {$ifdef loadinverted}
  movdqa    xmm5, xmm4
  punpckldq xmm5, xmm2     // xmm5 = [ a i b j ]
  {$else}
  movdqa    xmm5, xmm2
  punpckldq xmm5, xmm4     // xmm5 = [ a i b j ]

  {$endif}
  {$ifdef doshuf}
    pshufb   xmm5,xmm12
  {$endif}
  movq     [rdx], xmm5
  movhps   [rdx+r9], xmm5  // still a pure store, doesn't cost a shuffle

  // Output rows 6 and 7:
  {$ifdef loadinverted}
  movdqa   xmm5,xmm4
  punpckhdq xmm5, xmm2     // xmm5 = [ a i b j ]
    {$else}
  movdqa   xmm5,xmm2
  punpckhdq xmm5, xmm4     // xmm5 = [ a i b j ]

  {$endif}
  {$ifdef doshuf}
    pshufb   xmm5,xmm12
  {$endif}
  movq     [rdx+2*r9], xmm5
  movhps   [rdx+r13], xmm5  // still a pure store, doesn't cost a shuffle

  // Step to the next tile:
  //   src  : undo the 4-row advance (rbx = -r8) and move 8 bytes to the right
  //   dest : skip the second group of 4 rows, i.e. 8 rows down in total
  lea       rcx,[rcx+4*rbx+8] // correct pointer and move 8 to right.
  lea       rdx,[rdx+4*r9]   // also jump over second block of 4
{$ifdef iacamarker}
    mov ebx, 222          // End marker bytes
    db $64, $67, $90   // End marker bytes
{$endif}

  dec      rax
  jne @innerloop

  lea  r14,[r14+r8*8]   // src: next row of tiles, 8 rows down
  lea  r15,[r15-8]      // dest: next column of tiles, 8 bytes to the left

  dec r11
  jne @outerloop

@done:
{$ifdef fpc}
  end['r12','r13','r14','r15','rbx','xmm6','xmm7','xmm8','xmm9','xmm10','xmm11','xmm12'];
{$endif}
end;

// Empty main block: no statements are executed here.
begin
end.