Revision 3f87f39c

View differences:

libavcodec/x86/fft_mmx.asm
@@ -457,7 +457,7 @@
 
 ; On x86_32, this function does the register saving and restoring for all of fft.
 ; The others pass args in registers and don't spill anything.
-cglobal fft_dispatch%3%2, 2,5,0, z, nbits
+cglobal fft_dispatch%3%2, 2,5,8, z, nbits
     lea r2, [dispatch_tab%3%2 GLOBAL]
     mov r2, [r2 + (nbitsq-2)*gprsize]
     call r2
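
Note on the change above: the third numeric parameter of cglobal now declares how many xmm registers the function uses (8 instead of 0), so the rewritten PROLOGUE in x86inc.asm can spill the callee-saved xmm6 and up on WIN64 and RET can restore them. A minimal, hypothetical declaration in the new form (foo and its arguments are illustrative only, not part of this revision):

    INIT_XMM
    cglobal foo, 2,3,8, dst, src, tmp ; 2 args, 3 gp regs, up to 8 xmm regs (m0..m7)
        movdqa  m0, [r1]              ; function body
        movdqa  [r0], m0
        RET                           ; restores any xmm regs spilled by the prologue (WIN64)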
libavcodec/x86/h264_deblock_sse2.asm
@@ -278,7 +278,7 @@
 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_XMM
-cglobal x264_deblock_v_luma_sse2
+cglobal x264_deblock_v_luma_sse2, 5,5,10
     movd    m8, [r4] ; tc0
     lea     r4, [r1*3]
     dec     r2d        ; alpha-1
@@ -318,54 +318,66 @@
     DEBLOCK_P0_Q0
     mova    [r4+2*r1], m1
     mova    [r0], m2
-    ret
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal x264_deblock_h_luma_sse2
-    movsxd r10, esi
+cglobal x264_deblock_h_luma_sse2, 5,7
+    movsxd r10, r1d
     lea    r11, [r10+r10*2]
-    lea    rax, [r0-4]
-    lea    r9,  [r0-4+r11]
+    lea    r6,  [r0-4]
+    lea    r5,  [r0-4+r11]
+%ifdef WIN64
+    sub    rsp, 0x98
+    %define pix_tmp rsp+0x30
+%else
     sub    rsp, 0x68
     %define pix_tmp rsp
+%endif
 
     ; transpose 6x16 -> tmp space
-    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp
-    lea    rax, [rax+r10*8]
-    lea    r9,  [r9 +r10*8]
-    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
+    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
+    lea    r6, [r6+r10*8]
+    lea    r5, [r5+r10*8]
+    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
 
     ; vertical filter
     ; alpha, beta, tc0 are still in r2d, r3d, r4
-    ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
     lea    r0, [pix_tmp+0x30]
-    mov    esi, 0x10
+    mov    r1d, 0x10
+%ifdef WIN64
+    mov    [rsp+0x20], r4
+%endif
     call   x264_deblock_v_luma_sse2
 
     ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
-    add    rax, 2
-    add    r9,  2
+    add    r6, 2
+    add    r5, 2
     movq   m0, [pix_tmp+0x18]
     movq   m1, [pix_tmp+0x28]
     movq   m2, [pix_tmp+0x38]
     movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
 
     shl    r10, 3
-    sub    rax, r10
-    sub    r9,  r10
+    sub    r6,  r10
+    sub    r5,  r10
     shr    r10, 3
     movq   m0, [pix_tmp+0x10]
     movq   m1, [pix_tmp+0x20]
     movq   m2, [pix_tmp+0x30]
     movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
 
+%ifdef WIN64
+    add    rsp, 0x98
+%else
     add    rsp, 0x68
-    ret
+%endif
+    RET
 
 %else
 
@@ -388,7 +400,7 @@
     mova    m3, [r0+r1]   ; q1
     LOAD_MASK r2, r3
 
-    mov     r3, r4m
+    mov     r3, r4mp
     movd    m4, [r3] ; tc0
     punpcklbw m4, m4
     punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
@@ -428,7 +440,7 @@
 ;-----------------------------------------------------------------------------
 INIT_MMX
 cglobal x264_deblock_h_luma_%1, 0,5
-    mov    r0, r0m
+    mov    r0, r0mp
     mov    r3, r1m
     lea    r4, [r3*3]
     sub    r0, 4
@@ -459,7 +471,7 @@
     ADD    esp, 20
 
     ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
-    mov    r0, r0m
+    mov    r0, r0mp
     sub    r0, 2
     lea    r1, [r0+r4]
 
@@ -607,7 +619,7 @@
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6
+cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
 %ifndef ARCH_X86_64
     sub     esp, 0x60
 %endif
@@ -669,34 +681,34 @@
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_intra_%1
+cglobal x264_deblock_h_luma_intra_%1, 4,7
     movsxd r10, r1d
     lea    r11, [r10*3]
-    lea    rax, [r0-4]
-    lea    r9,  [r0-4+r11]
+    lea    r6,  [r0-4]
+    lea    r5,  [r0-4+r11]
     sub    rsp, 0x88
     %define pix_tmp rsp
 
     ; transpose 8x16 -> tmp space
-    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
-    lea    rax, [rax+r10*8]
-    lea    r9,  [r9+r10*8]
-    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+    lea    r6, [r6+r10*8]
+    lea    r5, [r5+r10*8]
+    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
 
     lea    r0,  [pix_tmp+0x40]
     mov    r1,  0x10
     call   x264_deblock_v_luma_intra_%1
 
     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
-    lea    r9, [rax+r11]
-    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+    lea    r5, [r6+r11]
+    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
     shl    r10, 3
-    sub    rax, r10
-    sub    r9,  r10
+    sub    r6,  r10
+    sub    r5,  r10
     shr    r10, 3
-    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
     add    rsp, 0x88
-    ret
+    RET
 %else
 cglobal x264_deblock_h_luma_intra_%1, 2,4
     lea    r3,  [r1*3]
@@ -725,7 +737,7 @@
     ADD    esp, 16
 
     mov    r1,  r1m
-    mov    r0,  r0m
+    mov    r0,  r0mp
     lea    r3,  [r1*3]
     sub    r0,  4
     lea    r2,  [r0+r3]
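
Note on the changes above: besides RET replacing the bare ret (so the new common epilogue in x86inc.asm runs), the 64-bit wrappers now use the abstracted r5/r6 instead of raw rax/r9, declare their register needs on the cglobal line so the prologue can do whatever saving and argument loading the calling convention requires, and reload stack arguments with the new rNmp form. A small hypothetical fragment illustrating rNmp (not taken from this revision):

    ; assuming a function declared with something like "cglobal foo, 5,5":
    mov     r3, r4mp   ; reload argument 4 from its original location (register
                       ; or stack) at native size: qword on x86_64, dword on x86_32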
libavcodec/x86/h264_idct_sse2.asm
@@ -31,15 +31,8 @@
 
 SECTION .text
 
-%macro IDCT4_1D 6
-    SUMSUB_BA   m%3, m%1
-    SUMSUBD2_AB m%2, m%4, m%6, m%5
-    SUMSUB_BADC m%2, m%3, m%5, m%1
-    SWAP %1, %2, %5, %4, %3
-%endmacro
-
 INIT_XMM
-cglobal x264_add8x4_idct_sse2, 3,3
+cglobal x264_add8x4_idct_sse2, 3,3,8
     movq   m0, [r1+ 0]
     movq   m1, [r1+ 8]
     movq   m2, [r1+16]
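
Note: the file-local IDCT4_1D macro is dropped here; x86util.asm (later in this revision) gains a more general IDCT4_1D that can work either entirely in registers or with a memory temporary, and the cglobal line now declares the 8 xmm registers the function uses. A hypothetical register-form invocation of the shared macro (illustrative only, not part of this revision):

    IDCT4_1D 0, 1, 2, 3, 4, 5  ; one 1-D transform pass over m0-m3, m4/m5 as temporaries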
libavcodec/x86/x86inc.asm
@@ -20,6 +20,14 @@
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
+%ifdef ARCH_X86_64
+    %ifidn __OUTPUT_FORMAT__,win32
+        %define WIN64
+    %else
+        %define UNIX64
+    %endif
+%endif
+
 ; FIXME: All of the 64bit asm functions that take a stride as an argument
 ; via register, assume that the high dword of that register is filled with 0.
 ; This is true in practice (since we never do any 64bit arithmetic on strides,
@@ -28,68 +36,39 @@
 ; Name of the .rodata section.
 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
 ; so use a different read-only section.
-%macro SECTION_RODATA 0
+%macro SECTION_RODATA 0-1 16
     %ifidn __OUTPUT_FORMAT__,macho64
-        SECTION .text align=16
+        SECTION .text align=%1
     %elifidn __OUTPUT_FORMAT__,macho
-        SECTION .text align=16
+        SECTION .text align=%1
         fakegot:
     %else
-        SECTION .rodata align=16
+        SECTION .rodata align=%1
     %endif
 %endmacro
 
-; PIC support macros. All these macros are totally harmless when PIC is
-; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
-; objects cannot directly access global variables by address, they need to
-; go through the GOT (global offset table). Most OSes do not care about it
-; and let you load non-shared .so objects (Linux, Win32...). However, OS X
-; requires PIC code in its .dylib objects.
-;
-; - GLOBAL should be used as a suffix for global addressing, eg.
-;     picgetgot ebx
+; PIC support macros.
+; x86_64 can't fit 64bit address literals in most instruction types,
+; so shared objects (under the assumption that they might be anywhere
+; in memory) must use an address mode that does fit.
+; So all accesses to global variables must use this macro, e.g.
 ;     mov eax, [foo GLOBAL]
 ;   instead of
 ;     mov eax, [foo]
 ;
-; - picgetgot computes the GOT address into the given register in PIC
-;   mode, otherwise does nothing. You need to do this before using GLOBAL.
-;   Before in both execution order and compiled code order (so GLOBAL knows
-;   which register the GOT is in).
-
-%ifndef PIC
-    %define GLOBAL
-    %macro picgetgot 1
-    %endmacro
-%elifdef ARCH_X86_64
-    %define PIC64
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
+
+%ifdef WIN64
+    %define PIC
+%elifndef ARCH_X86_64
+    %undef PIC
+%endif
+%ifdef PIC
     %define GLOBAL wrt rip
-    %macro picgetgot 1
-    %endmacro
 %else
-    %define PIC32
-    %ifidn __OUTPUT_FORMAT__,macho
-        ; There is no real global offset table on OS X, but we still
-        ; need to reference our variables by offset.
-        %macro picgetgot 1
-            call %%getgot
-          %%getgot:
-            pop %1
-            add %1, $$ - %%getgot
-            %undef GLOBAL
-            %define GLOBAL + %1 - fakegot
-        %endmacro
-    %else ; elf
-        extern _GLOBAL_OFFSET_TABLE_
-        %macro picgetgot 1
-            call %%getgot
-          %%getgot:
-            pop %1
-            add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
-            %undef GLOBAL
-            %define GLOBAL + %1 wrt ..gotoff
-        %endmacro
-    %endif
+    %define GLOBAL
 %endif
 
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
@@ -99,14 +78,14 @@
 
 ; PROLOGUE:
 ; %1 = number of arguments. loads them from stack if needed.
-; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
-; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
 ; %4 = list of names to define to registers
 ; PROLOGUE can also be invoked by adding the same options to cglobal
 
 ; e.g.
-; cglobal foo, 2,3,0, dst, src, tmp
-; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals
+; cglobal foo, 2,3, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
 
 ; TODO Some functions can use some args directly from the stack. If they're the
 ; last args then you can just not declare them, but if they're in the middle
@@ -119,12 +98,25 @@
 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
 ; which are slow when a normal ret follows a branch.
 
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
 %macro DECLARE_REG 6
     %define r%1q %2
     %define r%1d %3
     %define r%1w %4
     %define r%1b %5
     %define r%1m %6
+    %ifid %6 ; i.e. it's a register
+        %define r%1mp %2
+    %elifdef ARCH_X86_64 ; memory
+        %define r%1mp qword %6
+    %else
+        %define r%1mp dword %6
+    %endif
     %define r%1  %2
 %endmacro
 
@@ -150,6 +142,29 @@
 DECLARE_REG_SIZE di, dil
 DECLARE_REG_SIZE bp, bpl
 
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+    %rep %0
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
+
 %ifdef ARCH_X86_64
     %define gprsize 8
 %else
@@ -224,8 +239,7 @@
     %assign n_arg_names %%i
 %endmacro
 
-%ifdef ARCH_X86_64 ;==========================================================
-%ifidn __OUTPUT_FORMAT__,win32
+%ifdef WIN64 ; Windows x64 ;=================================================
 
 DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
 DECLARE_REG 1, rdx, edx, dx,  dl,  edx
@@ -239,11 +253,75 @@
 
 %macro LOAD_IF_USED 2 ; reg_id, number_of_args
     %if %1 < %2
-        mov r%1, [rsp + 8 + %1*8]
+        mov r%1, [rsp + stack_offset + 8 + %1*8]
+    %endif
+%endmacro
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+    ASSERT %2 >= %1
+    %assign regs_used %2
+    ASSERT regs_used <= 7
+    %if %0 > 2
+        %assign xmm_regs_used %3
+    %else
+        %assign xmm_regs_used 0
+    %endif
+    ASSERT xmm_regs_used <= 16
+    %if regs_used > 4
+        push r4
+        push r5
+        %assign stack_offset stack_offset+16
+    %endif
+    %if xmm_regs_used > 6
+        sub rsp, (xmm_regs_used-6)*16+16
+        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
+        %endrep
+    %endif
+    LOAD_IF_USED 4, %1
+    LOAD_IF_USED 5, %1
+    LOAD_IF_USED 6, %1
+    DEFINE_ARGS %4
+%endmacro
+
+%macro RESTORE_XMM_INTERNAL 1
+    %if xmm_regs_used > 6
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
+        %endrep
+        add %1, (xmm_regs_used-6)*16+16
+    %endif
+%endmacro
+
+%macro RESTORE_XMM 1
+    RESTORE_XMM_INTERNAL %1
+    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+    %assign xmm_regs_used 0
+%endmacro
+
+%macro RET 0
+    RESTORE_XMM_INTERNAL rsp
+    %if regs_used > 4
+        pop r5
+        pop r4
     %endif
+    ret
 %endmacro
 
-%else ;=======================================================================
+%macro REP_RET 0
+    %if regs_used > 4 || xmm_regs_used > 6
+        RET
+    %else
+        rep ret
+    %endif
+%endmacro
+
+%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
 
 DECLARE_REG 0, rdi, edi, di,  dil, edi
 DECLARE_REG 1, rsi, esi, si,  sil, esi
@@ -261,16 +339,9 @@
     %endif
 %endmacro
 
-%endif ; !WIN64
-
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
     ASSERT %2 >= %1
     ASSERT %2 <= 7
-    %assign stack_offset 0
-%ifidn __OUTPUT_FORMAT__,win32
-    LOAD_IF_USED 4, %1
-    LOAD_IF_USED 5, %1
-%endif
     LOAD_IF_USED 6, %1
     DEFINE_ARGS %4
 %endmacro
@@ -315,15 +386,9 @@
     %endif
 %endmacro
 
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, arg_names...
     ASSERT %2 >= %1
-    %assign stack_offset 0
     %assign regs_used %2
-    %ifdef PIC
-    %if %3
-        %assign regs_used regs_used+1
-    %endif
-    %endif
     ASSERT regs_used <= 7
     PUSH_IF_USED 3
     PUSH_IF_USED 4
@@ -336,9 +401,6 @@
     LOAD_IF_USED 4, %1
     LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
-    %if %3
-        picgetgot r%2
-    %endif
     DEFINE_ARGS %4
 %endmacro
 
@@ -382,6 +444,7 @@
     align function_align
     %1:
     RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+    %assign stack_offset 0
     %if %0 > 1
         PROLOGUE %2
     %endif
@@ -389,11 +452,9 @@
 
 %macro cextern 1
     %ifdef PREFIX
-        extern _%1
-        %define %1 _%1
-    %else
-        extern %1
+        %xdefine %1 _%1
     %endif
+    extern %1
 %endmacro
 
 ; This is needed for ELF, otherwise the GNU linker assumes the stack is
@@ -523,6 +584,7 @@
     %assign %%i 0
     %rep num_mmregs
     CAT_XDEFINE m, %%i, %1_m %+ %%i
+    CAT_XDEFINE n, m %+ %%i, %%i
    %assign %%i %%i+1
     %endrep
 %endmacro
@@ -534,7 +596,30 @@
     %endif
 %endmacro
 
-; substitutions which are functionally identical but reduce code size
+;Substitutions that reduce instruction size but are functionally equivalent
 %define movdqa movaps
 %define movdqu movups
 
+%macro add 2
+    %ifnum %2
+        %if %2==128
+            sub %1, -128
+        %else
+            add %1, %2
+        %endif
+    %else
+        add %1, %2
+    %endif
+%endmacro
+
+%macro sub 2
+    %ifnum %2
+        %if %2==128
+            add %1, -128
+        %else
+            sub %1, %2
+        %endif
+    %else
+        sub %1, %2
+    %endif
+%endmacro
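
Note on the new add/sub overrides at the end of x86inc.asm: +128 does not fit in a sign-extended 8-bit immediate but -128 does, so rewriting an addition of 128 as a subtraction of -128 (and vice versa) lets the assembler pick the short imm8 encoding instead of imm32, saving three bytes per instruction without changing the result. Hypothetical expansions (not part of this revision):

    add r0, 128   ; emitted as  sub r0, -128  (imm8 form)
    sub r1, 128   ; emitted as  add r1, -128
    add r2, 64    ; left alone, 64 already fits in an imm8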
libavcodec/x86/x86util.asm
@@ -93,7 +93,7 @@
     SBUTTERFLY qdq, %4, %8, %2
     SWAP %2, %5
     SWAP %4, %7
-%if 0<11
+%if %0<11
     movdqa m%5, %10
 %endif
 %endif
@@ -165,28 +165,203 @@
     palignr %1, %2, %3
 %endmacro
 
-%macro SUMSUB_BA 2
+%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
+%ifnum %5
+    mova   m%1, m%5
+    mova   m%3, m%5
+%else
+    mova   m%1, %5
+    mova   m%3, m%1
+%endif
+    pand   m%1, m%2 ; dst .. y6 .. y4
+    pand   m%3, m%4 ; src .. y6 .. y4
+    psrlw  m%2, 8   ; dst .. y7 .. y5
+    psrlw  m%4, 8   ; src .. y7 .. y5
+%endmacro
+
+%macro SUMSUB_BA 2-3
+%if %0==2
     paddw   %1, %2
     paddw   %2, %2
     psubw   %2, %1
+%else
+    mova    %3, %1
+    paddw   %1, %2
+    psubw   %2, %3
+%endif
 %endmacro
 
-%macro SUMSUB_BADC 4
+%macro SUMSUB_BADC 4-5
+%if %0==5
+    SUMSUB_BA %1, %2, %5
+    SUMSUB_BA %3, %4, %5
+%else
     paddw   %1, %2
     paddw   %3, %4
     paddw   %2, %2
     paddw   %4, %4
     psubw   %2, %1
     psubw   %4, %3
+%endif
 %endmacro
 
-%macro HADAMARD8_1D 8
-    SUMSUB_BADC %1, %5, %2, %6
-    SUMSUB_BADC %3, %7, %4, %8
+%macro HADAMARD4_V 4+
+    SUMSUB_BADC %1, %2, %3, %4
     SUMSUB_BADC %1, %3, %2, %4
-    SUMSUB_BADC %5, %7, %6, %8
+%endmacro
+
+%macro HADAMARD8_V 8+
     SUMSUB_BADC %1, %2, %3, %4
     SUMSUB_BADC %5, %6, %7, %8
+    SUMSUB_BADC %1, %3, %2, %4
+    SUMSUB_BADC %5, %7, %6, %8
+    SUMSUB_BADC %1, %5, %2, %6
+    SUMSUB_BADC %3, %7, %4, %8
+%endmacro
+
+%macro TRANS_SSE2 5-6
+; TRANSPOSE2x2
+; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
+; %2: ord/unord (for compat with sse4, unused)
+; %3/%4: source regs
+; %5/%6: tmp regs
+%ifidn %1, d
+%define mask [mask_10 GLOBAL]
+%define shift 16
+%elifidn %1, q
+%define mask [mask_1100 GLOBAL]
+%define shift 32
+%endif
+%if %0==6 ; less dependency if we have two tmp
+    mova   m%5, mask   ; ff00
+    mova   m%6, m%4    ; x5x4
+    psll%1 m%4, shift  ; x4..
+    pand   m%6, m%5    ; x5..
+    pandn  m%5, m%3    ; ..x0
+    psrl%1 m%3, shift  ; ..x1
+    por    m%4, m%5    ; x4x0
+    por    m%3, m%6    ; x5x1
+%else ; more dependency, one insn less. sometimes faster, sometimes not
+    mova   m%5, m%4    ; x5x4
+    psll%1 m%4, shift  ; x4..
+    pxor   m%4, m%3    ; (x4^x1)x0
+    pand   m%4, mask   ; (x4^x1)..
+    pxor   m%3, m%4    ; x4x0
+    psrl%1 m%4, shift  ; ..(x1^x4)
+    pxor   m%5, m%4    ; x5x1
+    SWAP   %4, %3, %5
+%endif
+%endmacro
+
+%macro TRANS_SSE4 5-6 ; see above
+%ifidn %1, d
+    mova   m%5, m%3
+%ifidn %2, ord
+    psrl%1 m%3, 16
+%endif
+    pblendw m%3, m%4, 10101010b
+    psll%1 m%4, 16
+%ifidn %2, ord
+    pblendw m%4, m%5, 01010101b
+%else
+    psrl%1 m%5, 16
+    por    m%4, m%5
+%endif
+%elifidn %1, q
+    mova   m%5, m%3
+    shufps m%3, m%4, 10001000b
+    shufps m%5, m%4, 11011101b
+    SWAP   %4, %5
+%endif
+%endmacro
+
+%macro HADAMARD 5-6
+; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
+; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
+; %3/%4: regs
+; %5(%6): tmpregs
+%if %1!=0 ; have to reorder stuff for horizontal op
+    %ifidn %2, sumsub
+         %define ORDER ord
+         ; sumsub needs order because a-b != b-a unless a=b
+    %else
+         %define ORDER unord
+         ; if we just max, order doesn't matter (allows pblendw+or in sse4)
+    %endif
+    %if %1==1
+         TRANS d, ORDER, %3, %4, %5, %6
+    %elif %1==2
+         %if mmsize==8
+             SBUTTERFLY dq, %3, %4, %5
+         %else
+             TRANS q, ORDER, %3, %4, %5, %6
+         %endif
+    %elif %1==4
+         SBUTTERFLY qdq, %3, %4, %5
+    %endif
+%endif
+%ifidn %2, sumsub
+    SUMSUB_BA m%3, m%4, m%5
+%else
+    %ifidn %2, amax
+        %if %0==6
+            ABS2 m%3, m%4, m%5, m%6
+        %else
+            ABS1 m%3, m%5
+            ABS1 m%4, m%5
+        %endif
+    %endif
+    pmaxsw m%3, m%4
+%endif
+%endmacro
+
+
+%macro HADAMARD2_2D 6-7 sumsub
+    HADAMARD 0, sumsub, %1, %2, %5
+    HADAMARD 0, sumsub, %3, %4, %5
+    SBUTTERFLY %6, %1, %2, %5
+%ifnum %7
+    HADAMARD 0, amax, %1, %2, %5, %7
+%else
+    HADAMARD 0, %7, %1, %2, %5
+%endif
+    SBUTTERFLY %6, %3, %4, %5
+%ifnum %7
+    HADAMARD 0, amax, %3, %4, %5, %7
+%else
+    HADAMARD 0, %7, %3, %4, %5
+%endif
+%endmacro
+
+%macro HADAMARD4_2D 5-6 sumsub
+    HADAMARD2_2D %1, %2, %3, %4, %5, wd
+    HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
+    SWAP %2, %3
+%endmacro
+
+%macro HADAMARD4_2D_SSE 5-6 sumsub
+    HADAMARD  0, sumsub, %1, %2, %5 ; 1st V row 0 + 1
+    HADAMARD  0, sumsub, %3, %4, %5 ; 1st V row 2 + 3
+    SBUTTERFLY   wd, %1, %2, %5     ; %1: m0 1+0 %2: m1 1+0
+    SBUTTERFLY   wd, %3, %4, %5     ; %3: m0 3+2 %4: m1 3+2
+    HADAMARD2_2D %1, %3, %2, %4, %5, dq
+    SBUTTERFLY  qdq, %1, %2, %5
+    HADAMARD  0, %6, %1, %2, %5     ; 2nd H m1/m0 row 0+1
+    SBUTTERFLY  qdq, %3, %4, %5
+    HADAMARD  0, %6, %3, %4, %5     ; 2nd H m1/m0 row 2+3
+%endmacro
+
+%macro HADAMARD8_2D 9-10 sumsub
+    HADAMARD2_2D %1, %2, %3, %4, %9, wd
+    HADAMARD2_2D %5, %6, %7, %8, %9, wd
+    HADAMARD2_2D %1, %3, %2, %4, %9, dq
+    HADAMARD2_2D %5, %7, %6, %8, %9, dq
+    HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
+    HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
+%ifnidn %10, amax
+    SWAP %2, %5
+    SWAP %4, %7
+%endif
 %endmacro
 
 %macro SUMSUB2_AB 3
@@ -197,13 +372,49 @@
     psubw   %3, %2
 %endmacro
 
+%macro SUMSUB2_BA 3
+    mova    m%3, m%1
+    paddw   m%1, m%2
+    paddw   m%1, m%2
+    psubw   m%2, m%3
+    psubw   m%2, m%3
+%endmacro
+
 %macro SUMSUBD2_AB 4
     mova    %4, %1
     mova    %3, %2
     psraw   %2, 1
-    psraw   %4, 1
-    paddw   %1, %2
-    psubw   %4, %3
+    psraw   %1, 1
+    paddw   %2, %4
+    psubw   %1, %3
+%endmacro
+
+%macro DCT4_1D 5
+%ifnum %5
+    SUMSUB_BADC m%4, m%1, m%3, m%2; m%5
+    SUMSUB_BA   m%3, m%4, m%5
+    SUMSUB2_AB  m%1, m%2, m%5
+    SWAP %1, %3, %4, %5, %2
+%else
+    SUMSUB_BADC m%4, m%1, m%3, m%2
+    SUMSUB_BA   m%3, m%4
+    mova       [%5], m%2
+    SUMSUB2_AB m%1, [%5], m%2
+    SWAP %1, %3, %4, %2
+%endif
+%endmacro
+
+%macro IDCT4_1D 5-6
+%ifnum %5
+    SUMSUBD2_AB m%2, m%4, m%6, m%5
+    SUMSUB_BA   m%3, m%1, m%6
+    SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
+%else
+    SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
+    SUMSUB_BA   m%3, m%1
+    SUMSUB_BADC m%4, m%3, m%2, m%1
+%endif
+    SWAP %1, %4, %3
 %endmacro
 
 %macro LOAD_DIFF 5
@@ -222,17 +433,81 @@
 %endif
 %endmacro
 
-%macro LOAD_DIFF_8x4P 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer
-    LOAD_DIFF %1, %5, none, [%7],      [%8]
-    LOAD_DIFF %2, %6, none, [%7+r1],   [%8+r3]
-    LOAD_DIFF %3, %5, none, [%7+2*r1], [%8+2*r3]
-    LOAD_DIFF %4, %6, none, [%7+r4],   [%8+r5]
+%macro LOAD_DIFF8x4_SSE2 8
+    LOAD_DIFF  m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE]
+    LOAD_DIFF  m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE]
+    LOAD_DIFF  m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE]
+    LOAD_DIFF  m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE]
 %endmacro
 
-%macro STORE_DIFF 4
+%macro LOAD_DIFF8x4_SSSE3 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
+    movh       m%2, [%8+%1*FDEC_STRIDE]
+    movh       m%1, [%7+%1*FENC_STRIDE]
+    punpcklbw  m%1, m%2
+    movh       m%3, [%8+%2*FDEC_STRIDE]
+    movh       m%2, [%7+%2*FENC_STRIDE]
+    punpcklbw  m%2, m%3
+    movh       m%4, [%8+%3*FDEC_STRIDE]
+    movh       m%3, [%7+%3*FENC_STRIDE]
+    punpcklbw  m%3, m%4
+    movh       m%5, [%8+%4*FDEC_STRIDE]
+    movh       m%4, [%7+%4*FENC_STRIDE]
+    punpcklbw  m%4, m%5
+    pmaddubsw  m%1, m%6
+    pmaddubsw  m%2, m%6
+    pmaddubsw  m%3, m%6
+    pmaddubsw  m%4, m%6
+%endmacro
+
+%macro STORE_DCT 6
+    movq   [%5+%6+ 0], m%1
+    movq   [%5+%6+ 8], m%2
+    movq   [%5+%6+16], m%3
+    movq   [%5+%6+24], m%4
+    movhps [%5+%6+32], m%1
+    movhps [%5+%6+40], m%2
+    movhps [%5+%6+48], m%3
+    movhps [%5+%6+56], m%4
+%endmacro
+
+%macro STORE_IDCT 4
+    movhps [r0-4*FDEC_STRIDE], %1
+    movh   [r0-3*FDEC_STRIDE], %1
+    movhps [r0-2*FDEC_STRIDE], %2
+    movh   [r0-1*FDEC_STRIDE], %2
+    movhps [r0+0*FDEC_STRIDE], %3
+    movh   [r0+1*FDEC_STRIDE], %3
+    movhps [r0+2*FDEC_STRIDE], %4
+    movh   [r0+3*FDEC_STRIDE], %4
+%endmacro
+
+%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
+    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9]
+    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3]
+    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
+    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5]
+%if %10
+    lea %8, [%8+4*r1]
+    lea %9, [%9+4*r3]
+%endif
+%endmacro
+
+%macro DIFFx2 6-7
+    movh       %3, %5
+    punpcklbw  %3, %4
     psraw      %1, 6
+    paddsw     %1, %3
+    movh       %3, %6
+    punpcklbw  %3, %4
+    psraw      %2, 6
+    paddsw     %2, %3
+    packuswb   %2, %1
+%endmacro
+
+%macro STORE_DIFF 4
     movh       %2, %4
     punpcklbw  %2, %3
+    psraw      %1, 6
     paddsw     %1, %2
     packuswb   %1, %1
     movh       %4, %1
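
Note on the SUMSUB_BA / SUMSUB_BADC changes above: both macros now take an optional scratch register. The two-operand form keeps the old add/double/subtract sequence, while the three-operand form copies the first operand aside and uses a single add and subtract. Hypothetical invocations (not from this revision):

    SUMSUB_BA  m0, m1        ; m0 := m0+m1, m1 := m1-m0 (old m0), no scratch register
    SUMSUB_BA  m0, m1, m2    ; same results, with m2 used as a temporary copy of m0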
