Revision 9f3d6ca4

View differences:

libavcodec/x86/Makefile
 
 MMX-OBJS-$(CONFIG_H264DSP)             += x86/h264dsp_mmx.o
 YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock.o            \
+                                          x86/h264_deblock_10bit.o      \
                                           x86/h264_weight.o             \
                                           x86/h264_idct.o               \
 
libavcodec/x86/dsputil_mmx.c
 {0x8000000080000000ULL, 0x8000000080000000ULL};
 
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_1  ) = 0x0001000100010001ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2  ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3  ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4  ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
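The new ff_pw_2 joins the existing set of 128-bit word constants: the 10-bit code path operates on 16-bit samples, so small rounding constants such as 2 and 4 have to be available splatted across a full XMM register (the assembly below uses them as, e.g., "paddw t3, [pw_2]"). A minimal C-intrinsics sketch of the same idea; the function name is illustrative, not part of the patch:

    #include <emmintrin.h>

    /* add the rounding constant 2 to eight 16-bit samples at once,
     * the intrinsics equivalent of "paddw xmmN, [pw_2]" */
    static __m128i add_round_2(__m128i samples)
    {
        const __m128i pw_2 = _mm_set1_epi16(2); /* 0x0002 repeated, like ff_pw_2 */
        return _mm_add_epi16(samples, pw_2);
    }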
libavcodec/x86/h264_deblock.asm
 ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 %macro DEBLOCK_LUMA 1
-cglobal deblock_v_luma_%1, 5,5,10
+cglobal deblock_v_luma_8_%1, 5,5,10
     movd    m8, [r4] ; tc0
     lea     r4, [r1*3]
     dec     r2d        ; alpha-1
......
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal deblock_h_luma_%1, 5,7
+cglobal deblock_h_luma_8_%1, 5,7
     movsxd r10, r1d
     lea    r11, [r10+r10*2]
     lea    r6,  [r0-4]
......
 %ifdef WIN64
     mov    [rsp+0x20], r4
 %endif
-    call   deblock_v_luma_%1
+    call   deblock_v_luma_8_%1
 
     ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
     add    r6, 2
......
 ;-----------------------------------------------------------------------------
 ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%2_luma_%1, 5,5
+cglobal deblock_%2_luma_8_%1, 5,5
     lea     r4, [r1*3]
     dec     r2     ; alpha-1
     neg     r4
......
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal deblock_h_luma_%1, 0,5
+cglobal deblock_h_luma_8_%1, 0,5
     mov    r0, r0mp
     mov    r3, r1m
     lea    r4, [r3*3]
......
     PUSH   dword r2m
     PUSH   dword 16
     PUSH   dword r0
-    call   deblock_%2_luma_%1
+    call   deblock_%2_luma_8_%1
 %ifidn %2, v8
     add    dword [esp   ], 8 ; pix_tmp+0x38
     add    dword [esp+16], 2 ; tc0+2
-    call   deblock_%2_luma_%1
+    call   deblock_%2_luma_8_%1
 %endif
     ADD    esp, 20
 
......
 ;-----------------------------------------------------------------------------
 ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%2_luma_intra_%1, 4,6,16
+cglobal deblock_%2_luma_intra_8_%1, 4,6,16
 %ifndef ARCH_X86_64
     sub     esp, 0x60
 %endif
......
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_%1, 4,7
+cglobal deblock_h_luma_intra_8_%1, 4,7
     movsxd r10, r1d
     lea    r11, [r10*3]
     lea    r6,  [r0-4]
......
 
     lea    r0,  [pix_tmp+0x40]
     mov    r1,  0x10
-    call   deblock_v_luma_intra_%1
+    call   deblock_v_luma_intra_8_%1
 
     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
     lea    r5, [r6+r11]
......
     add    rsp, 0x88
     RET
 %else
-cglobal deblock_h_luma_intra_%1, 2,4
+cglobal deblock_h_luma_intra_8_%1, 2,4
     lea    r3,  [r1*3]
     sub    r0,  4
     lea    r2,  [r0+r3]
......
     PUSH   dword r2m
     PUSH   dword 16
     PUSH   r0
-    call   deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_8_%1
 %ifidn %2, v8
     add    dword [rsp], 8 ; pix_tmp+8
-    call   deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_8_%1
 %endif
     ADD    esp, 16
 
......
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_mmxext, 5,6
+cglobal deblock_v_chroma_8_mmxext, 5,6
     CHROMA_V_START
     movq  m0, [t5]
     movq  m1, [t5+r1]
......
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_mmxext, 5,7
+cglobal deblock_h_chroma_8_mmxext, 5,7
 %ifdef ARCH_X86_64
     %define buf0 [rsp-24]
     %define buf1 [rsp-16]
......
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra_mmxext, 4,5
+cglobal deblock_v_chroma_intra_8_mmxext, 4,5
     CHROMA_V_START
     movq  m0, [t5]
     movq  m1, [t5+r1]
......
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra_mmxext, 4,6
+cglobal deblock_h_chroma_intra_8_mmxext, 4,6
     CHROMA_H_START
     TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     call ff_chroma_intra_body_mmxext
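As the comments in this file note ("transpose 16x4 -> original space"), every deblock_h_* entry point works by transposing the pixels around the vertical edge into a small temporary block, running the corresponding deblock_v_* filter on it, and transposing back. A rough C sketch of that strategy; the helper below and its buffer layout are illustrative stand-ins, not the code in this patch:

    #include <stdint.h>

    /* illustrative: filter a vertical edge by reusing a horizontal-edge filter */
    static void deblock_h_via_v(uint8_t *pix, int stride, int alpha, int beta,
                                int8_t *tc0,
                                void (*deblock_v)(uint8_t *, int, int, int, int8_t *))
    {
        uint8_t tmp[8 * 16]; /* the 8 columns p3..q3 around the edge, 16 rows */
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 8; x++)
                tmp[x * 16 + y] = pix[y * stride + x - 4]; /* transpose in */

        deblock_v(tmp + 4 * 16, 16, alpha, beta, tc0);     /* edge is now horizontal */

        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 8; x++)
                pix[y * stride + x - 4] = tmp[x * 16 + y]; /* transpose back */
    }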
libavcodec/x86/h264_deblock_10bit.asm
1
;*****************************************************************************
2
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
3
;*****************************************************************************
4
;* Copyright (C) 2005-2011 x264 project
5
;*
6
;* Authors: Oskar Arvidsson <oskar@irock.se>
7
;*          Loren Merritt <lorenm@u.washington.edu>
8
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
9
;*
10
;* This file is part of Libav.
11
;*
12
;* Libav is free software; you can redistribute it and/or
13
;* modify it under the terms of the GNU Lesser General Public
14
;* License as published by the Free Software Foundation; either
15
;* version 2.1 of the License, or (at your option) any later version.
16
;*
17
;* Libav is distributed in the hope that it will be useful,
18
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
;* Lesser General Public License for more details.
21
;*
22
;* You should have received a copy of the GNU Lesser General Public
23
;* License along with Libav; if not, write to the Free Software
24
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
;******************************************************************************
26

  
27
%include "x86inc.asm"
28
%include "x86util.asm"
29

  
30
SECTION_RODATA
31

  
32
pw_pixel_max: times 8 dw ((1 << 10)-1)
33

  
34
SECTION .text
35

  
36
cextern pw_2
37
cextern pw_4
38

  
39
; out: %4 = |%1-%2|-%3
40
; clobbers: %5
41
%macro ABS_SUB 5
42
    psubusw %5, %2, %1
43
    psubusw %4, %1, %2
44
    por     %4, %5
45
    psubw   %4, %3
46
%endmacro
47

  
48
; out: %4 = |%1-%2|<%3
49
%macro DIFF_LT   5
50
    psubusw %4, %2, %1
51
    psubusw %5, %1, %2
52
    por     %5, %4 ; |%1-%2|
53
    pxor    %4, %4
54
    psubw   %5, %3 ; |%1-%2|-%3
55
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
56
%endmacro
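DIFF_LT builds an unsigned |a-b| < c test out of saturating subtractions, since SSE2 has no unsigned word compare: psubusw in both directions, por, then a signed compare against c. A scalar C model of one 16-bit lane (helper name is illustrative):

    #include <stdint.h>

    /* one lane of DIFF_LT: all-ones when |a - b| < c, zero otherwise */
    static uint16_t diff_lt(uint16_t a, uint16_t b, uint16_t c)
    {
        uint16_t d = (a > b ? a - b : 0) | (b > a ? b - a : 0); /* psubusw + por = |a-b| */
        return (int16_t)(d - c) < 0 ? 0xFFFF : 0;               /* psubw + pcmpgtw     */
    }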
57

  
58
%macro LOAD_AB 4
59
    movd       %1, %3
60
    movd       %2, %4
61
    SPLATW     %1, %1
62
    SPLATW     %2, %2
63
%endmacro
64

  
65
; in:  %2=tc reg
66
; out: %1=splatted tc
67
%macro LOAD_TC 2
68
    movd        %1, [%2]
69
    punpcklbw   %1, %1
70
%if mmsize == 8
71
    pshufw      %1, %1, 0
72
%else
73
    pshuflw     %1, %1, 01010000b
74
    pshufd      %1, %1, 01010000b
75
%endif
76
    psraw       %1, 6
77
%endmacro
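LOAD_TC both splats the four per-partition tc0 bytes to words and rescales them for 10-bit: interleaving a byte b with itself (punpcklbw) yields the word b*257, and the arithmetic shift right by 6 then gives 4*b for any b below 64, while the "skip this edge" value of -1 (0xFF) comes out as -1 again. This matches the required tc scaling of 1 << (bit_depth - 8). A scalar C model (illustrative name; an arithmetic right shift is assumed, as psraw provides):

    #include <stdint.h>

    /* what LOAD_TC computes per tc0 byte: tc0 * 4, with tc0 == -1 preserved */
    static int scale_tc0(int8_t tc0)
    {
        uint8_t b = (uint8_t)tc0;
        int16_t w = (int16_t)((b << 8) | b); /* punpcklbw b,b: the word b * 257 */
        return w >> 6;                       /* psraw 6: 4*tc0 for 0 <= tc0 < 64, -1 -> -1 */
    }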
78

  
79
; in: %1=p1, %2=p0, %3=q0, %4=q1
80
;     %5=alpha, %6=beta, %7-%9=tmp
81
; out: %7=mask
82
%macro LOAD_MASK 9
83
    ABS_SUB     %2, %3, %5, %8, %7 ; |p0-q0| - alpha
84
    ABS_SUB     %1, %2, %6, %9, %7 ; |p1-p0| - beta
85
    pand        %8, %9
86
    ABS_SUB     %3, %4, %6, %9, %7 ; |q1-q0| - beta
87
    pxor        %7, %7
88
    pand        %8, %9
89
    pcmpgtw     %7, %8
90
%endmacro
91

  
92
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
93
; out: %1=p0', m2=q0'
94
%macro DEBLOCK_P0_Q0 7
95
    psubw   %3, %4
96
    pxor    %7, %7
97
    paddw   %3, [pw_4]
98
    psubw   %7, %5
99
    psubw   %6, %2, %1
100
    psllw   %6, 2
101
    paddw   %3, %6
102
    psraw   %3, 3
103
    mova    %6, [pw_pixel_max]
104
    CLIPW   %3, %7, %5
105
    pxor    %7, %7
106
    paddw   %1, %3
107
    psubw   %2, %3
108
    CLIPW   %1, %7, %6
109
    CLIPW   %2, %7, %6
110
%endmacro
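DEBLOCK_P0_Q0 is the normal, tc-driven update of the two samples adjacent to the edge. Ignoring the per-sample mask (which simply forces tc to 0 where filtering is disabled), it corresponds to the standard H.264 formula, clipped to the 10-bit sample range; a scalar sketch, not the project's code:

    /* pixel_max is (1 << 10) - 1 = 1023, the value in pw_pixel_max above */
    static void filter_p0_q0(int *p0, int *q0, int p1, int q1, int tc)
    {
        int delta = (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3;
        if (delta < -tc) delta = -tc;
        if (delta >  tc) delta =  tc;
        *p0 += delta; if (*p0 < 0) *p0 = 0; if (*p0 > 1023) *p0 = 1023;
        *q0 -= delta; if (*q0 < 0) *q0 = 0; if (*q0 > 1023) *q0 = 1023;
    }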
111

  
112
; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
113
%macro LUMA_Q1 6
114
    pavgw       %6, %3, %4      ; (p0+q0+1)>>1
115
    paddw       %1, %6
116
    pxor        %6, %6
117
    psraw       %1, 1
118
    psubw       %6, %5
119
    psubw       %1, %2
120
    CLIPW       %1, %6, %5
121
    paddw       %1, %2
122
%endmacro
123

  
124
%macro LUMA_DEBLOCK_ONE 3
125
    DIFF_LT     m5, %1, bm, m4, m6
126
    pxor        m6, m6
127
    mova        %3, m4
128
    pcmpgtw     m6, tcm
129
    pand        m4, tcm
130
    pandn       m6, m7
131
    pand        m4, m6
132
    LUMA_Q1 m5, %2, m1, m2, m4, m6
133
%endmacro
134

  
135
%macro LUMA_H_STORE 2
136
%if mmsize == 8
137
    movq        [r0-4], m0
138
    movq        [r0+r1-4], m1
139
    movq        [r0+r1*2-4], m2
140
    movq        [r0+%2-4], m3
141
%else
142
    movq        [r0-4], m0
143
    movhps      [r0+r1-4], m0
144
    movq        [r0+r1*2-4], m1
145
    movhps      [%1-4], m1
146
    movq        [%1+r1-4], m2
147
    movhps      [%1+r1*2-4], m2
148
    movq        [%1+%2-4], m3
149
    movhps      [%1+r1*4-4], m3
150
%endif
151
%endmacro
152

  
153
%macro DEBLOCK_LUMA 1
154
;-----------------------------------------------------------------------------
155
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
156
;-----------------------------------------------------------------------------
157
cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
158
    %assign pad 5*mmsize+12-(stack_offset&15)
159
    %define tcm [rsp]
160
    %define ms1 [rsp+mmsize]
161
    %define ms2 [rsp+mmsize*2]
162
    %define am  [rsp+mmsize*3]
163
    %define bm  [rsp+mmsize*4]
164
    SUB        rsp, pad
165
    shl        r2d, 2
166
    shl        r3d, 2
167
    LOAD_AB     m4, m5, r2, r3
168
    mov         r3, 32/mmsize
169
    mov         r2, r0
170
    sub         r0, r1
171
    mova        am, m4
172
    sub         r0, r1
173
    mova        bm, m5
174
    sub         r0, r1
175
.loop:
176
    mova        m0, [r0+r1]
177
    mova        m1, [r0+r1*2]
178
    mova        m2, [r2]
179
    mova        m3, [r2+r1]
180

  
181
    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
182
    LOAD_TC     m6, r4
183
    mova       tcm, m6
184

  
185
    mova        m5, [r0]
186
    LUMA_DEBLOCK_ONE m1, m0, ms1
187
    mova   [r0+r1], m5
188

  
189
    mova        m5, [r2+r1*2]
190
    LUMA_DEBLOCK_ONE m2, m3, ms2
191
    mova   [r2+r1], m5
192

  
193
    pxor        m5, m5
194
    mova        m6, tcm
195
    pcmpgtw     m5, tcm
196
    psubw       m6, ms1
197
    pandn       m5, m7
198
    psubw       m6, ms2
199
    pand        m5, m6
200
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
201
    mova [r0+r1*2], m1
202
    mova      [r2], m2
203

  
204
    add         r0, mmsize
205
    add         r2, mmsize
206
    add         r4, mmsize/8
207
    dec         r3
208
    jg .loop
209
    ADD         rsp, pad
210
    RET
211

  
212
cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
213
    %assign pad 7*mmsize+12-(stack_offset&15)
214
    %define tcm [rsp]
215
    %define ms1 [rsp+mmsize]
216
    %define ms2 [rsp+mmsize*2]
217
    %define p1m [rsp+mmsize*3]
218
    %define p2m [rsp+mmsize*4]
219
    %define am  [rsp+mmsize*5]
220
    %define bm  [rsp+mmsize*6]
221
    SUB        rsp, pad
222
    shl        r2d, 2
223
    shl        r3d, 2
224
    LOAD_AB     m4, m5, r2, r3
225
    mov         r3, r1
226
    mova        am, m4
227
    add         r3, r1
228
    mov         r5, 32/mmsize
229
    mova        bm, m5
230
    add         r3, r1
231
%if mmsize == 16
232
    mov         r2, r0
233
    add         r2, r3
234
%endif
235
.loop:
236
%if mmsize == 8
237
    movq        m2, [r0-8]     ; y q2 q1 q0
238
    movq        m7, [r0+0]
239
    movq        m5, [r0+r1-8]
240
    movq        m3, [r0+r1+0]
241
    movq        m0, [r0+r1*2-8]
242
    movq        m6, [r0+r1*2+0]
243
    movq        m1, [r0+r3-8]
244
    TRANSPOSE4x4W 2, 5, 0, 1, 4
245
    SWAP         2, 7
246
    movq        m7, [r0+r3]
247
    TRANSPOSE4x4W 2, 3, 6, 7, 4
248
%else
249
    movu        m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
250
    movu        m0, [r0+r1-8]
251
    movu        m2, [r0+r1*2-8]
252
    movu        m3, [r2-8]
253
    TRANSPOSE4x4W 5, 0, 2, 3, 6
254
    mova       tcm, m3
255

  
256
    movu        m4, [r2+r1-8]
257
    movu        m1, [r2+r1*2-8]
258
    movu        m3, [r2+r3-8]
259
    movu        m7, [r2+r1*4-8]
260
    TRANSPOSE4x4W 4, 1, 3, 7, 6
261

  
262
    mova        m6, tcm
263
    punpcklqdq  m6, m7
264
    punpckhqdq  m5, m4
265
    SBUTTERFLY qdq, 0, 1, 7
266
    SBUTTERFLY qdq, 2, 3, 7
267
%endif
268

  
269
    mova       p2m, m6
270
    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
271
    LOAD_TC     m6, r4
272
    mova       tcm, m6
273

  
274
    LUMA_DEBLOCK_ONE m1, m0, ms1
275
    mova       p1m, m5
276

  
277
    mova        m5, p2m
278
    LUMA_DEBLOCK_ONE m2, m3, ms2
279
    mova       p2m, m5
280

  
281
    pxor        m5, m5
282
    mova        m6, tcm
283
    pcmpgtw     m5, tcm
284
    psubw       m6, ms1
285
    pandn       m5, m7
286
    psubw       m6, ms2
287
    pand        m5, m6
288
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
289
    mova        m0, p1m
290
    mova        m3, p2m
291
    TRANSPOSE4x4W 0, 1, 2, 3, 4
292
    LUMA_H_STORE r2, r3
293

  
294
    add         r4, mmsize/8
295
    lea         r0, [r0+r1*(mmsize/2)]
296
    lea         r2, [r2+r1*(mmsize/2)]
297
    dec         r5
298
    jg .loop
299
    ADD        rsp, pad
300
    RET
301
%endmacro
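One detail of both entry points above: the "shl r2d, 2" / "shl r3d, 2" prologue rescales the alpha and beta thresholds, which appear to arrive in their 8-bit-table range, up to the 10-bit sample range, consistent with the tc scaling performed by LOAD_TC:

    alpha(10-bit) = alpha(8-bit) << (bit_depth - 8) = alpha << 2
    beta(10-bit)  = beta(8-bit)  << (bit_depth - 8) = beta  << 2
    tc(10-bit)    = tc0 << 2                          (done by LOAD_TC)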
302

  
303
INIT_XMM
304
%ifdef ARCH_X86_64
305
; in:  m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
306
;      m12=alpha, m13=beta
307
; out: m0=p1', m3=q1', m1=p0', m2=q0'
308
; clobbers: m4, m5, m6, m7, m10, m11, m14
309
%macro DEBLOCK_LUMA_INTER_SSE2 0
310
    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
311
    LOAD_TC     m6, r4
312
    DIFF_LT     m8, m1, m13, m10, m4
313
    DIFF_LT     m9, m2, m13, m11, m4
314
    pand        m6, m7
315

  
316
    mova       m14, m6
317
    pxor        m4, m4
318
    pcmpgtw     m6, m4
319
    pand        m6, m14
320

  
321
    mova        m5, m10
322
    pand        m5, m6
323
    LUMA_Q1 m8, m0, m1, m2, m5, m4
324

  
325
    mova        m5, m11
326
    pand        m5, m6
327
    LUMA_Q1 m9, m3, m1, m2, m5, m4
328

  
329
    pxor        m4, m4
330
    psubw       m6, m10
331
    pcmpgtw     m4, m14
332
    pandn       m4, m7
333
    psubw       m6, m11
334
    pand        m4, m6
335
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
336

  
337
    SWAP         0, 8
338
    SWAP         3, 9
339
%endmacro
340

  
341
%macro DEBLOCK_LUMA_64 1
342
cglobal deblock_v_luma_10_%1, 5,5,15
343
    %define p2 m8
344
    %define p1 m0
345
    %define p0 m1
346
    %define q0 m2
347
    %define q1 m3
348
    %define q2 m9
349
    %define mask0 m7
350
    %define mask1 m10
351
    %define mask2 m11
352
    shl        r2d, 2
353
    shl        r3d, 2
354
    LOAD_AB    m12, m13, r2, r3
355
    mov         r2, r0
356
    sub         r0, r1
357
    sub         r0, r1
358
    sub         r0, r1
359
    mov         r3, 2
360
.loop:
361
    mova        p2, [r0]
362
    mova        p1, [r0+r1]
363
    mova        p0, [r0+r1*2]
364
    mova        q0, [r2]
365
    mova        q1, [r2+r1]
366
    mova        q2, [r2+r1*2]
367
    DEBLOCK_LUMA_INTER_SSE2
368
    mova   [r0+r1], p1
369
    mova [r0+r1*2], p0
370
    mova      [r2], q0
371
    mova   [r2+r1], q1
372
    add         r0, mmsize
373
    add         r2, mmsize
374
    add         r4, 2
375
    dec         r3
376
    jg .loop
377
    REP_RET
378

  
379
cglobal deblock_h_luma_10_%1, 5,7,15
380
    shl        r2d, 2
381
    shl        r3d, 2
382
    LOAD_AB    m12, m13, r2, r3
383
    mov         r2, r1
384
    add         r2, r1
385
    add         r2, r1
386
    mov         r5, r0
387
    add         r5, r2
388
    mov         r6, 2
389
.loop:
390
    movu        m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
391
    movu        m0, [r0+r1-8]
392
    movu        m2, [r0+r1*2-8]
393
    movu        m9, [r5-8]
394
    movu        m5, [r5+r1-8]
395
    movu        m1, [r5+r1*2-8]
396
    movu        m3, [r5+r2-8]
397
    movu        m7, [r5+r1*4-8]
398

  
399
    TRANSPOSE4x4W 8, 0, 2, 9, 10
400
    TRANSPOSE4x4W 5, 1, 3, 7, 10
401

  
402
    punpckhqdq  m8, m5
403
    SBUTTERFLY qdq, 0, 1, 10
404
    SBUTTERFLY qdq, 2, 3, 10
405
    punpcklqdq  m9, m7
406

  
407
    DEBLOCK_LUMA_INTER_SSE2
408

  
409
    TRANSPOSE4x4W 0, 1, 2, 3, 4
410
    LUMA_H_STORE r5, r2
411
    add         r4, 2
412
    lea         r0, [r0+r1*8]
413
    lea         r5, [r5+r1*8]
414
    dec         r6
415
    jg .loop
416
    REP_RET
417
%endmacro
418

  
419
INIT_XMM
420
DEBLOCK_LUMA_64 sse2
421
INIT_AVX
422
DEBLOCK_LUMA_64 avx
423
%endif
424

  
425
%macro SWAPMOVA 2
426
%ifid %1
427
    SWAP %1, %2
428
%else
429
    mova %1, %2
430
%endif
431
%endmacro
432

  
433
; in: t0-t2: tmp registers
434
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
435
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
436
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
437
%ifdef ARCH_X86_64
438
    paddw     t0, %3, %2
439
    mova      t2, %4
440
    paddw     t2, %3
441
%else
442
    mova      t0, %3
443
    mova      t2, %4
444
    paddw     t0, %2
445
    paddw     t2, %3
446
%endif
447
    paddw     t0, %1
448
    paddw     t2, t2
449
    paddw     t0, %5
450
    paddw     t2, %9
451
    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
452
    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
453

  
454
    psrlw     t2, 3
455
    psrlw     t1, t0, 2
456
    psubw     t2, %3
457
    psubw     t1, %2
458
    pand      t2, %8
459
    pand      t1, %8
460
    paddw     t2, %3
461
    paddw     t1, %2
462
    SWAPMOVA %11, t1
463

  
464
    psubw     t1, t0, %3
465
    paddw     t0, t0
466
    psubw     t1, %5
467
    psubw     t0, %3
468
    paddw     t1, %6
469
    paddw     t1, %2
470
    paddw     t0, %6
471
    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
472
    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
473

  
474
    pxor      t0, t1
475
    pxor      t1, %1
476
    pand      t0, %8
477
    pand      t1, %7
478
    pxor      t0, t1
479
    pxor      t0, %1
480
    SWAPMOVA %10, t0
481
    SWAPMOVA %12, t2
482
%endmacro
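Spelled out, LUMA_INTRA_P012 evaluates the strong intra filter on one side of the edge; with the comments above filled in, the values it produces are:

    p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3    when the strong condition holds
    p1' = (p2 + p1 + p0 + q0 + 2) >> 2
    p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    p0' = (2*p1 + p0 + q1 + 2) >> 2                   when only the basic condition holds
                                                      (p1 and p2 then stay unchanged)

Here "strong" means the basic filter mask together with |p0-q0| < alpha/4 + 2 and |p2-p0| < beta (the mask1p input, as set up later by LUMA_INTRA_INTER); samples where no condition holds are left untouched, and the q side is produced by the symmetric invocation.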
483

  
484
%macro LUMA_INTRA_INIT 1
485
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
486
    %define t0 m4
487
    %define t1 m5
488
    %define t2 m6
489
    %define t3 m7
490
    %assign i 4
491
%rep %1
492
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
493
    %assign i i+1
494
%endrep
495
    SUB    rsp, pad
496
%endmacro
497

  
498
; in: %1-%3=tmp, %4=p2, %5=q2
499
%macro LUMA_INTRA_INTER 5
500
    LOAD_AB t0, t1, r2d, r3d
501
    mova    %1, t0
502
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
503
%ifdef ARCH_X86_64
504
    mova    %2, t0        ; mask0
505
    psrlw   t3, %1, 2
506
%else
507
    mova    t3, %1
508
    mova    %2, t0        ; mask0
509
    psrlw   t3, 2
510
%endif
511
    paddw   t3, [pw_2]    ; alpha/4+2
512
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
513
    pand    t2, %2
514
    mova    t3, %5        ; q2
515
    mova    %1, t2        ; mask1
516
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
517
    pand    t2, %1
518
    mova    t3, %4        ; p2
519
    mova    %3, t2        ; mask1q
520
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
521
    pand    t2, %1
522
    mova    %1, t2        ; mask1p
523
%endmacro
524

  
525
%macro LUMA_H_INTRA_LOAD 0
526
%if mmsize == 8
527
    movu    t0, [r0-8]
528
    movu    t1, [r0+r1-8]
529
    movu    m0, [r0+r1*2-8]
530
    movu    m1, [r0+r4-8]
531
    TRANSPOSE4x4W 4, 5, 0, 1, 2
532
    mova    t4, t0        ; p3
533
    mova    t5, t1        ; p2
534

  
535
    movu    m2, [r0]
536
    movu    m3, [r0+r1]
537
    movu    t0, [r0+r1*2]
538
    movu    t1, [r0+r4]
539
    TRANSPOSE4x4W 2, 3, 4, 5, 6
540
    mova    t6, t0        ; q2
541
    mova    t7, t1        ; q3
542
%else
543
    movu    t0, [r0-8]
544
    movu    t1, [r0+r1-8]
545
    movu    m0, [r0+r1*2-8]
546
    movu    m1, [r0+r5-8]
547
    movu    m2, [r4-8]
548
    movu    m3, [r4+r1-8]
549
    movu    t2, [r4+r1*2-8]
550
    movu    t3, [r4+r5-8]
551
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
552
    mova    t4, t0        ; p3
553
    mova    t5, t1        ; p2
554
    mova    t6, t2        ; q2
555
    mova    t7, t3        ; q3
556
%endif
557
%endmacro
558

  
559
; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
560
%macro LUMA_H_INTRA_STORE 9
561
%if mmsize == 8
562
    TRANSPOSE4x4W %1, %2, %3, %4, %9
563
    movq       [r0-8], m%1
564
    movq       [r0+r1-8], m%2
565
    movq       [r0+r1*2-8], m%3
566
    movq       [r0+r4-8], m%4
567
    movq       m%1, %8
568
    TRANSPOSE4x4W %5, %6, %7, %1, %9
569
    movq       [r0], m%5
570
    movq       [r0+r1], m%6
571
    movq       [r0+r1*2], m%7
572
    movq       [r0+r4], m%1
573
%else
574
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
575
    movq       [r0-8], m%1
576
    movq       [r0+r1-8], m%2
577
    movq       [r0+r1*2-8], m%3
578
    movq       [r0+r5-8], m%4
579
    movhps     [r4-8], m%1
580
    movhps     [r4+r1-8], m%2
581
    movhps     [r4+r1*2-8], m%3
582
    movhps     [r4+r5-8], m%4
583
%ifnum %8
584
    SWAP       %1, %8
585
%else
586
    mova       m%1, %8
587
%endif
588
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
589
    movq       [r0], m%5
590
    movq       [r0+r1], m%6
591
    movq       [r0+r1*2], m%7
592
    movq       [r0+r5], m%1
593
    movhps     [r4], m%5
594
    movhps     [r4+r1], m%6
595
    movhps     [r4+r1*2], m%7
596
    movhps     [r4+r5], m%1
597
%endif
598
%endmacro
599

  
600
%ifdef ARCH_X86_64
601
;-----------------------------------------------------------------------------
602
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
603
;-----------------------------------------------------------------------------
604
%macro DEBLOCK_LUMA_INTRA_64 1
605
cglobal deblock_v_luma_intra_10_%1, 4,7,16
606
    %define t0 m1
607
    %define t1 m2
608
    %define t2 m4
609
    %define p2 m8
610
    %define p1 m9
611
    %define p0 m10
612
    %define q0 m11
613
    %define q1 m12
614
    %define q2 m13
615
    %define aa m5
616
    %define bb m14
617
    lea     r4, [r1*4]
618
    lea     r5, [r1*3] ; 3*stride
619
    neg     r4
620
    add     r4, r0     ; pix-4*stride
621
    mov     r6, 2
622
    mova    m0, [pw_2]
623
    shl    r2d, 2
624
    shl    r3d, 2
625
    LOAD_AB aa, bb, r2d, r3d
626
.loop
627
    mova    p2, [r4+r1]
628
    mova    p1, [r4+2*r1]
629
    mova    p0, [r4+r5]
630
    mova    q0, [r0]
631
    mova    q1, [r0+r1]
632
    mova    q2, [r0+2*r1]
633

  
634
    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
635
    mova    t2, aa
636
    psrlw   t2, 2
637
    paddw   t2, m0 ; alpha/4+2
638
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
639
    DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta
640
    DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta
641
    pand    m6, m3
642
    pand    m7, m6
643
    pand    m6, t1
644
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
645
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
646
    add     r0, mmsize
647
    add     r4, mmsize
648
    dec     r6
649
    jg .loop
650
    REP_RET
651

  
652
;-----------------------------------------------------------------------------
653
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
654
;-----------------------------------------------------------------------------
655
cglobal deblock_h_luma_intra_10_%1, 4,7,16
656
    %define t0 m15
657
    %define t1 m14
658
    %define t2 m2
659
    %define q3 m5
660
    %define q2 m8
661
    %define q1 m9
662
    %define q0 m10
663
    %define p0 m11
664
    %define p1 m12
665
    %define p2 m13
666
    %define p3 m4
667
    %define spill [rsp]
668
    %assign pad 24-(stack_offset&15)
669
    SUB     rsp, pad
670
    lea     r4, [r1*4]
671
    lea     r5, [r1*3] ; 3*stride
672
    add     r4, r0     ; pix+4*stride
673
    mov     r6, 2
674
    mova    m0, [pw_2]
675
    shl    r2d, 2
676
    shl    r3d, 2
677
.loop
678
    movu    q3, [r0-8]
679
    movu    q2, [r0+r1-8]
680
    movu    q1, [r0+r1*2-8]
681
    movu    q0, [r0+r5-8]
682
    movu    p0, [r4-8]
683
    movu    p1, [r4+r1-8]
684
    movu    p2, [r4+r1*2-8]
685
    movu    p3, [r4+r5-8]
686
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
687

  
688
    LOAD_AB m1, m2, r2d, r3d
689
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
690
    psrlw   m1, 2
691
    paddw   m1, m0 ; alpha/4+2
692
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
693
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
694
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
695
    pand    m6, m3
696
    pand    m7, m6
697
    pand    m6, t1
698

  
699
    mova spill, q3
700
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
701
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
702
    mova    m7, spill
703

  
704
    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
705

  
706
    lea     r0, [r0+r1*8]
707
    lea     r4, [r4+r1*8]
708
    dec     r6
709
    jg .loop
710
    ADD    rsp, pad
711
    RET
712
%endmacro
713

  
714
INIT_XMM
715
DEBLOCK_LUMA_INTRA_64 sse2
716
INIT_AVX
717
DEBLOCK_LUMA_INTRA_64 avx
718

  
719
%endif
720

  
721
%macro DEBLOCK_LUMA_INTRA 1
722
;-----------------------------------------------------------------------------
723
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
724
;-----------------------------------------------------------------------------
725
cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
726
    LUMA_INTRA_INIT 3
727
    lea     r4, [r1*4]
728
    lea     r5, [r1*3]
729
    neg     r4
730
    add     r4, r0
731
    mov     r6, 32/mmsize
732
    shl    r2d, 2
733
    shl    r3d, 2
734
.loop:
735
    mova    m0, [r4+r1*2] ; p1
736
    mova    m1, [r4+r5]   ; p0
737
    mova    m2, [r0]      ; q0
738
    mova    m3, [r0+r1]   ; q1
739
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
740
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
741
    mova    t3, [r0+r1*2] ; q2
742
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
743
    add     r0, mmsize
744
    add     r4, mmsize
745
    dec     r6
746
    jg .loop
747
    ADD    rsp, pad
748
    RET
749

  
750
;-----------------------------------------------------------------------------
751
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
752
;-----------------------------------------------------------------------------
753
cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
754
    LUMA_INTRA_INIT 8
755
%if mmsize == 8
756
    lea     r4, [r1*3]
757
    mov     r5, 32/mmsize
758
%else
759
    lea     r4, [r1*4]
760
    lea     r5, [r1*3] ; 3*stride
761
    add     r4, r0     ; pix+4*stride
762
    mov     r6, 32/mmsize
763
%endif
764
    shl    r2d, 2
765
    shl    r3d, 2
766
.loop:
767
    LUMA_H_INTRA_LOAD
768
    LUMA_INTRA_INTER t8, t9, t10, t5, t6
769

  
770
    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
771
    mova    t3, t6     ; q2
772
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
773

  
774
    mova    m2, t4
775
    mova    m0, t11
776
    mova    m1, t5
777
    mova    m3, t8
778
    mova    m6, t6
779

  
780
    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
781

  
782
    lea     r0, [r0+r1*(mmsize/2)]
783
%if mmsize == 8
784
    dec     r5
785
%else
786
    lea     r4, [r4+r1*(mmsize/2)]
787
    dec     r6
788
%endif
789
    jg .loop
790
    ADD    rsp, pad
791
    RET
792
%endmacro
793

  
794
%ifndef ARCH_X86_64
795
INIT_MMX
796
DEBLOCK_LUMA mmxext
797
DEBLOCK_LUMA_INTRA mmxext
798
INIT_XMM
799
DEBLOCK_LUMA sse2
800
DEBLOCK_LUMA_INTRA sse2
801
INIT_AVX
802
DEBLOCK_LUMA avx
803
DEBLOCK_LUMA_INTRA avx
804
%endif
libavcodec/x86/h264dsp_mmx.c
     );
 }
 
-#define LF_FUNC(DIR, TYPE, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
-                                                  int alpha, int beta, int8_t *tc0);
-#define LF_IFUNC(DIR, TYPE, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
-                                                  int alpha, int beta);
-
-LF_FUNC (h,  chroma,       mmxext)
-LF_IFUNC(h,  chroma_intra, mmxext)
-LF_FUNC (v,  chroma,       mmxext)
-LF_IFUNC(v,  chroma_intra, mmxext)
-
-LF_FUNC (h,  luma,         mmxext)
-LF_IFUNC(h,  luma_intra,   mmxext)
-#if HAVE_YASM && ARCH_X86_32
-LF_FUNC (v8, luma,         mmxext)
-static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
+                                                                int alpha, int beta, int8_t *tc0);
+#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
+                                                                int alpha, int beta);
+
+#define LF_FUNCS(type, depth)\
+LF_FUNC (h,  chroma,       depth, mmxext)\
+LF_IFUNC(h,  chroma_intra, depth, mmxext)\
+LF_FUNC (v,  chroma,       depth, mmxext)\
+LF_IFUNC(v,  chroma_intra, depth, mmxext)\
+LF_FUNC (h,  luma,         depth, mmxext)\
+LF_IFUNC(h,  luma_intra,   depth, mmxext)\
+LF_FUNC (h,  luma,         depth, sse2)\
+LF_IFUNC(h,  luma_intra,   depth, sse2)\
+LF_FUNC (v,  luma,         depth, sse2)\
+LF_IFUNC(v,  luma_intra,   depth, sse2)\
+LF_FUNC (h,  luma,         depth,  avx)\
+LF_IFUNC(h,  luma_intra,   depth,  avx)\
+LF_FUNC (v,  luma,         depth,  avx)\
+LF_IFUNC(v,  luma_intra,   depth,  avx)
+
+LF_FUNCS( uint8_t,  8)
+LF_FUNCS(uint16_t, 10)
+
+LF_FUNC (v8, luma,             8, mmxext)
+static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
     if((tc0[0] & tc0[1]) >= 0)
-        ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
+        ff_deblock_v8_luma_8_mmxext(pix+0, stride, alpha, beta, tc0);
     if((tc0[2] & tc0[3]) >= 0)
-        ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
+        ff_deblock_v8_luma_8_mmxext(pix+8, stride, alpha, beta, tc0+2);
 }
-LF_IFUNC(v8, luma_intra,   mmxext)
-static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
+LF_IFUNC(v8, luma_intra,        8, mmxext)
+static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, int alpha, int beta)
 {
-    ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
-    ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmxext(pix+0, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmxext(pix+8, stride, alpha, beta);
 }
-#endif
 
-LF_FUNC (h,  luma,         sse2)
-LF_IFUNC(h,  luma_intra,   sse2)
-LF_FUNC (v,  luma,         sse2)
-LF_IFUNC(v,  luma_intra,   sse2)
-LF_FUNC (h,  luma,         avx)
-LF_IFUNC(h,  luma_intra,   avx)
-LF_FUNC (v,  luma,         avx)
-LF_IFUNC(v,  luma_intra,   avx)
+LF_FUNC (v,  luma,            10, mmxext)
+LF_IFUNC(v,  luma_intra,      10, mmxext)
 
 /***********************************/
 /* weighted prediction */
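With the added DEPTH parameter, one LF_FUNCS(type, depth) invocation now declares the whole per-depth family of prototypes, and the generated names line up with the new *_8_* / *_10_* labels in the assembly. For example, LF_FUNC(v, luma, 10, sse2) expands to:

    void ff_deblock_v_luma_10_sse2(uint8_t *pix, int stride,
                                   int alpha, int beta, int8_t *tc0);

The init code further down then installs either the *_8_* or the *_10_* variants depending on bit_depth and the detected CPU flags.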
......
             c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
             c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
 
-            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_mmxext;
-            c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_mmxext;
-            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_mmxext;
-            c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_mmxext;
+            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext;
+            c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext;
+            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmxext;
+            c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext;
 #if ARCH_X86_32
-            c->h264_v_loop_filter_luma= ff_deblock_v_luma_mmxext;
-            c->h264_h_loop_filter_luma= ff_deblock_h_luma_mmxext;
-            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext;
-            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext;
+            c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmxext;
+            c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmxext;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
 #endif
             c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
             c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
......
                 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
 
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_sse2;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_sse2;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2;
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
 #endif
 
                 c->h264_idct_add16 = ff_h264_idct_add16_sse2;
......
             }
             if (mm_flags&AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma = ff_deblock_v_luma_avx;
-                c->h264_h_loop_filter_luma = ff_deblock_h_luma_avx;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx;
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
+#endif
+            }
+        }
+    }
+#endif
+    } else if (bit_depth == 10) {
+#if HAVE_YASM
+    if (mm_flags & AV_CPU_FLAG_MMX) {
+        if (mm_flags & AV_CPU_FLAG_MMX2) {
+#if ARCH_X86_32
+            c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext;
+            c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
+#endif
+            if (mm_flags&AV_CPU_FLAG_SSE2) {
+#if HAVE_ALIGNED_STACK
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
+#endif
+            }
+            if (mm_flags&AV_CPU_FLAG_AVX) {
+#if HAVE_ALIGNED_STACK
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
 #endif
             }
         }
libavcodec/x86/x86util.asm
     pshufw     %1, %2, (%3)*0x55
 %endif
 %endmacro
+
+%macro CLIPW 3 ;(dst, min, max)
+    pmaxsw %1, %2
+    pminsw %1, %3
+%endmacro
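CLIPW is the word-wise clamp the 10-bit filter uses to keep results inside [min, max] (typically 0 and pw_pixel_max, i.e. 1023); pmaxsw followed by pminsw is the usual max-then-min pair. Scalar equivalent for one sample:

    /* CLIPW(dst, min, max) for a single 16-bit value */
    static int clipw(int x, int lo, int hi)
    {
        if (x < lo) x = lo; /* pmaxsw */
        if (x > hi) x = hi; /* pminsw */
        return x;
    }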
