Revision a33a2562

View differences:

libavcodec/x86/Makefile
10 10

  
11 11
MMX-OBJS-$(CONFIG_H264DSP)             += x86/h264dsp_mmx.o
12 12
YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock_sse2.o       \
13
                                          x86/h264_weight_sse2.o        \
13
                                          x86/h264_weight.o             \
14 14

  
15 15
YASM-OBJS-$(CONFIG_H264PRED)           += x86/h264_intrapred.o
16 16
MMX-OBJS-$(CONFIG_H264PRED)            += x86/h264_intrapred_init.o
libavcodec/x86/h264_weight.asm
1
;*****************************************************************************
2
;* SSE2-optimized weighted prediction code
3
;*****************************************************************************
4
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
6
;*
7
;* This file is part of FFmpeg.
8
;*
9
;* FFmpeg is free software; you can redistribute it and/or
10
;* modify it under the terms of the GNU Lesser General Public
11
;* License as published by the Free Software Foundation; either
12
;* version 2.1 of the License, or (at your option) any later version.
13
;*
14
;* FFmpeg is distributed in the hope that it will be useful,
15
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
;* Lesser General Public License for more details.
18
;*
19
;* You should have received a copy of the GNU Lesser General Public
20
;* License along with FFmpeg; if not, write to the Free Software
21
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
;******************************************************************************
23

  
24
%include "x86inc.asm"
25

  
26
SECTION .text
27

  
28
;-----------------------------------------------------------------------------
29
; biweight pred:
30
;
31
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
32
;                               int log2_denom, int weightd, int weights,
33
;                               int offset);
34
; and
35
; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
36
;                             int log2_denom, int weight,
37
;                             int offset);
38
;-----------------------------------------------------------------------------
39

  
40
%macro WEIGHT_SETUP 0
41
    add        r4, r4
42
    inc        r4
43
    movd       m3, r3
44
    movd       m5, r4
45
    movd       m6, r2
46
    pslld      m5, m6
47
    psrld      m5, 1
48
%if mmsize == 16
49
    pshuflw    m3, m3, 0
50
    pshuflw    m5, m5, 0
51
    punpcklqdq m3, m3
52
    punpcklqdq m5, m5
53
%else
54
    pshufw     m3, m3, 0
55
    pshufw     m5, m5, 0
56
%endif
57
    pxor       m7, m7
58
%endmacro
59

  
60
%macro WEIGHT_OP 2
61
    movh          m0, [r0+%1]
62
    movh          m1, [r0+%2]
63
    punpcklbw     m0, m7
64
    punpcklbw     m1, m7
65
    pmullw        m0, m3
66
    pmullw        m1, m3
67
    paddsw        m0, m5
68
    paddsw        m1, m5
69
    psraw         m0, m6
70
    psraw         m1, m6
71
    packuswb      m0, m1
72
%endmacro
73

  
74
%macro WEIGHT_FUNC_DBL_MM 1
75
cglobal h264_weight_16x%1_mmx2, 5, 5, 0
76
    WEIGHT_SETUP
77
    mov        r2, %1
78
%if %1 == 16
79
.nextrow
80
    WEIGHT_OP 0,  4
81
    mova     [r0  ], m0
82
    WEIGHT_OP 8, 12
83
    mova     [r0+8], m0
84
    add        r0, r1
85
    dec        r2
86
    jnz .nextrow
87
    REP_RET
88
%else
89
    jmp _ff_h264_weight_16x16_mmx2.nextrow
90
%endif
91
%endmacro
92

  
93
INIT_MMX
94
WEIGHT_FUNC_DBL_MM 16
95
WEIGHT_FUNC_DBL_MM  8
96

  
97
%macro WEIGHT_FUNC_MM 4
98
cglobal h264_weight_%1x%2_%4, 5, 5, %3
99
    WEIGHT_SETUP
100
    mov        r2, %2
101
%if %2 == 16
102
.nextrow
103
    WEIGHT_OP 0, mmsize/2
104
    mova     [r0], m0
105
    add        r0, r1
106
    dec        r2
107
    jnz .nextrow
108
    REP_RET
109
%else
110
    jmp _ff_h264_weight_%1x16_%4.nextrow
111
%endif
112
%endmacro
113

  
114
INIT_MMX
115
WEIGHT_FUNC_MM  8, 16,  0, mmx2
116
WEIGHT_FUNC_MM  8,  8,  0, mmx2
117
WEIGHT_FUNC_MM  8,  4,  0, mmx2
118
INIT_XMM
119
WEIGHT_FUNC_MM 16, 16,  8, sse2
120
WEIGHT_FUNC_MM 16,  8,  8, sse2
121

  
122
%macro WEIGHT_FUNC_HALF_MM 5
123
cglobal h264_weight_%1x%2_%5, 5, 5, %4
124
    WEIGHT_SETUP
125
    mov        r2, %2/2
126
    lea        r3, [r1*2]
127
%if %2 == mmsize
128
.nextrow
129
    WEIGHT_OP 0, r1
130
    movh     [r0], m0
131
%if mmsize == 16
132
    movhps   [r0+r1], m0
133
%else
134
    psrlq      m0, 32
135
    movh     [r0+r1], m0
136
%endif
137
    add        r0, r3
138
    dec        r2
139
    jnz .nextrow
140
    REP_RET
141
%else
142
    jmp _ff_h264_weight_%1x%3_%5.nextrow
143
%endif
144
%endmacro
145

  
146
INIT_MMX
147
WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
148
WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
149
WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
150
INIT_XMM
151
WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
152
WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
153
WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
154

  
155
%macro BIWEIGHT_SETUP 0
156
    add        r6, 1
157
    or         r6, 1
158
    add        r3, 1
159
    movd       m3, r4
160
    movd       m4, r5
161
    movd       m5, r6
162
    movd       m6, r3
163
    pslld      m5, m6
164
    psrld      m5, 1
165
%if mmsize == 16
166
    pshuflw    m3, m3, 0
167
    pshuflw    m4, m4, 0
168
    pshuflw    m5, m5, 0
169
    punpcklqdq m3, m3
170
    punpcklqdq m4, m4
171
    punpcklqdq m5, m5
172
%else
173
    pshufw     m3, m3, 0
174
    pshufw     m4, m4, 0
175
    pshufw     m5, m5, 0
176
%endif
177
    pxor       m7, m7
178
%endmacro
179

  
180
%macro BIWEIGHT_STEPA 3
181
    movh       m%1, [r0+%3]
182
    movh       m%2, [r1+%3]
183
    punpcklbw  m%1, m7
184
    punpcklbw  m%2, m7
185
    pmullw     m%1, m3
186
    pmullw     m%2, m4
187
    paddsw     m%1, m%2
188
%endmacro
189

  
190
%macro BIWEIGHT_STEPB 0
191
    paddsw     m0, m5
192
    paddsw     m1, m5
193
    psraw      m0, m6
194
    psraw      m1, m6
195
    packuswb   m0, m1
196
%endmacro
197

  
198
%macro BIWEIGHT_FUNC_DBL_MM 1
199
cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
200
    BIWEIGHT_SETUP
201
    mov        r3, %1
202
%if %1 == 16
203
.nextrow
204
    BIWEIGHT_STEPA 0, 1, 0
205
    BIWEIGHT_STEPA 1, 2, 4
206
    BIWEIGHT_STEPB
207
    mova       [r0], m0
208
    BIWEIGHT_STEPA 0, 1, 8
209
    BIWEIGHT_STEPA 1, 2, 12
210
    BIWEIGHT_STEPB
211
    mova     [r0+8], m0
212
    add        r0, r2
213
    add        r1, r2
214
    dec        r3
215
    jnz .nextrow
216
    REP_RET
217
%else
218
    jmp _ff_h264_biweight_16x16_mmx2.nextrow
219
%endif
220
%endmacro
221

  
222
INIT_MMX
223
BIWEIGHT_FUNC_DBL_MM 16
224
BIWEIGHT_FUNC_DBL_MM  8
225

  
226
%macro BIWEIGHT_FUNC_MM 4
227
cglobal h264_biweight_%1x%2_%4, 7, 7, %3
228
    BIWEIGHT_SETUP
229
    mov        r3, %2
230
%if %2 == 16
231
.nextrow
232
    BIWEIGHT_STEPA 0, 1, 0
233
    BIWEIGHT_STEPA 1, 2, mmsize/2
234
    BIWEIGHT_STEPB
235
    mova       [r0], m0
236
    add        r0, r2
237
    add        r1, r2
238
    dec        r3
239
    jnz .nextrow
240
    REP_RET
241
%else
242
    jmp _ff_h264_biweight_%1x16_%4.nextrow
243
%endif
244
%endmacro
245

  
246
INIT_MMX
247
BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
248
BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
249
BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
250
INIT_XMM
251
BIWEIGHT_FUNC_MM 16, 16,  8, sse2
252
BIWEIGHT_FUNC_MM 16,  8,  8, sse2
253

  
254
%macro BIWEIGHT_FUNC_HALF_MM 5
255
cglobal h264_biweight_%1x%2_%5, 7, 7, %4
256
    BIWEIGHT_SETUP
257
    mov        r3, %2/2
258
    lea        r4, [r2*2]
259
%if %2 == mmsize
260
.nextrow
261
    BIWEIGHT_STEPA 0, 1, 0
262
    BIWEIGHT_STEPA 1, 2, r2
263
    BIWEIGHT_STEPB
264
    movh       [r0], m0
265
%if mmsize == 16
266
    movhps     [r0+r2], m0
267
%else
268
    psrlq      m0, 32
269
    movh       [r0+r2], m0
270
%endif
271
    add        r0, r4
272
    add        r1, r4
273
    dec        r3
274
    jnz .nextrow
275
    REP_RET
276
%else
277
    jmp _ff_h264_biweight_%1x%3_%5.nextrow
278
%endif
279
%endmacro
280

  
281
INIT_MMX
282
BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
283
BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
284
BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
285
INIT_XMM
286
BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
287
BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
288
BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
289

  
290
%macro BIWEIGHT_SSSE3_SETUP 0
291
    add        r6, 1
292
    or         r6, 1
293
    add        r3, 1
294
    movd       m4, r4
295
    movd       m0, r5
296
    movd       m5, r6
297
    movd       m6, r3
298
    pslld      m5, m6
299
    psrld      m5, 1
300
    punpcklbw  m4, m0
301
    pshuflw    m4, m4, 0
302
    pshuflw    m5, m5, 0
303
    punpcklqdq m4, m4
304
    punpcklqdq m5, m5
305
%endmacro
306

  
307
%macro BIWEIGHT_SSSE3_OP 0
308
    pmaddubsw  m0, m4
309
    pmaddubsw  m2, m4
310
    paddsw     m0, m5
311
    paddsw     m2, m5
312
    psraw      m0, m6
313
    psraw      m2, m6
314
    packuswb   m0, m2
315
%endmacro
316

  
317
%macro BIWEIGHT_SSSE3_16 1
318
cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
319
    BIWEIGHT_SSSE3_SETUP
320
    mov        r3, %1
321

  
322
%if %1 == 16
323
.nextrow
324
    movh       m0, [r0]
325
    movh       m2, [r0+8]
326
    movh       m3, [r1+8]
327
    punpcklbw  m0, [r1]
328
    punpcklbw  m2, m3
329
    BIWEIGHT_SSSE3_OP
330
    mova       [r0], m0
331
    add        r0, r2
332
    add        r1, r2
333
    dec        r3
334
    jnz .nextrow
335
    REP_RET
336
%else
337
    jmp _ff_h264_biweight_16x16_ssse3.nextrow
338
%endif
339
%endmacro
340

  
341
INIT_XMM
342
BIWEIGHT_SSSE3_16 16
343
BIWEIGHT_SSSE3_16  8
344

  
345
%macro BIWEIGHT_SSSE3_8 1
346
cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
347
    BIWEIGHT_SSSE3_SETUP
348
    mov        r3, %1/2
349
    lea        r4, [r2*2]
350

  
351
%if %1 == 16
352
.nextrow
353
    movh       m0, [r0]
354
    movh       m1, [r1]
355
    movh       m2, [r0+r2]
356
    movh       m3, [r1+r2]
357
    punpcklbw  m0, m1
358
    punpcklbw  m2, m3
359
    BIWEIGHT_SSSE3_OP
360
    movh       [r0], m0
361
    movhps     [r0+r2], m0
362
    add        r0, r4
363
    add        r1, r4
364
    dec        r3
365
    jnz .nextrow
366
    REP_RET
367
%else
368
    jmp _ff_h264_biweight_8x16_ssse3.nextrow
369
%endif
370
%endmacro
371

  
372
INIT_XMM
373
BIWEIGHT_SSSE3_8 16
374
BIWEIGHT_SSSE3_8  8
375
BIWEIGHT_SSSE3_8  4
libavcodec/x86/h264_weight_sse2.asm
1
;*****************************************************************************
2
;* SSE2-optimized weighted prediction code
3
;*****************************************************************************
4
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
6
;*
7
;* This file is part of FFmpeg.
8
;*
9
;* FFmpeg is free software; you can redistribute it and/or
10
;* modify it under the terms of the GNU Lesser General Public
11
;* License as published by the Free Software Foundation; either
12
;* version 2.1 of the License, or (at your option) any later version.
13
;*
14
;* FFmpeg is distributed in the hope that it will be useful,
15
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
;* Lesser General Public License for more details.
18
;*
19
;* You should have received a copy of the GNU Lesser General Public
20
;* License along with FFmpeg; if not, write to the Free Software
21
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
;******************************************************************************
23

  
24
%include "x86inc.asm"
25

  
26
SECTION .text
27
INIT_XMM
28

  
29
;-----------------------------------------------------------------------------
30
; biweight pred:
31
;
32
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
33
;                               int log2_denom, int weightd, int weights,
34
;                               int offset);
35
;-----------------------------------------------------------------------------
36

  
37
%macro BIWEIGHT_SSE2_SETUP 0
38
    add        r6, 1
39
    or         r6, 1
40
    add        r3, 1
41
    movd       m3, r4
42
    movd       m4, r5
43
    movd       m5, r6
44
    movd       m6, r3
45
    pslld      m5, m6
46
    psrld      m5, 1
47
    pshuflw    m3, m3, 0
48
    pshuflw    m4, m4, 0
49
    pshuflw    m5, m5, 0
50
    punpcklqdq m3, m3
51
    punpcklqdq m4, m4
52
    punpcklqdq m5, m5
53
    pxor       m7, m7
54
%endmacro
55

  
56
%macro BIWEIGHT_SSE2_STEPA 3
57
    movh       m%1, [r0+%3]
58
    movh       m%2, [r1+%3]
59
    punpcklbw  m%1, m7
60
    punpcklbw  m%2, m7
61
    pmullw     m%1, m3
62
    pmullw     m%2, m4
63
    paddsw     m%1, m%2
64
%endmacro
65

  
66
%macro BIWEIGHT_SSE2_STEPB 0
67
    paddsw     m0, m5
68
    paddsw     m1, m5
69
    psraw      m0, m6
70
    psraw      m1, m6
71
    packuswb   m0, m1
72
%endmacro
73

  
74
cglobal h264_biweight_16x16_sse2, 7, 7, 8
75
    BIWEIGHT_SSE2_SETUP
76
    mov        r3, 16
77

  
78
.nextrow
79
    BIWEIGHT_SSE2_STEPA 0, 1, 0
80
    BIWEIGHT_SSE2_STEPA 1, 2, 8
81
    BIWEIGHT_SSE2_STEPB
82
    mova       [r0], m0
83
    add        r0, r2
84
    add        r1, r2
85
    dec        r3
86
    jnz .nextrow
87
    REP_RET
88

  
89
cglobal h264_biweight_8x8_sse2, 7, 7, 8
90
    BIWEIGHT_SSE2_SETUP
91
    mov        r3, 4
92
    lea        r4, [r2*2]
93

  
94
.nextrow
95
    BIWEIGHT_SSE2_STEPA 0, 1, 0
96
    BIWEIGHT_SSE2_STEPA 1, 2, r2
97
    BIWEIGHT_SSE2_STEPB
98
    movh       [r0], m0
99
    movhps     [r0+r2], m0
100
    add        r0, r4
101
    add        r1, r4
102
    dec        r3
103
    jnz .nextrow
104
    REP_RET
105

  
106
%macro BIWEIGHT_SSSE3_SETUP 0
107
    add        r6, 1
108
    or         r6, 1
109
    add        r3, 1
110
    movd       m4, r4
111
    movd       m0, r5
112
    movd       m5, r6
113
    movd       m6, r3
114
    pslld      m5, m6
115
    psrld      m5, 1
116
    punpcklbw  m4, m0
117
    pshuflw    m4, m4, 0
118
    pshuflw    m5, m5, 0
119
    punpcklqdq m4, m4
120
    punpcklqdq m5, m5
121
%endmacro
122

  
123
%macro BIWEIGHT_SSSE3_OP 0
124
    pmaddubsw  m0, m4
125
    pmaddubsw  m2, m4
126
    paddsw     m0, m5
127
    paddsw     m2, m5
128
    psraw      m0, m6
129
    psraw      m2, m6
130
    packuswb   m0, m2
131
%endmacro
132

  
133
cglobal h264_biweight_16x16_ssse3, 7, 7, 8
134
    BIWEIGHT_SSSE3_SETUP
135
    mov        r3, 16
136

  
137
.nextrow
138
    movh       m0, [r0]
139
    movh       m2, [r0+8]
140
    movh       m3, [r1+8]
141
    punpcklbw  m0, [r1]
142
    punpcklbw  m2, m3
143
    BIWEIGHT_SSSE3_OP
144
    mova       [r0], m0
145
    add        r0, r2
146
    add        r1, r2
147
    dec        r3
148
    jnz .nextrow
149
    REP_RET
150

  
151
cglobal h264_biweight_8x8_ssse3, 7, 7, 8
152
    BIWEIGHT_SSSE3_SETUP
153
    mov        r3, 4
154
    lea        r4, [r2*2]
155

  
156
.nextrow
157
    movh       m0, [r0]
158
    movh       m1, [r1]
159
    movh       m2, [r0+r2]
160
    movh       m3, [r1+r2]
161
    punpcklbw  m0, m1
162
    punpcklbw  m2, m3
163
    BIWEIGHT_SSSE3_OP
164
    movh       [r0], m0
165
    movhps     [r0+r2], m0
166
    add        r0, r4
167
    add        r1, r4
168
    dec        r3
169
    jnz .nextrow
170
    REP_RET
libavcodec/x86/h264dsp_mmx.c
921 921
/***********************************/
922 922
/* weighted prediction */
923 923

  
924
static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
925
{
926
    int x, y;
927
    offset <<= log2_denom;
928
    offset += (1 << log2_denom) >> 1;
929
    __asm__ volatile(
930
        "movd    %0, %%mm4        \n\t"
931
        "movd    %1, %%mm5        \n\t"
932
        "movd    %2, %%mm6        \n\t"
933
        "pshufw  $0, %%mm4, %%mm4 \n\t"
934
        "pshufw  $0, %%mm5, %%mm5 \n\t"
935
        "pxor    %%mm7, %%mm7     \n\t"
936
        :: "g"(weight), "g"(offset), "g"(log2_denom)
937
    );
938
    for(y=0; y<h; y+=2){
939
        for(x=0; x<w; x+=4){
940
            __asm__ volatile(
941
                "movd      %0,    %%mm0 \n\t"
942
                "movd      %1,    %%mm1 \n\t"
943
                "punpcklbw %%mm7, %%mm0 \n\t"
944
                "punpcklbw %%mm7, %%mm1 \n\t"
945
                "pmullw    %%mm4, %%mm0 \n\t"
946
                "pmullw    %%mm4, %%mm1 \n\t"
947
                "paddsw    %%mm5, %%mm0 \n\t"
948
                "paddsw    %%mm5, %%mm1 \n\t"
949
                "psraw     %%mm6, %%mm0 \n\t"
950
                "psraw     %%mm6, %%mm1 \n\t"
951
                "packuswb  %%mm7, %%mm0 \n\t"
952
                "packuswb  %%mm7, %%mm1 \n\t"
953
                "movd      %%mm0, %0    \n\t"
954
                "movd      %%mm1, %1    \n\t"
955
                : "+m"(*(uint32_t*)(dst+x)),
956
                  "+m"(*(uint32_t*)(dst+x+stride))
957
            );
958
        }
959
        dst += 2*stride;
960
    }
961
}
962

  
963
static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
964
{
965
    int x, y;
966
    offset = ((offset + 1) | 1) << log2_denom;
967
    __asm__ volatile(
968
        "movd    %0, %%mm3        \n\t"
969
        "movd    %1, %%mm4        \n\t"
970
        "movd    %2, %%mm5        \n\t"
971
        "movd    %3, %%mm6        \n\t"
972
        "pshufw  $0, %%mm3, %%mm3 \n\t"
973
        "pshufw  $0, %%mm4, %%mm4 \n\t"
974
        "pshufw  $0, %%mm5, %%mm5 \n\t"
975
        "pxor    %%mm7, %%mm7     \n\t"
976
        :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
977
    );
978
    for(y=0; y<h; y++){
979
        for(x=0; x<w; x+=4){
980
            __asm__ volatile(
981
                "movd      %0,    %%mm0 \n\t"
982
                "movd      %1,    %%mm1 \n\t"
983
                "punpcklbw %%mm7, %%mm0 \n\t"
984
                "punpcklbw %%mm7, %%mm1 \n\t"
985
                "pmullw    %%mm3, %%mm0 \n\t"
986
                "pmullw    %%mm4, %%mm1 \n\t"
987
                "paddsw    %%mm1, %%mm0 \n\t"
988
                "paddsw    %%mm5, %%mm0 \n\t"
989
                "psraw     %%mm6, %%mm0 \n\t"
990
                "packuswb  %%mm0, %%mm0 \n\t"
991
                "movd      %%mm0, %0    \n\t"
992
                : "+m"(*(uint32_t*)(dst+x))
993
                :  "m"(*(uint32_t*)(src+x))
994
            );
995
        }
996
        src += stride;
997
        dst += stride;
998
    }
999
}
1000

  
1001
#define H264_WEIGHT(W,H) \
1002
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
1003
    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
1004
} \
1005
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
1006
    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
1007
}
1008

  
1009
H264_WEIGHT(16,16)
1010
H264_WEIGHT(16, 8)
1011
H264_WEIGHT( 8,16)
1012
H264_WEIGHT( 8, 8)
1013
H264_WEIGHT( 8, 4)
1014
H264_WEIGHT( 4, 8)
1015
H264_WEIGHT( 4, 4)
1016
H264_WEIGHT( 4, 2)
1017

  
1018
void ff_h264_biweight_8x8_sse2(uint8_t *dst, uint8_t *src, int stride,
1019
                               int log2_denom, int weightd, int weights,
1020
                               int offset);
1021

  
1022
void ff_h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
1023
                                 int log2_denom, int weightd, int weights,
1024
                                 int offset);
1025

  
1026
void ff_h264_biweight_8x8_ssse3(uint8_t *dst, uint8_t *src, int stride,
1027
                                int log2_denom, int weightd, int weights,
1028
                                int offset);
1029

  
1030
void ff_h264_biweight_16x16_ssse3(uint8_t *dst, uint8_t *src, int stride,
1031
                                  int log2_denom, int weightd, int weights,
1032
                                  int offset);
924
#define H264_WEIGHT(W, H, OPT) \
925
void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
926
    int stride, int log2_denom, int weight, int offset);
927

  
928
#define H264_BIWEIGHT(W, H, OPT) \
929
void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
930
    uint8_t *src, int stride, int log2_denom, int weightd, \
931
    int weights, int offset);
932

  
933
#define H264_BIWEIGHT_MMX(W,H) \
934
H264_WEIGHT  (W, H, mmx2) \
935
H264_BIWEIGHT(W, H, mmx2)
936

  
937
#define H264_BIWEIGHT_MMX_SSE(W,H) \
938
H264_BIWEIGHT_MMX(W, H) \
939
H264_WEIGHT      (W, H, sse2) \
940
H264_BIWEIGHT    (W, H, sse2) \
941
H264_BIWEIGHT    (W, H, ssse3)
942

  
943
H264_BIWEIGHT_MMX_SSE(16, 16)
944
H264_BIWEIGHT_MMX_SSE(16,  8)
945
H264_BIWEIGHT_MMX_SSE( 8, 16)
946
H264_BIWEIGHT_MMX_SSE( 8,  8)
947
H264_BIWEIGHT_MMX_SSE( 8,  4)
948
H264_BIWEIGHT_MMX    ( 4,  8)
949
H264_BIWEIGHT_MMX    ( 4,  4)
950
H264_BIWEIGHT_MMX    ( 4,  2)
1033 951

  
1034 952
void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
1035 953
void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
......
1076 994
            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
1077 995
            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
1078 996
            c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
997
        }
998
        if(mm_flags & FF_MM_SSE2){
999
            c->h264_idct8_add = ff_h264_idct8_add_sse2;
1000
            c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
1001
        }
1079 1002

  
1003
#if HAVE_YASM
1004
        if (mm_flags & FF_MM_MMX2){
1005
#if ARCH_X86_32
1006
            c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
1007
            c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
1008
#endif
1080 1009
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
1081 1010
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
1082 1011
            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
......
1094 1023
            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
1095 1024
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
1096 1025
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
1097
        }
1098
        if(mm_flags & FF_MM_SSE2){
1099
            c->h264_idct8_add = ff_h264_idct8_add_sse2;
1100
            c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
1101
        }
1102 1026

  
1103
#if HAVE_YASM
1104
        if (mm_flags & FF_MM_MMX2){
1105
#if ARCH_X86_32
1106
            c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
1107
            c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
1108
#endif
1109 1027
            if( mm_flags&FF_MM_SSE2 ){
1028
                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
1029
                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
1030
                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
1031
                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
1032
                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
1033

  
1110 1034
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
1035
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
1036
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
1111 1037
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
1038
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
1039

  
1112 1040
#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110
1113 1041
                c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
1114 1042
                c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
......
1123 1051
            }
1124 1052
            if ( mm_flags&FF_MM_SSSE3 ){
1125 1053
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
1054
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
1055
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
1126 1056
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
1057
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
1127 1058
            }
1128 1059
        }
1129 1060
#endif

Also available in: Unified diff