Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / vp8dsp-init.c @ 268821e7

History | View | Annotate | Download (17 KB)

/*
 * VP8 DSP functions x86-optimized
 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22

    
23
#include "libavutil/x86_cpu.h"
24
#include "libavcodec/vp8dsp.h"
25

    
26
#if HAVE_YASM
27

    
28
/*
 * MC functions
 */
31
/*
 * Single-direction subpel ("epel") put routines. Only the prototypes appear
 * here; the bodies are provided outside this file.
 * Name layout: ff_put_vp8_epel<width>_<h|v><taps>_<instruction set>.
 * Every routine has the same signature, which is also the signature of the
 * entries stored in put_vp8_epel_pixels_tab further down in this file.
 * NOTE(review): mx/my are presumably the horizontal/vertical subpel
 * positions -- confirm against the assembly.
 */

/* MMXEXT, 4 bytes wide */
void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, int dststride,
                                uint8_t *src, int srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, int dststride,
                                uint8_t *src, int srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, int dststride,
                                uint8_t *src, int srcstride,
                                int height, int mx, int my);
void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, int dststride,
                                uint8_t *src, int srcstride,
                                int height, int mx, int my);

/* SSE2, 8 bytes wide */
void ff_put_vp8_epel8_h4_sse2(uint8_t *dst, int dststride,
                              uint8_t *src, int srcstride,
                              int height, int mx, int my);
void ff_put_vp8_epel8_h6_sse2(uint8_t *dst, int dststride,
                              uint8_t *src, int srcstride,
                              int height, int mx, int my);
void ff_put_vp8_epel8_v4_sse2(uint8_t *dst, int dststride,
                              uint8_t *src, int srcstride,
                              int height, int mx, int my);
void ff_put_vp8_epel8_v6_sse2(uint8_t *dst, int dststride,
                              uint8_t *src, int srcstride,
                              int height, int mx, int my);

/* SSSE3, 4 and 8 bytes wide */
void ff_put_vp8_epel4_h4_ssse3(uint8_t *dst, int dststride,
                               uint8_t *src, int srcstride,
                               int height, int mx, int my);
void ff_put_vp8_epel4_h6_ssse3(uint8_t *dst, int dststride,
                               uint8_t *src, int srcstride,
                               int height, int mx, int my);
void ff_put_vp8_epel4_v4_ssse3(uint8_t *dst, int dststride,
                               uint8_t *src, int srcstride,
                               int height, int mx, int my);
void ff_put_vp8_epel4_v6_ssse3(uint8_t *dst, int dststride,
                               uint8_t *src, int srcstride,
                               int height, int mx, int my);
void ff_put_vp8_epel8_h4_ssse3(uint8_t *dst, int dststride,
                               uint8_t *src, int srcstride,
                               int height, int mx, int my);
void ff_put_vp8_epel8_h6_ssse3(uint8_t *dst, int dststride,
                               uint8_t *src, int srcstride,
                               int height, int mx, int my);
void ff_put_vp8_epel8_v4_ssse3(uint8_t *dst, int dststride,
                               uint8_t *src, int srcstride,
                               int height, int mx, int my);
void ff_put_vp8_epel8_v6_ssse3(uint8_t *dst, int dststride,
                               uint8_t *src, int srcstride,
                               int height, int mx, int my);
81

    
82
/*
 * Single-direction bilinear put routines (bodies provided outside this file).
 * Name layout: ff_put_vp8_bilinear<width>_<h|v>_<instruction set>.
 * The signature is the same as for the epel routines.
 */

/* horizontal pass */
void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, int dststride,
                                   uint8_t *src, int srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear8_h_sse2(uint8_t *dst, int dststride,
                                 uint8_t *src, int srcstride,
                                 int height, int mx, int my);
void ff_put_vp8_bilinear4_h_ssse3(uint8_t *dst, int dststride,
                                  uint8_t *src, int srcstride,
                                  int height, int mx, int my);
void ff_put_vp8_bilinear8_h_ssse3(uint8_t *dst, int dststride,
                                  uint8_t *src, int srcstride,
                                  int height, int mx, int my);

/* vertical pass */
void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, int dststride,
                                   uint8_t *src, int srcstride,
                                   int height, int mx, int my);
void ff_put_vp8_bilinear8_v_sse2(uint8_t *dst, int dststride,
                                 uint8_t *src, int srcstride,
                                 int height, int mx, int my);
void ff_put_vp8_bilinear4_v_ssse3(uint8_t *dst, int dststride,
                                  uint8_t *src, int srcstride,
                                  int height, int mx, int my);
void ff_put_vp8_bilinear8_v_ssse3(uint8_t *dst, int dststride,
                                  uint8_t *src, int srcstride,
                                  int height, int mx, int my);
107

    
108

    
109
/*
 * put_pixels routines (bodies provided outside this file). The init function
 * in this file installs them in the [0][0] slot of both the epel and the
 * bilinear tables, so they share the MC signature.
 */
void ff_put_vp8_pixels8_mmx(uint8_t *dst, int dststride,
                            uint8_t *src, int srcstride,
                            int height, int mx, int my);
void ff_put_vp8_pixels16_mmx(uint8_t *dst, int dststride,
                             uint8_t *src, int srcstride,
                             int height, int mx, int my);
void ff_put_vp8_pixels16_sse(uint8_t *dst, int dststride,
                             uint8_t *src, int srcstride,
                             int height, int mx, int my);
118

    
119
/*
 * TAP_W16(ISA, KIND, TAP)
 *
 * Defines the static function ff_put_vp8_<KIND>16_<TAP>_<ISA>() by calling
 * the matching 8-wide routine ff_put_vp8_<KIND>8_<TAP>_<ISA>() twice: once at
 * dst/src and once at dst + 8 / src + 8. Strides, height, mx and my are
 * forwarded unchanged to both calls.
 */
#define TAP_W16(ISA, KIND, TAP)                                             \
static void ff_put_vp8_ ## KIND ## 16_ ## TAP ## _ ## ISA(                  \
    uint8_t *dst,  int dststride, uint8_t *src,                             \
    int srcstride, int height, int mx, int my)                              \
{                                                                           \
    /* left 8 bytes of every row */                                         \
    ff_put_vp8_ ## KIND ## 8_ ## TAP ## _ ## ISA(                           \
        dst, dststride, src, srcstride, height, mx, my);                    \
    /* right 8 bytes of every row */                                        \
    ff_put_vp8_ ## KIND ## 8_ ## TAP ## _ ## ISA(                           \
        dst + 8, dststride, src + 8, srcstride, height, mx, my);            \
}
129
/*
 * TAP_W8(ISA, KIND, TAP)
 *
 * Defines the static function ff_put_vp8_<KIND>8_<TAP>_<ISA>() by calling
 * the matching 4-wide routine ff_put_vp8_<KIND>4_<TAP>_<ISA>() twice: once at
 * dst/src and once at dst + 4 / src + 4. Strides, height, mx and my are
 * forwarded unchanged to both calls.
 */
#define TAP_W8(ISA, KIND, TAP)                                              \
static void ff_put_vp8_ ## KIND ## 8_ ## TAP ## _ ## ISA(                   \
    uint8_t *dst,  int dststride, uint8_t *src,                             \
    int srcstride, int height, int mx, int my)                              \
{                                                                           \
    /* left 4 bytes of every row */                                         \
    ff_put_vp8_ ## KIND ## 4_ ## TAP ## _ ## ISA(                           \
        dst, dststride, src, srcstride, height, mx, my);                    \
    /* right 4 bytes of every row */                                        \
    ff_put_vp8_ ## KIND ## 4_ ## TAP ## _ ## ISA(                           \
        dst + 4, dststride, src + 4, srcstride, height, mx, my);            \
}
139

    
140
TAP_W8 (mmxext, epel, h4)
141
TAP_W8 (mmxext, epel, h6)
142
TAP_W16(mmxext, epel, h6)
143
TAP_W8 (mmxext, epel, v4)
144
TAP_W8 (mmxext, epel, v6)
145
TAP_W16(mmxext, epel, v6)
146
TAP_W8 (mmxext, bilinear, h)
147
TAP_W16(mmxext, bilinear, h)
148
TAP_W8 (mmxext, bilinear, v)
149
TAP_W16(mmxext, bilinear, v)
150

    
151
TAP_W16(sse2,   epel, h6)
152
TAP_W16(sse2,   epel, v6)
153
TAP_W16(sse2,   bilinear, h)
154
TAP_W16(sse2,   bilinear, v)
155

    
156
TAP_W16(ssse3,  epel, h6)
157
TAP_W16(ssse3,  epel, v6)
158
TAP_W16(ssse3,  bilinear, h)
159
TAP_W16(ssse3,  bilinear, v)
160

    
161
/*
 * HVTAP(ISA, ALIGNMENT, HTAPS, VTAPS, WIDTH, MAXROWS)
 *
 * Defines the static two-pass function
 *   ff_put_vp8_epel<WIDTH>_h<HTAPS>v<VTAPS>_<ISA>().
 *
 * Pass 1 runs the horizontal HTAPS-tap routine over height + VTAPS - 1 source
 * rows, starting VTAPS / 2 - 1 rows above src, and stores the result in a
 * stack buffer whose row stride is WIDTH.
 * Pass 2 runs the vertical VTAPS-tap routine on that buffer, starting at the
 * buffer row that corresponds to the original src row, and writes dst.
 *
 * The buffer holds MAXROWS + VTAPS - 1 rows, so height must not exceed
 * MAXROWS (not checked here).
 */
#define HVTAP(ISA, ALIGNMENT, HTAPS, VTAPS, WIDTH, MAXROWS)                    \
static void ff_put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _ ## ISA( \
    uint8_t *dst, int dststride, uint8_t *src,                                 \
    int srcstride, int height, int mx, int my)                                 \
{                                                                              \
    DECLARE_ALIGNED(ALIGNMENT, uint8_t, rows)[WIDTH * (MAXROWS + VTAPS - 1)];  \
    /* rows the vertical filter needs above the first output row */            \
    const int lead = VTAPS / 2 - 1;                                            \
    uint8_t *first_row = rows + WIDTH * lead;                                  \
    ff_put_vp8_epel ## WIDTH ## _h ## HTAPS ## _ ## ISA(                       \
        rows, WIDTH, src - srcstride * lead, srcstride,                        \
        height + VTAPS - 1, mx, my);                                           \
    ff_put_vp8_epel ## WIDTH ## _v ## VTAPS ## _ ## ISA(                       \
        dst, dststride, first_row, WIDTH, height, mx, my);                     \
}
174

    
175
/* MMXEXT two-pass epel: 4-wide (up to 8 rows) and 8-wide (up to 16 rows) */
#define HVTAPMMX(HTAPS, VTAPS)         \
HVTAP(mmxext, 8, HTAPS, VTAPS,  4,  8) \
HVTAP(mmxext, 8, HTAPS, VTAPS,  8, 16)

HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)
/* 16-wide exists only as 6x6 */
HVTAP(mmxext, 8, 6, 6, 16, 16)

/* SSE2 and SSSE3 two-pass epel of the given width, up to 16 rows */
#define HVTAPSSE2(HTAPS, VTAPS, WIDTH)     \
HVTAP(sse2,  16, HTAPS, VTAPS, WIDTH, 16)  \
HVTAP(ssse3, 16, HTAPS, VTAPS, WIDTH, 16)

HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)

/* SSSE3 additionally has 4-wide variants (up to 8 rows) */
HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)
199

    
200
/*
 * HVBILIN(ISA, ALIGNMENT, WIDTH, MAXROWS)
 *
 * Defines the static two-pass function ff_put_vp8_bilinear<WIDTH>_hv_<ISA>().
 *
 * Pass 1 runs the horizontal bilinear routine over height + 1 source rows
 * into a stack buffer whose row stride is WIDTH; pass 2 runs the vertical
 * bilinear routine on that buffer and writes height rows to dst.
 *
 * The buffer holds MAXROWS + 2 rows, so height must not exceed MAXROWS
 * (not checked here).
 */
#define HVBILIN(ISA, ALIGNMENT, WIDTH, MAXROWS)                       \
static void ff_put_vp8_bilinear ## WIDTH ## _hv_ ## ISA(              \
    uint8_t *dst, int dststride, uint8_t *src,                        \
    int srcstride, int height, int mx, int my)                        \
{                                                                     \
    DECLARE_ALIGNED(ALIGNMENT, uint8_t, rows)[WIDTH * (MAXROWS + 2)]; \
    /* horizontal pass: one extra row for the vertical pass */        \
    ff_put_vp8_bilinear ## WIDTH ## _h_ ## ISA(                       \
        rows, WIDTH, src, srcstride, height + 1, mx, my);             \
    /* vertical pass from the intermediate rows into dst */           \
    ff_put_vp8_bilinear ## WIDTH ## _v_ ## ISA(                       \
        dst, dststride, rows, WIDTH, height, mx, my);                 \
}
211

    
212
/* Two-pass bilinear routines: (ISA, buffer alignment, width, max rows) */

/* MMXEXT: all three widths */
HVBILIN(mmxext, 8,  4,  8)
HVBILIN(mmxext, 8,  8, 16)
HVBILIN(mmxext, 8, 16, 16)

/* SSE2: no 4-wide variant */
HVBILIN(sse2,   8,  8, 16)
HVBILIN(sse2,   8, 16, 16)

/* SSSE3: all three widths */
HVBILIN(ssse3,  8,  4,  8)
HVBILIN(ssse3,  8,  8, 16)
HVBILIN(ssse3,  8, 16, 16)
220

    
221
/*
 * Inverse-transform routines (bodies provided outside this file).
 * The init function installs them as vp8_idct_dc_add, vp8_idct_add and
 * vp8_luma_dc_wht.
 */
void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
225

    
226
/*
 * Loop-filter routines (bodies provided outside this file).
 * Name layout: ff_vp8_<v|h>_loop_filter<variant>_<instruction set>.
 * NOTE(review): flim, e, i and hvt are presumably the filter limit, edge
 * limit, interior limit and high-edge-variance threshold -- confirm against
 * the assembly.
 */

/* simple filter */
void ff_vp8_v_loop_filter_simple_mmx(uint8_t *dst, int stride, int flim);
void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
void ff_vp8_v_loop_filter_simple_sse2(uint8_t *dst, int stride, int flim);
void ff_vp8_h_loop_filter_simple_mmx(uint8_t *dst, int stride, int flim);
void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
void ff_vp8_h_loop_filter_simple_sse2(uint8_t *dst, int stride, int flim);

/* inner filter, single plane ("16y") */
void ff_vp8_v_loop_filter16y_inner_mmx(uint8_t *dst, int stride,
                                       int e, int i, int hvt);
void ff_vp8_v_loop_filter16y_inner_mmxext(uint8_t *dst, int stride,
                                          int e, int i, int hvt);
void ff_vp8_v_loop_filter16y_inner_sse2(uint8_t *dst, int stride,
                                        int e, int i, int hvt);
void ff_vp8_h_loop_filter16y_inner_mmx(uint8_t *dst, int stride,
                                       int e, int i, int hvt);
void ff_vp8_h_loop_filter16y_inner_mmxext(uint8_t *dst, int stride,
                                          int e, int i, int hvt);
void ff_vp8_h_loop_filter16y_inner_sse2(uint8_t *dst, int stride,
                                        int e, int i, int hvt);

/* inner filter, two planes passed together ("8uv"); s is presumably the
 * stride shared by both planes -- confirm against the assembly */
void ff_vp8_v_loop_filter8uv_inner_mmx(uint8_t *dstU, uint8_t *dstV,
                                       int s, int e, int i, int hvt);
void ff_vp8_v_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
                                          int s, int e, int i, int hvt);
void ff_vp8_v_loop_filter8uv_inner_sse2(uint8_t *dstU, uint8_t *dstV,
                                        int s, int e, int i, int hvt);
void ff_vp8_h_loop_filter8uv_inner_mmx(uint8_t *dstU, uint8_t *dstV,
                                       int s, int e, int i, int hvt);
void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
                                          int s, int e, int i, int hvt);
void ff_vp8_h_loop_filter8uv_inner_sse2(uint8_t *dstU, uint8_t *dstV,
                                        int s, int e, int i, int hvt);
258
#endif
259

    
260
/*
 * VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs the three six-tap routines of width SIZE and instruction set OPT
 * in row IDX of c->put_vp8_epel_pixels_tab:
 *   [IDX][0][2] = h6, [IDX][2][0] = v6, [IDX][2][2] = h6v6.
 * Expects a pointer named `c` in scope at the point of use.
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement
 * and stays correct under an unbraced if/else. Terminate each use with ';'.
 */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT; \
    } while (0)
264

    
265
/*
 * VP8_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs every filtered epel routine of width SIZE and instruction set OPT
 * in row IDX of c->put_vp8_epel_pixels_tab. The second table index selects
 * the vertical filter and the third the horizontal one (1 = four-tap,
 * 2 = six-tap). The four-tap and mixed entries are set here; the pure
 * six-tap entries are delegated to VP8_LUMA_MC_FUNC. Slot [IDX][0][0] is not
 * touched. Expects a pointer named `c` in scope at the point of use.
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement
 * and stays correct under an unbraced if/else. Terminate each use with ';'.
 */
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
        VP8_LUMA_MC_FUNC(IDX, SIZE, OPT); \
    } while (0)
272

    
273
/*
 * VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs the bilinear routines of width SIZE and instruction set OPT in
 * row IDX of c->put_vp8_bilinear_pixels_tab. Indices 1 and 2 of each
 * direction receive the same function: horizontal-only slots get _h_,
 * vertical-only slots get _v_, and all mixed slots get _hv_.
 * Slot [IDX][0][0] is not touched.
 * Expects a pointer named `c` in scope at the point of use.
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement
 * and stays correct under an unbraced if/else. Terminate each use with ';'.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    } while (0)
282

    
283

    
284
/**
 * Fill in the x86-specific entries of a VP8 DSP context.
 *
 * The CPU feature mask is obtained from mm_support() and stored in mm_flags.
 * Each feature group is then tested in a fixed order (MMX, MMX2, SSE, SSE2,
 * SSSE3, SSE4); when several groups are present, a later group overwrites
 * the pointers installed by an earlier one. Without HAVE_YASM only mm_flags
 * is refreshed and the context is left as it was.
 *
 * @param c DSP context whose function pointers are overwritten
 */
av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
{
    mm_flags = mm_support();

#if HAVE_YASM
    if (mm_flags & FF_MM_MMX) {
        /* slot [0][0] of both MC tables gets the put_pixels routine */
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
        c->put_vp8_epel_pixels_tab[0][0][0]     = ff_put_vp8_pixels16_mmx;
        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
        c->put_vp8_epel_pixels_tab[1][0][0]     = ff_put_vp8_pixels8_mmx;

        c->vp8_idct_add    = ff_vp8_idct_add_mmx;
        c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;

        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;

        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
    }

    /* There are no 4-tap routines for width 16: that width is used only for
     * luma, and luma is always either a copy or a six-tap filter. */
    if (mm_flags & FF_MM_MMX2) {
        VP8_LUMA_MC_FUNC(0, 16, mmxext);
        VP8_MC_FUNC(1, 8, mmxext);
        VP8_MC_FUNC(2, 4, mmxext);

        VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
        VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
        VP8_BILINEAR_MC_FUNC(2, 4, mmxext);

        c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmxext;

        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;

        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
    }

    if (mm_flags & FF_MM_SSE) {
        /* only the 16-wide put_pixels routine has an SSE version */
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
        c->put_vp8_epel_pixels_tab[0][0][0]     = ff_put_vp8_pixels16_sse;
    }

    /* Installed for both SSE2 and SSE2SLOW. */
    if (mm_flags & (FF_MM_SSE2 | FF_MM_SSE2SLOW)) {
        VP8_LUMA_MC_FUNC(0, 16, sse2);
        VP8_MC_FUNC(1, 8, sse2);

        VP8_BILINEAR_MC_FUNC(0, 16, sse2);
        VP8_BILINEAR_MC_FUNC(1, 8, sse2);

        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;

        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
    }

    /* The horizontal inner loop filters are installed only when SSE2 itself
     * is flagged, not for SSE2SLOW alone. */
    if (mm_flags & FF_MM_SSE2) {
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
    }

    if (mm_flags & FF_MM_SSSE3) {
        VP8_LUMA_MC_FUNC(0, 16, ssse3);
        VP8_MC_FUNC(1, 8, ssse3);
        VP8_MC_FUNC(2, 4, ssse3);

        VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
        VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
        VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
    }

    if (mm_flags & FF_MM_SSE4) {
        c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
    }
#endif
}