/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "dsputil_mmx.h"
#include "libavcodec/vc1dsp.h"

#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"

/** Add rounder from mm7 to mm3 and pack result at destination */
#define NORMALIZE_MMX(SHIFT)                                    \
     "paddw     %%mm7, %%mm3           \n\t" /* +bias-r */      \
     "paddw     %%mm7, %%mm4           \n\t" /* +bias-r */      \
     "psraw     "SHIFT", %%mm3         \n\t"                    \
     "psraw     "SHIFT", %%mm4         \n\t"

#define TRANSFER_DO_PACK(OP)                    \
     "packuswb  %%mm4, %%mm3           \n\t"    \
     OP((%2), %%mm3)                            \
     "movq      %%mm3, (%2)            \n\t"

#define TRANSFER_DONT_PACK(OP)                  \
     OP(0(%2), %%mm3)                           \
     OP(8(%2), %%mm4)                           \
     "movq      %%mm3, 0(%2)           \n\t"    \
     "movq      %%mm4, 8(%2)           \n\t"

/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg)  "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

/** Compute the rounder 32-r or 8-r and unpack it to mm7 */
#define LOAD_ROUNDER_MMX(ROUND)                 \
     "movd      "ROUND", %%mm7         \n\t"    \
     "punpcklwd %%mm7, %%mm7           \n\t"    \
     "punpckldq %%mm7, %%mm7           \n\t"

#define SHIFT2_LINE(OFF, R0,R1,R2,R3)           \
    "paddw     %%mm"#R2", %%mm"#R1"    \n\t"    \
    "movd      (%0,%3), %%mm"#R0"      \n\t"    \
    "pmullw    %%mm6, %%mm"#R1"        \n\t"    \
    "punpcklbw %%mm0, %%mm"#R0"        \n\t"    \
    "movd      (%0,%2), %%mm"#R3"      \n\t"    \
    "psubw     %%mm"#R0", %%mm"#R1"    \n\t"    \
    "punpcklbw %%mm0, %%mm"#R3"        \n\t"    \
    "paddw     %%mm7, %%mm"#R1"        \n\t"    \
    "psubw     %%mm"#R3", %%mm"#R1"    \n\t"    \
    "psraw     %4, %%mm"#R1"           \n\t"    \
    "movq      %%mm"#R1", "#OFF"(%1)   \n\t"    \
    "add       %2, %0                  \n\t"

/** Sacrificing mm6 allows pipelining the loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov       $3, %%"REG_c"           \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq      "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1:                                \n\t"
        "movd      (%0), %%mm2             \n\t"
        "add       %2, %0                  \n\t"
        "movd      (%0), %%mm3             \n\t"
        "punpcklbw %%mm0, %%mm2            \n\t"
        "punpcklbw %%mm0, %%mm3            \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub       %6, %0                  \n\t"
        "add       $8, %1                  \n\t"
        "dec       %%"REG_c"               \n\t"
        "jnz 1b                            \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
        : "%"REG_c, "memory"
    );
}

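/*
 * Editor's note: the following scalar sketch is not part of the original file.
 * It illustrates what the MMX loop above computes, assuming the (-1, 9, 9, -1)
 * half-pel taps and the 8-row by 12-column 16-bit intermediate layout used by
 * the two-pass interpolation below; the function name is illustrative only.
 */
static inline void vc1_put_ver_16b_shift2_c_sketch(int16_t *dst, const uint8_t *src,
                                                   int stride, int rnd, int shift)
{
    int x, y;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 12; x++)
            /* taps centered between rows y and y+1, rounded and pre-shifted */
            dst[y * 12 + x] = (-src[x - stride] + 9 * src[x] +
                               9 * src[x + stride] - src[x + 2 * stride] + rnd) >> shift;
        src += stride;
    }
}
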
/**
 * Data is already unpacked, so some operations can be done directly from
 * memory.
 */
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
{\
    int h = 8;\
\
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1:                                \n\t"\
        "movq      2*0+0(%1), %%mm1        \n\t"\
        "movq      2*0+8(%1), %%mm2        \n\t"\
        "movq      2*1+0(%1), %%mm3        \n\t"\
        "movq      2*1+8(%1), %%mm4        \n\t"\
        "paddw     2*3+0(%1), %%mm1        \n\t"\
        "paddw     2*3+8(%1), %%mm2        \n\t"\
        "paddw     2*2+0(%1), %%mm3        \n\t"\
        "paddw     2*2+8(%1), %%mm4        \n\t"\
        "pmullw    %%mm5, %%mm3            \n\t"\
        "pmullw    %%mm5, %%mm4            \n\t"\
        "psubw     %%mm1, %%mm3            \n\t"\
        "psubw     %%mm2, %%mm4            \n\t"\
        NORMALIZE_MMX("$7")\
        /* Remove bias */\
        "paddw     %%mm6, %%mm3            \n\t"\
        "paddw     %%mm6, %%mm4            \n\t"\
        TRANSFER_DO_PACK(OP)\
        "add       $24, %1                 \n\t"\
        "add       %3, %2                  \n\t"\
        "decl      %0                      \n\t"\
        "jnz 1b                            \n\t"\
        : "+r"(h), "+r" (src), "+r" (dst)\
        : "r"(stride), "m"(rnd)\
        : "memory"\
    );\
}

VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)


/**
|
161 |
* Purely vertical or horizontal 1/2 shift interpolation.
|
162 |
* Sacrify mm6 for *9 factor.
|
163 |
*/
|
164 |
#define VC1_SHIFT2(OP, OPNAME)\
|
165 |
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\ |
166 |
x86_reg stride, int rnd, x86_reg offset)\
|
167 |
{\ |
168 |
rnd = 8-rnd;\
|
169 |
__asm__ volatile(\
|
170 |
"mov $8, %%"REG_c" \n\t"\ |
171 |
LOAD_ROUNDER_MMX("%5")\
|
172 |
"movq "MANGLE(ff_pw_9)", %%mm6\n\t"\ |
173 |
"1: \n\t"\
|
174 |
"movd 0(%0 ), %%mm3 \n\t"\
|
175 |
"movd 4(%0 ), %%mm4 \n\t"\
|
176 |
"movd 0(%0,%2), %%mm1 \n\t"\
|
177 |
"movd 4(%0,%2), %%mm2 \n\t"\
|
178 |
"add %2, %0 \n\t"\
|
179 |
"punpcklbw %%mm0, %%mm3 \n\t"\
|
180 |
"punpcklbw %%mm0, %%mm4 \n\t"\
|
181 |
"punpcklbw %%mm0, %%mm1 \n\t"\
|
182 |
"punpcklbw %%mm0, %%mm2 \n\t"\
|
183 |
"paddw %%mm1, %%mm3 \n\t"\
|
184 |
"paddw %%mm2, %%mm4 \n\t"\
|
185 |
"movd 0(%0,%3), %%mm1 \n\t"\
|
186 |
"movd 4(%0,%3), %%mm2 \n\t"\
|
187 |
"pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\ |
188 |
"pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\ |
189 |
"punpcklbw %%mm0, %%mm1 \n\t"\
|
190 |
"punpcklbw %%mm0, %%mm2 \n\t"\
|
191 |
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\ |
192 |
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\ |
193 |
"movd 0(%0,%2), %%mm1 \n\t"\
|
194 |
"movd 4(%0,%2), %%mm2 \n\t"\
|
195 |
"punpcklbw %%mm0, %%mm1 \n\t"\
|
196 |
"punpcklbw %%mm0, %%mm2 \n\t"\
|
197 |
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\ |
198 |
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\ |
199 |
NORMALIZE_MMX("$4")\
|
200 |
"packuswb %%mm4, %%mm3 \n\t"\
|
201 |
OP((%1), %%mm3)\
|
202 |
"movq %%mm3, (%1) \n\t"\
|
203 |
"add %6, %0 \n\t"\
|
204 |
"add %4, %1 \n\t"\
|
205 |
"dec %%"REG_c" \n\t"\ |
206 |
"jnz 1b \n\t"\
|
207 |
: "+r"(src), "+r"(dst)\ |
208 |
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\ |
209 |
"g"(stride-offset)\
|
210 |
: "%"REG_c, "memory"\ |
211 |
);\ |
212 |
} |
213 |
|
214 |
VC1_SHIFT2(OP_PUT, put_) |
215 |
VC1_SHIFT2(OP_AVG, avg_) |
216 |
|
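/*
 * Editor's note: illustrative scalar equivalent (not in the original file) of
 * the put_ variant generated above, assuming offset selects the filter
 * direction (1 for horizontal, stride for vertical) and that av_clip_uint8()
 * from libavutil is visible here, as it is throughout libavcodec.
 */
static inline void vc1_put_shift2_c_sketch(uint8_t *dst, const uint8_t *src,
                                           int stride, int rnd, int offset)
{
    int x, y;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            /* (-1, 9, 9, -1) taps, sum 16, hence the final >> 4 */
            int v = (-src[x - offset] + 9 * src[x] + 9 * src[x + offset]
                     - src[x + 2 * offset] + 8 - rnd) >> 4;
            dst[x] = av_clip_uint8(v);
        }
        src += stride;
        dst += stride;
    }
}
/* The avg_ variant differs only in averaging the packed result with dst. */
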
/**
 * Core of the 1/4 and 3/4 shift bicubic interpolation.
 *
 * @param UNPACK  Macro unpacking arguments from 8 to 16bits (can be empty).
 * @param MOVQ    "movd 1" for packed 8-bit data, "movq 2" if the data read
 *                is already unpacked to 16 bits.
 * @param A1      Address of 1st tap (beware of unpacked/packed).
 * @param A2      Address of 2nd tap
 * @param A3      Address of 3rd tap
 * @param A4      Address of 4th tap
 */
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4)       \
     MOVQ "*0+"A1", %%mm1       \n\t"                           \
     MOVQ "*4+"A1", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "pmullw    "MANGLE(ff_pw_3)", %%mm1\n\t"                   \
     "pmullw    "MANGLE(ff_pw_3)", %%mm2\n\t"                   \
     MOVQ "*0+"A2", %%mm3       \n\t"                           \
     MOVQ "*4+"A2", %%mm4       \n\t"                           \
     UNPACK("%%mm3")                                            \
     UNPACK("%%mm4")                                            \
     "pmullw    %%mm6, %%mm3    \n\t" /* *18 */                 \
     "pmullw    %%mm6, %%mm4    \n\t" /* *18 */                 \
     "psubw     %%mm1, %%mm3    \n\t" /* 18,-3 */               \
     "psubw     %%mm2, %%mm4    \n\t" /* 18,-3 */               \
     MOVQ "*0+"A4", %%mm1       \n\t"                           \
     MOVQ "*4+"A4", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "psllw     $2, %%mm1       \n\t" /* 4* */                  \
     "psllw     $2, %%mm2       \n\t" /* 4* */                  \
     "psubw     %%mm1, %%mm3    \n\t" /* -4,18,-3 */            \
     "psubw     %%mm2, %%mm4    \n\t" /* -4,18,-3 */            \
     MOVQ "*0+"A3", %%mm1       \n\t"                           \
     MOVQ "*4+"A3", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "pmullw    %%mm5, %%mm1    \n\t" /* *53 */                 \
     "pmullw    %%mm5, %%mm2    \n\t" /* *53 */                 \
     "paddw     %%mm1, %%mm3    \n\t" /* 4,53,18,-3 */          \
     "paddw     %%mm2, %%mm4    \n\t" /* 4,53,18,-3 */

/**
 * Macro to build the vertical 16bits version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (src_stride) and %4 (3*src_stride).
 *
 * @param  NAME   Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                    \
static void                                                             \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src,      \
                                 x86_reg src_stride,                    \
                                 int rnd, int64_t shift)                \
{                                                                       \
    int h = 8;                                                          \
    src -= src_stride;                                                  \
    __asm__ volatile(                                                   \
        LOAD_ROUNDER_MMX("%5")                                          \
        "movq      "MANGLE(ff_pw_53)", %%mm5\n\t"                       \
        "movq      "MANGLE(ff_pw_18)", %%mm6\n\t"                       \
        ".p2align 3                \n\t"                                \
        "1:                        \n\t"                                \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4)        \
        NORMALIZE_MMX("%6")                                             \
        TRANSFER_DONT_PACK(OP_PUT)                                      \
        /* Last 3 (in fact 4) bytes on the line */                      \
        "movd      8+"A1", %%mm1   \n\t"                                \
        DO_UNPACK("%%mm1")                                              \
        "movq      %%mm1, %%mm3    \n\t"                                \
        "paddw     %%mm1, %%mm1    \n\t"                                \
        "paddw     %%mm3, %%mm1    \n\t" /* 3* */                       \
        "movd      8+"A2", %%mm3   \n\t"                                \
        DO_UNPACK("%%mm3")                                              \
        "pmullw    %%mm6, %%mm3    \n\t" /* *18 */                      \
        "psubw     %%mm1, %%mm3    \n\t" /*18,-3 */                     \
        "movd      8+"A3", %%mm1   \n\t"                                \
        DO_UNPACK("%%mm1")                                              \
        "pmullw    %%mm5, %%mm1    \n\t" /* *53 */                      \
        "paddw     %%mm1, %%mm3    \n\t" /*53,18,-3 */                  \
        "movd      8+"A4", %%mm1   \n\t"                                \
        DO_UNPACK("%%mm1")                                              \
        "psllw     $2, %%mm1       \n\t" /* 4* */                       \
        "psubw     %%mm1, %%mm3    \n\t"                                \
        "paddw     %%mm7, %%mm3    \n\t"                                \
        "psraw     %6, %%mm3       \n\t"                                \
        "movq      %%mm3, 16(%2)   \n\t"                                \
        "add       %3, %1          \n\t"                                \
        "add       $24, %2         \n\t"                                \
        "decl      %0              \n\t"                                \
        "jnz 1b                    \n\t"                                \
        : "+r"(h), "+r" (src), "+r" (dst)                               \
        : "r"(src_stride), "r"(3*src_stride),                           \
          "m"(rnd), "m"(shift)                                          \
        : "memory"                                                      \
    );                                                                  \
}

/**
 * Macro to build the horizontal 16bits version of vc1_put_shift[13].
 * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
 *
 * @param  NAME   Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)        \
static void                                                             \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride,    \
                                       const int16_t *src, int rnd)     \
{                                                                       \
    int h = 8;                                                          \
    src -= 1;                                                           \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */                        \
    __asm__ volatile(                                                   \
        LOAD_ROUNDER_MMX("%4")                                          \
        "movq      "MANGLE(ff_pw_18)", %%mm6   \n\t"                    \
        "movq      "MANGLE(ff_pw_53)", %%mm5   \n\t"                    \
        ".p2align 3                \n\t"                                \
        "1:                        \n\t"                                \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4)      \
        NORMALIZE_MMX("$7")                                             \
        /* Remove bias */                                               \
        "paddw     "MANGLE(ff_pw_128)", %%mm3  \n\t"                    \
        "paddw     "MANGLE(ff_pw_128)", %%mm4  \n\t"                    \
        TRANSFER_DO_PACK(OP)                                            \
        "add       $24, %1         \n\t"                                \
        "add       %3, %2          \n\t"                                \
        "decl      %0              \n\t"                                \
        "jnz 1b                    \n\t"                                \
        : "+r"(h), "+r" (src), "+r" (dst)                               \
        : "r"(stride), "m"(rnd)                                         \
        : "memory"                                                      \
    );                                                                  \
}

/**
 * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (offset) and %4 (3*offset).
 *
 * @param  NAME   Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)             \
static void                                                             \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src,         \
                              x86_reg stride, int rnd, x86_reg offset)  \
{                                                                       \
    int h = 8;                                                          \
    src -= offset;                                                      \
    rnd = 32-rnd;                                                       \
    __asm__ volatile (                                                  \
        LOAD_ROUNDER_MMX("%6")                                          \
        "movq      "MANGLE(ff_pw_53)", %%mm5       \n\t"                \
        "movq      "MANGLE(ff_pw_18)", %%mm6       \n\t"                \
        ".p2align 3                \n\t"                                \
        "1:                        \n\t"                                \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4)        \
        NORMALIZE_MMX("$6")                                             \
        TRANSFER_DO_PACK(OP)                                            \
        "add       %5, %1          \n\t"                                \
        "add       %5, %2          \n\t"                                \
        "decl      %0              \n\t"                                \
        "jnz 1b                    \n\t"                                \
        : "+r"(h), "+r" (src), "+r" (dst)                               \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd)             \
        : "memory"                                                      \
    );                                                                  \
}

/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

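/*
 * Editor's note: illustrative scalar equivalent (not in the original file) of
 * the put_vc1_shift1_mmx function generated above, assuming the VC-1 bicubic
 * 1/4-shift taps (-4, 53, 18, -3)/64; the 3/4-shift case mirrors the taps.
 * The function name is illustrative only.
 */
static inline void vc1_put_shift1_c_sketch(uint8_t *dst, const uint8_t *src,
                                           int stride, int rnd, int offset)
{
    int x, y;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            int v = (-4 * src[x - offset] + 53 * src[x] + 18 * src[x + offset]
                     - 3 * src[x + 2 * offset] + 32 - rnd) >> 6;
            dst[x] = av_clip_uint8(v);
        }
        src += stride;
        dst += stride;
    }
}
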
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);

/**
 * Interpolate fractional pel values by applying the proper vertical then
 * horizontal filter.
 *
 * @param  dst     Destination buffer for interpolated pels.
 * @param  src     Source buffer.
 * @param  stride  Stride for both src and dst buffers.
 * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
 * @param  vmode   Vertical filter.
 * @param  rnd     Rounding bias.
 */
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
         { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
         { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0         \n\t"\
        ::: "memory"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int              shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int              r;\
            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
}

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)

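/*
 * Editor's note: an illustrative trace (not in the original file) of the
 * two-pass path selected by put_vc1_mspel_mc() for hmode=1, vmode=2: with the
 * shift table { 0, 5, 1, 5 } this gives shift = (5+1)>>1 = 3 and
 * r = (1<<2) + rnd - 1.  The helper name is hypothetical.
 */
static inline void vc1_mspel_mc12_trace_sketch(uint8_t *dst, const uint8_t *src,
                                               int stride, int rnd)
{
    DECLARE_ALIGNED(16, int16_t, tmp)[12*8];
    /* mm0 must be zero for the punpcklbw unpacking in the helpers */
    __asm__ volatile("pxor %%mm0, %%mm0 \n\t" ::: "memory");
    /* vertical 1/2-shift pre-filter into the 16-bit 8x12 buffer */
    vc1_put_ver_16b_shift2_mmx(tmp, src-1, stride, (1<<2) + rnd - 1, 3);
    /* horizontal 1/4-shift bicubic filter from the buffer to dst */
    put_vc1_hor_16b_shift1_mmx(dst, stride, tmp+1, 64-rnd);
}
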
/** Macro to ease declaration of the bicubic filter interpolation functions */
#define DECLARE_FUNCTION(a, b)                                          \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
     put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)

static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd          %0, %%mm2 \n\t"
        "movd          %1, %%mm3 \n\t"
        "movd          %2, %%mm4 \n\t"
        "movd          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movd       %%mm2, %0    \n\t"
        "movd       %%mm3, %1    \n\t"
        "movd       %%mm4, %2    \n\t"
        "movd       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

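/*
 * Editor's note: a scalar sketch (not in the original file) of the DC-only
 * path above.  The two rescaling steps fold the row and column transforms of
 * a DC-only 4x4 block into one value that is added, with clipping, to the
 * destination; the MMX code does the same via paddusb/psubusb with |dc|.
 * The function name is illustrative only.
 */
static inline void vc1_inv_trans_4x4_dc_c_sketch(uint8_t *dest, int linesize, DCTELEM *block)
{
    int i, j;
    int dc = block[0];
    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;
    for (i = 0; i < 4; i++) {
        for (j = 0; j < 4; j++)
            dest[j] = av_clip_uint8(dest[j] + dc);
        dest += linesize;
    }
}
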
static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd          %0, %%mm2 \n\t"
        "movd          %1, %%mm3 \n\t"
        "movd          %2, %%mm4 \n\t"
        "movd          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movd       %%mm2, %0    \n\t"
        "movd       %%mm3, %1    \n\t"
        "movd       %%mm4, %2    \n\t"
        "movd       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movd          %0, %%mm2 \n\t"
        "movd          %1, %%mm3 \n\t"
        "movd          %2, %%mm4 \n\t"
        "movd          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movd       %%mm2, %0    \n\t"
        "movd       %%mm3, %1    \n\t"
        "movd       %%mm4, %2    \n\t"
        "movd       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq          %0, %%mm2 \n\t"
        "movq          %1, %%mm3 \n\t"
        "movq          %2, %%mm4 \n\t"
        "movq          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movq       %%mm2, %0    \n\t"
        "movq       %%mm3, %1    \n\t"
        "movq       %%mm4, %2    \n\t"
        "movq       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq          %0, %%mm2 \n\t"
        "movq          %1, %%mm3 \n\t"
        "movq          %2, %%mm4 \n\t"
        "movq          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movq       %%mm2, %0    \n\t"
        "movq       %%mm3, %1    \n\t"
        "movq       %%mm4, %2    \n\t"
        "movq       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movq          %0, %%mm2 \n\t"
        "movq          %1, %%mm3 \n\t"
        "movq          %2, %%mm4 \n\t"
        "movq          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movq       %%mm2, %0    \n\t"
        "movq       %%mm3, %1    \n\t"
        "movq       %%mm4, %2    \n\t"
        "movq       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

#define LOOP_FILTER(EXT) \
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
\
static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_v_loop_filter8_ ## EXT(src,   stride, pq); \
    ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
} \
\
static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_h_loop_filter8_ ## EXT(src,          stride, pq); \
    ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
}

#if HAVE_YASM
LOOP_FILTER(mmx)
LOOP_FILTER(mmx2)
LOOP_FILTER(sse2)
LOOP_FILTER(ssse3)

void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);

static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
{
    ff_vc1_h_loop_filter8_sse4(src,          stride, pq);
    ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}

#endif

void ff_put_vc1_chroma_mc8_mmx_nornd  (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_mmx2_nornd (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_3dnow_nornd(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
|
731 |
{ |
732 |
int mm_flags = av_get_cpu_flags();
|
733 |
|
734 |
if (mm_flags & AV_CPU_FLAG_MMX) {
|
735 |
dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
|
736 |
dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
|
737 |
dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
|
738 |
dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
|
739 |
|
740 |
dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
|
741 |
dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
|
742 |
dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
|
743 |
dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
|
744 |
|
745 |
dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
|
746 |
dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
|
747 |
dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
|
748 |
dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
|
749 |
|
750 |
dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
|
751 |
dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
|
752 |
dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
|
753 |
dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
|
754 |
} |
755 |
|
756 |
if (mm_flags & AV_CPU_FLAG_MMX2){
|
757 |
dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2;
|
758 |
dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2;
|
759 |
dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2;
|
760 |
dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmx2;
|
761 |
|
762 |
dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmx2;
|
763 |
dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmx2;
|
764 |
dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmx2;
|
765 |
dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmx2;
|
766 |
|
767 |
dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmx2;
|
768 |
dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmx2;
|
769 |
dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmx2;
|
770 |
dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmx2;
|
771 |
|
772 |
dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmx2;
|
773 |
dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2;
|
774 |
dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2;
|
775 |
dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2;
|
776 |
|
777 |
dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2; |
778 |
dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2; |
779 |
dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2; |
780 |
dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2; |
781 |
} |
782 |
|
783 |
#define ASSIGN_LF(EXT) \
|
784 |
dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \ |
785 |
dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_ ## EXT; \ |
786 |
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_ ## EXT; \ |
787 |
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_ ## EXT; \ |
788 |
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \ |
789 |
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT |
790 |
|
791 |
#if HAVE_YASM
|
792 |
if (mm_flags & AV_CPU_FLAG_MMX) {
|
793 |
ASSIGN_LF(mmx); |
794 |
dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd;
|
795 |
} |
796 |
return;
|
797 |
if (mm_flags & AV_CPU_FLAG_MMX2) {
|
798 |
ASSIGN_LF(mmx2); |
799 |
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd;
|
800 |
} else if (mm_flags & AV_CPU_FLAG_3DNOW) { |
801 |
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd;
|
802 |
} |
803 |
|
804 |
if (mm_flags & AV_CPU_FLAG_SSE2) {
|
805 |
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2; |
806 |
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2; |
807 |
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2; |
808 |
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2; |
809 |
} |
810 |
if (mm_flags & AV_CPU_FLAG_SSSE3) {
|
811 |
ASSIGN_LF(ssse3); |
812 |
dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd;
|
813 |
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd;
|
814 |
} |
815 |
if (mm_flags & AV_CPU_FLAG_SSE4) {
|
816 |
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4; |
817 |
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4; |
818 |
} |
819 |
#endif
|
820 |
} |