/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "dsputil_mmx.h"

#define OP_PUT(S,D)
#define OP_AVG(S,D)  "pavgb " #S ", " #D " \n\t"

/** Add rounder from mm7 to mm3 and pack result at destination */
#define NORMALIZE_MMX(SHIFT)                                    \
     "paddw     %%mm7, %%mm3           \n\t" /* +bias-r */      \
     "paddw     %%mm7, %%mm4           \n\t" /* +bias-r */      \
     "psraw     "SHIFT", %%mm3         \n\t"                    \
     "psraw     "SHIFT", %%mm4         \n\t"

#define TRANSFER_DO_PACK(OP)                    \
     "packuswb  %%mm4, %%mm3           \n\t"    \
     OP((%2), %%mm3)                            \
     "movq      %%mm3, (%2)            \n\t"

#define TRANSFER_DONT_PACK(OP)                  \
     OP(0(%2), %%mm3)                           \
     OP(8(%2), %%mm4)                           \
     "movq      %%mm3, 0(%2)           \n\t"    \
     "movq      %%mm4, 8(%2)           \n\t"

/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg)   "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

/** Compute the rounder 32-r or 8-r and unpack it to mm7 */
#define LOAD_ROUNDER_MMX(ROUND)                 \
     "movd      "ROUND", %%mm7         \n\t"    \
     "punpcklwd %%mm7, %%mm7           \n\t"    \
     "punpckldq %%mm7, %%mm7           \n\t"

#define SHIFT2_LINE(OFF, R0,R1,R2,R3)           \
     "paddw     %%mm"#R2", %%mm"#R1"   \n\t"    \
     "movd      (%0,%3), %%mm"#R0"     \n\t"    \
     "pmullw    %%mm6, %%mm"#R1"       \n\t"    \
     "punpcklbw %%mm0, %%mm"#R0"       \n\t"    \
     "movd      (%0,%2), %%mm"#R3"     \n\t"    \
     "psubw     %%mm"#R0", %%mm"#R1"   \n\t"    \
     "punpcklbw %%mm0, %%mm"#R3"       \n\t"    \
     "paddw     %%mm7, %%mm"#R1"       \n\t"    \
     "psubw     %%mm"#R3", %%mm"#R1"   \n\t"    \
     "psraw     %4, %%mm"#R1"          \n\t"    \
     "movq      %%mm"#R1", "#OFF"(%1)  \n\t"    \
     "add       %2, %0                 \n\t"

DECLARE_ALIGNED(16, const uint64_t, ff_pw_9) = 0x0009000900090009ULL;

/** Sacrificing mm6 makes it possible to pipeline loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov       $3, %%"REG_c"           \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq      "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1:                                \n\t"
        "movd      (%0), %%mm2             \n\t"
        "add       %2, %0                  \n\t"
        "movd      (%0), %%mm3             \n\t"
        "punpcklbw %%mm0, %%mm2            \n\t"
        "punpcklbw %%mm0, %%mm3            \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub       %6, %0                  \n\t"
        "add       $8, %1                  \n\t"
        "dec       %%"REG_c"               \n\t"
        "jnz 1b                            \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
        : "%"REG_c, "memory"
    );
}
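
/* Scalar equivalent of one output sample of the vertical pass above (an
 * added illustration, not part of the original code):
 *
 *   dst[x] = (9*(src[x] + src[x+stride])
 *             - src[x-stride] - src[x+2*stride] + rnd) >> shift;
 *
 * i.e. the (-1, 9, 9, -1) half-pel filter, stored as 16-bit intermediates
 * over 12 columns (3 loop iterations of 4 bytes) and 8 rows of 24 bytes. */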

/**
 * Data is already unpacked, so some operations can be performed directly
 * from memory.
 */
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
{\
    int h = 8;\
\
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1:                                \n\t"\
        "movq      2*0+0(%1), %%mm1        \n\t"\
        "movq      2*0+8(%1), %%mm2        \n\t"\
        "movq      2*1+0(%1), %%mm3        \n\t"\
        "movq      2*1+8(%1), %%mm4        \n\t"\
        "paddw     2*3+0(%1), %%mm1        \n\t"\
        "paddw     2*3+8(%1), %%mm2        \n\t"\
        "paddw     2*2+0(%1), %%mm3        \n\t"\
        "paddw     2*2+8(%1), %%mm4        \n\t"\
        "pmullw    %%mm5, %%mm3            \n\t"\
        "pmullw    %%mm5, %%mm4            \n\t"\
        "psubw     %%mm1, %%mm3            \n\t"\
        "psubw     %%mm2, %%mm4            \n\t"\
        NORMALIZE_MMX("$7")\
        /* Remove bias */\
        "paddw     %%mm6, %%mm3            \n\t"\
        "paddw     %%mm6, %%mm4            \n\t"\
        TRANSFER_DO_PACK(OP)\
        "add       $24, %1                 \n\t"\
        "add       %3, %2                  \n\t"\
        "decl      %0                      \n\t"\
        "jnz 1b                            \n\t"\
        : "+r"(h), "+r" (src), "+r" (dst)\
        : "r"(stride), "m"(rnd)\
        : "memory"\
    );\
}

VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
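
/* Scalar sketch of one output pel of the 16-bit horizontal pass above
 * (an added illustration; s points one sample before the current column,
 * and clip_uint8 denotes saturation to 0..255):
 *
 *   dst[x] = clip_uint8((9*(s[x+1] + s[x+2]) - s[x] - s[x+3] + rnd) >> 7);
 *
 * In the asm, the -16*1024 bias folded into the rounder and the final
 * ff_pw_128 addition cancel exactly after the >>7; they only re-centre the
 * intermediate word range before packing. */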

/**
 * Purely vertical or horizontal 1/2 shift interpolation.
 * Sacrifice mm6 for the *9 factor.
 */
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
                                     x86_reg stride, int rnd, x86_reg offset)\
{\
    rnd = 8-rnd;\
    __asm__ volatile(\
        "mov       $8, %%"REG_c"           \n\t"\
        LOAD_ROUNDER_MMX("%5")\
        "movq      "MANGLE(ff_pw_9)", %%mm6\n\t"\
        "1:                                \n\t"\
        "movd      0(%0   ), %%mm3         \n\t"\
        "movd      4(%0   ), %%mm4         \n\t"\
        "movd      0(%0,%2), %%mm1         \n\t"\
        "movd      4(%0,%2), %%mm2         \n\t"\
        "add       %2, %0                  \n\t"\
        "punpcklbw %%mm0, %%mm3            \n\t"\
        "punpcklbw %%mm0, %%mm4            \n\t"\
        "punpcklbw %%mm0, %%mm1            \n\t"\
        "punpcklbw %%mm0, %%mm2            \n\t"\
        "paddw     %%mm1, %%mm3            \n\t"\
        "paddw     %%mm2, %%mm4            \n\t"\
        "movd      0(%0,%3), %%mm1         \n\t"\
        "movd      4(%0,%3), %%mm2         \n\t"\
        "pmullw    %%mm6, %%mm3            \n\t" /* 0,9,9,0*/\
        "pmullw    %%mm6, %%mm4            \n\t" /* 0,9,9,0*/\
        "punpcklbw %%mm0, %%mm1            \n\t"\
        "punpcklbw %%mm0, %%mm2            \n\t"\
        "psubw     %%mm1, %%mm3            \n\t" /*-1,9,9,0*/\
        "psubw     %%mm2, %%mm4            \n\t" /*-1,9,9,0*/\
        "movd      0(%0,%2), %%mm1         \n\t"\
        "movd      4(%0,%2), %%mm2         \n\t"\
        "punpcklbw %%mm0, %%mm1            \n\t"\
        "punpcklbw %%mm0, %%mm2            \n\t"\
        "psubw     %%mm1, %%mm3            \n\t" /*-1,9,9,-1*/\
        "psubw     %%mm2, %%mm4            \n\t" /*-1,9,9,-1*/\
        NORMALIZE_MMX("$4")\
        "packuswb  %%mm4, %%mm3            \n\t"\
        OP((%1), %%mm3)\
        "movq      %%mm3, (%1)             \n\t"\
        "add       %6, %0                  \n\t"\
        "add       %4, %1                  \n\t"\
        "dec       %%"REG_c"               \n\t"\
        "jnz 1b                            \n\t"\
        : "+r"(src),  "+r"(dst)\
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
          "g"(stride-offset)\
        : "%"REG_c, "memory"\
    );\
}

VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)
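
/* Scalar sketch of one output pel of the direct 8-bit half-pel filter above
 * (an added illustration; "off" is the pel step: 1 for horizontal filtering,
 * stride for vertical):
 *
 *   dst[x] = clip_uint8((9*(src[x] + src[x+off])
 *                        - src[x-off] - src[x+2*off] + 8 - rnd) >> 4);
 */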

/**
 * Filter coefficients made global to allow access by all 1 or 3 quarter shift
 * interpolation functions.
 */
DECLARE_ASM_CONST(16, uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ASM_CONST(16, uint64_t, ff_pw_18) = 0x0012001200120012ULL;

/**
 * Core of the 1/4 and 3/4 shift bicubic interpolation.
 *
 * @param UNPACK  Macro unpacking arguments from 8 to 16 bits (can be empty).
 * @param MOVQ    "movd 1" or "movq 2", if data read is already unpacked.
 * @param A1      Address of 1st tap (beware of unpacked/packed).
 * @param A2      Address of 2nd tap
 * @param A3      Address of 3rd tap
 * @param A4      Address of 4th tap
 */
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4)       \
     MOVQ "*0+"A1", %%mm1       \n\t"                           \
     MOVQ "*4+"A1", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "pmullw    "MANGLE(ff_pw_3)", %%mm1\n\t"                   \
     "pmullw    "MANGLE(ff_pw_3)", %%mm2\n\t"                   \
     MOVQ "*0+"A2", %%mm3       \n\t"                           \
     MOVQ "*4+"A2", %%mm4       \n\t"                           \
     UNPACK("%%mm3")                                            \
     UNPACK("%%mm4")                                            \
     "pmullw    %%mm6, %%mm3    \n\t" /* *18 */                 \
     "pmullw    %%mm6, %%mm4    \n\t" /* *18 */                 \
     "psubw     %%mm1, %%mm3    \n\t" /* 18,-3 */               \
     "psubw     %%mm2, %%mm4    \n\t" /* 18,-3 */               \
     MOVQ "*0+"A4", %%mm1       \n\t"                           \
     MOVQ "*4+"A4", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "psllw     $2, %%mm1       \n\t" /* 4* */                  \
     "psllw     $2, %%mm2       \n\t" /* 4* */                  \
     "psubw     %%mm1, %%mm3    \n\t" /* -4,18,-3 */            \
     "psubw     %%mm2, %%mm4    \n\t" /* -4,18,-3 */            \
     MOVQ "*0+"A3", %%mm1       \n\t"                           \
     MOVQ "*4+"A3", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "pmullw    %%mm5, %%mm1    \n\t" /* *53 */                 \
     "pmullw    %%mm5, %%mm2    \n\t" /* *53 */                 \
     "paddw     %%mm1, %%mm3    \n\t" /* 4,53,18,-3 */          \
     "paddw     %%mm2, %%mm4    \n\t" /* 4,53,18,-3 */

/**
 * Macro to build the vertical 16-bit version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (src_stride) and %4 (3*src_stride).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)            \
static void                                                     \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src,  \
                                 x86_reg src_stride,            \
                                 int rnd, int64_t shift)        \
{                                                               \
    int h = 8;                                                  \
    src -= src_stride;                                          \
    __asm__ volatile(                                           \
        LOAD_ROUNDER_MMX("%5")                                  \
        "movq      "MANGLE(ff_pw_53)", %%mm5\n\t"               \
        "movq      "MANGLE(ff_pw_18)", %%mm6\n\t"               \
        ASMALIGN(3)                                             \
        "1:                                \n\t"                \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4)\
        NORMALIZE_MMX("%6")                                     \
        TRANSFER_DONT_PACK(OP_PUT)                              \
        /* Last 3 (in fact 4) bytes on the line */              \
        "movd      8+"A1", %%mm1           \n\t"                \
        DO_UNPACK("%%mm1")                                      \
        "movq      %%mm1, %%mm3            \n\t"                \
        "paddw     %%mm1, %%mm1            \n\t"                \
        "paddw     %%mm3, %%mm1            \n\t" /* 3* */       \
        "movd      8+"A2", %%mm3           \n\t"                \
        DO_UNPACK("%%mm3")                                      \
        "pmullw    %%mm6, %%mm3            \n\t" /* *18 */      \
        "psubw     %%mm1, %%mm3            \n\t" /*18,-3 */     \
        "movd      8+"A3", %%mm1           \n\t"                \
        DO_UNPACK("%%mm1")                                      \
        "pmullw    %%mm5, %%mm1            \n\t" /* *53 */      \
        "paddw     %%mm1, %%mm3            \n\t" /*53,18,-3 */  \
        "movd      8+"A4", %%mm1           \n\t"                \
        DO_UNPACK("%%mm1")                                      \
        "psllw     $2, %%mm1               \n\t" /* 4* */       \
        "psubw     %%mm1, %%mm3            \n\t"                \
        "paddw     %%mm7, %%mm3            \n\t"                \
        "psraw     %6, %%mm3               \n\t"                \
        "movq      %%mm3, 16(%2)           \n\t"                \
        "add       %3, %1                  \n\t"                \
        "add       $24, %2                 \n\t"                \
        "decl      %0                      \n\t"                \
        "jnz 1b                            \n\t"                \
        : "+r"(h), "+r" (src), "+r" (dst)                       \
        : "r"(src_stride), "r"(3*src_stride),                   \
          "m"(rnd), "m"(shift)                                  \
        : "memory"                                              \
    );                                                          \
}

/**
 * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
 * Here, offsets are in 16-bit units, so the parameters passed as A1 to A4
 * should be simple.
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)\
static void                                                     \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride,  \
                                 const int16_t *src, int rnd)   \
{                                                               \
    int h = 8;                                                  \
    src -= 1;                                                   \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */                \
    __asm__ volatile(                                           \
        LOAD_ROUNDER_MMX("%4")                                  \
        "movq      "MANGLE(ff_pw_18)", %%mm6   \n\t"            \
        "movq      "MANGLE(ff_pw_53)", %%mm5   \n\t"            \
        ASMALIGN(3)                                             \
        "1:                                \n\t"                \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4)\
        NORMALIZE_MMX("$7")                                     \
        /* Remove bias */                                       \
        "paddw     "MANGLE(ff_pw_128)", %%mm3  \n\t"            \
        "paddw     "MANGLE(ff_pw_128)", %%mm4  \n\t"            \
        TRANSFER_DO_PACK(OP)                                    \
        "add       $24, %1                 \n\t"                \
        "add       %3, %2                  \n\t"                \
        "decl      %0                      \n\t"                \
        "jnz 1b                            \n\t"                \
        : "+r"(h), "+r" (src), "+r" (dst)                       \
        : "r"(stride), "m"(rnd)                                 \
        : "memory"                                              \
    );                                                          \
}

/**
 * Macro to build the 8-bit, any direction, version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (offset) and %4 (3*offset).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)     \
static void                                                     \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
                        x86_reg stride, int rnd, x86_reg offset)\
{                                                               \
    int h = 8;                                                  \
    src -= offset;                                              \
    rnd = 32-rnd;                                               \
    __asm__ volatile (                                          \
        LOAD_ROUNDER_MMX("%6")                                  \
        "movq      "MANGLE(ff_pw_53)", %%mm5       \n\t"        \
        "movq      "MANGLE(ff_pw_18)", %%mm6       \n\t"        \
        ASMALIGN(3)                                             \
        "1:                                \n\t"                \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4)\
        NORMALIZE_MMX("$6")                                     \
        TRANSFER_DO_PACK(OP)                                    \
        "add       %5, %1                  \n\t"                \
        "add       %5, %2                  \n\t"                \
        "decl      %0                      \n\t"                \
        "jnz 1b                            \n\t"                \
        : "+r"(h), "+r" (src), "+r" (dst)                       \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd)     \
        : "memory"                                              \
    );                                                          \
}

/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);

/**
 * Interpolate fractional pel values by applying the proper vertical then
 * horizontal filter.
 *
 * @param  dst     Destination buffer for interpolated pels.
 * @param  src     Source buffer.
 * @param  stride  Stride for both src and dst buffers.
 * @param  hmode   Horizontal filter (expressed in quarter pixels shift).
 * @param  vmode   Vertical filter.
 * @param  rnd     Rounding bias.
 */
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
         { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
         { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0         \n\t"\
        ::: "memory"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int    shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int    r;\
            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
}

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)
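
/* Dispatch example (an added explanatory note): a call such as
 * put_vc1_mspel_mc(dst, src, stride, 1, 2, rnd) runs the half-pel vertical
 * filter (vmode=2) into the aligned 16-bit tmp buffer, then the 1/4-pel
 * horizontal filter (hmode=1) from tmp into dst; when only one direction is
 * fractional, the direct 8-bit filters are used instead. */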

void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);

/** Macro to ease bicubic filter interpolation functions declarations */
#define DECLARE_FUNCTION(a, b)                                          \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
     put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)

static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd          %0, %%mm2 \n\t"
        "movd          %1, %%mm3 \n\t"
        "movd          %2, %%mm4 \n\t"
        "movd          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movd       %%mm2, %0    \n\t"
        "movd       %%mm3, %1    \n\t"
        "movd       %%mm4, %2    \n\t"
        "movd       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
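
/* Scalar equivalent of the DC-only inverse transform above (an added
 * illustration, using av_clip_uint8 from libavutil):
 *
 *   int x, y, dc = (17 * ((17 * block[0] + 4) >> 3) + 64) >> 7;
 *   for (y = 0; y < 4; y++)
 *       for (x = 0; x < 4; x++)
 *           dest[y*linesize + x] = av_clip_uint8(dest[y*linesize + x] + dc);
 *
 * The paddusb/psubusb pair performs this signed, saturated add: mm0 holds
 * max(dc,0) in every byte and mm1 holds max(-dc,0). */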

static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd          %0, %%mm2 \n\t"
        "movd          %1, %%mm3 \n\t"
        "movd          %2, %%mm4 \n\t"
        "movd          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movd       %%mm2, %0    \n\t"
        "movd       %%mm3, %1    \n\t"
        "movd       %%mm4, %2    \n\t"
        "movd       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movd          %0, %%mm2 \n\t"
        "movd          %1, %%mm3 \n\t"
        "movd          %2, %%mm4 \n\t"
        "movd          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movd       %%mm2, %0    \n\t"
        "movd       %%mm3, %1    \n\t"
        "movd       %%mm4, %2    \n\t"
        "movd       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq          %0, %%mm2 \n\t"
        "movq          %1, %%mm3 \n\t"
        "movq          %2, %%mm4 \n\t"
        "movq          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movq       %%mm2, %0    \n\t"
        "movq       %%mm3, %1    \n\t"
        "movq       %%mm4, %2    \n\t"
        "movq       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
{
    int dc = block[0];
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq          %0, %%mm2 \n\t"
        "movq          %1, %%mm3 \n\t"
        "movq          %2, %%mm4 \n\t"
        "movq          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movq       %%mm2, %0    \n\t"
        "movq       %%mm3, %1    \n\t"
        "movq       %%mm4, %2    \n\t"
        "movq       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movq          %0, %%mm2 \n\t"
        "movq          %1, %%mm3 \n\t"
        "movq          %2, %%mm4 \n\t"
        "movq          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movq       %%mm2, %0    \n\t"
        "movq       %%mm3, %1    \n\t"
        "movq       %%mm4, %2    \n\t"
        "movq       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
    mm_flags = mm_support();

    dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;

    if (mm_flags & FF_MM_MMX2){
        dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmx2;

        dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmx2;

        dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmx2;

        dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2;
        dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2;

        dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2;
        dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2;
        dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2;
        dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2;
    }
}