ffmpeg / libavcodec / x86 / vp8dsp.asm @ 2dd2f716
History | View | Annotate | Download (28.4 KB)
1 |
;****************************************************************************** |
---|---|
2 |
;* VP8 MMXEXT optimizations |
3 |
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |
4 |
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |
5 |
;* |
6 |
;* This file is part of FFmpeg. |
7 |
;* |
8 |
;* FFmpeg is free software; you can redistribute it and/or |
9 |
;* modify it under the terms of the GNU Lesser General Public |
10 |
;* License as published by the Free Software Foundation; either |
11 |
;* version 2.1 of the License, or (at your option) any later version. |
12 |
;* |
13 |
;* FFmpeg is distributed in the hope that it will be useful, |
14 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 |
;* Lesser General Public License for more details. |
17 |
;* |
18 |
;* You should have received a copy of the GNU Lesser General Public |
19 |
;* License along with FFmpeg; if not, write to the Free Software |
20 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 |
;****************************************************************************** |
22 |
|
23 |
%include "x86inc.asm" |
24 |
%include "x86util.asm" |
25 |
|
26 |
SECTION_RODATA |
27 |
|
28 |
; Horizontal subpel coefficients stored as interleaved word pairs, the operand
; layout pmaddwd needs. Every row is 16 bytes (the MMX users read the low 8).
;
; 4-tap: two rows per filter, (F0,F1) then (F2,F3). The users address this
; table with mx*16-16 and mx*16.
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1

                     times 4 dw  -9,  93
                     times 4 dw  50,  -6

                     times 4 dw  -6,  50
                     times 4 dw  93,  -9

                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

; 6-tap: three rows per filter, (F0,F1), (F2,F3), (F4,F5). The users address
; this table with mx*24-48, mx*24-32 and mx*24-16.
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1

                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3

                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2
46 |
|
47 |
; Subpel coefficients stored as interleaved signed byte pairs, the operand
; layout pmaddubsw needs. Every row is 16 bytes. The pairing follows the byte
; shuffles / row interleaves used by the SSSE3 functions.
;
; 4-tap: two rows per filter, (F0,F3) then (F1,F2); addressed with
; idx*16-16 and idx*16.
fourtap_filter_hb_m: times 8 db  -6,  -1
                     times 8 db 123,  12

                     times 8 db  -9,  -6
                     times 8 db  93,  50

                     times 8 db  -6,  -9
                     times 8 db  50,  93

                     times 8 db  -1,  -6
                     times 8 db  12, 123

; 6-tap: three rows per filter, (F0,F5), (F1,F2), (F3,F4); addressed with
; idx*24-48, idx*24-32 and idx*24-16.
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8

                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16

                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11
65 |
|
66 |
; Vertical subpel coefficients for pmullw: one 16-byte row of eight identical
; words per tap.
;
; 4-tap: four rows (F0..F3) per filter; FILTER_V points r6 at my*32-32.
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1

                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6

                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9

                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

; 6-tap: six rows (F0..F5) per filter; FILTER_V points r6 at my*48-96.
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1

                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3

                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2
101 |
|
102 |
; Bilinear weights as words (pmullw): row k-1 holds eight copies of weight k,
; k = 1..7. The users fetch weight f at f*16-16 and its complement (8-f) at
; (8-f)*16-16.
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; Bilinear weights as byte pairs (pmaddubsw): row f-1 holds (8-f, f),
; f = 1..7, fetched at f*16-16.
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7
117 |
|
118 |
; Table base selection.
; PIC builds: every function first does "lea r11, [<table>_m]" and then indexes
; relative to r11, so all base names alias r11.
; Non-PIC builds: the base name is the table symbol itself.
%ifdef PIC
    ; word-pair tables (pmaddwd)
    %define fourtap_filter_hw  r11
    %define sixtap_filter_hw   r11
    ; byte-pair tables (pmaddubsw)
    %define fourtap_filter_hb  r11
    %define sixtap_filter_hb   r11
    ; per-tap word tables (pmullw)
    %define fourtap_filter_v   r11
    %define sixtap_filter_v    r11
    ; bilinear weights
    %define bilinear_filter_vw r11
    %define bilinear_filter_vb r11
%else
    %define fourtap_filter_hw  fourtap_filter_hw_m
    %define sixtap_filter_hw   sixtap_filter_hw_m
    %define fourtap_filter_hb  fourtap_filter_hb_m
    %define sixtap_filter_hb   sixtap_filter_hb_m
    %define fourtap_filter_v   fourtap_filter_v_m
    %define sixtap_filter_v    sixtap_filter_v_m
    %define bilinear_filter_vw bilinear_filter_vw_m
    %define bilinear_filter_vb bilinear_filter_vb_m
%endif
137 |
|
138 |
; pshufb masks that gather, for each of 8 output pixels, the two source bytes
; multiplied by one coefficient pair of the *_hb tables.
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3,  4, 4,  5,  5,  6,  6,  7,  7,  8 ; (x, x+1)
filter_h4_shuf:  db 0, 3, 1, 4, 2, 5, 3,  6, 4,  7,  5,  8,  6,  9,  7, 10 ; (x, x+3)

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3,  8, 4,  9,  5, 10,  6, 11,  7, 12 ; (x,   x+5)
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4,  5, 5,  6,  6,  7,  7,  8,  8,  9 ; (x+1, x+2)
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9,  9, 10, 10, 11 ; (x+3, x+4)

; IDCT multipliers, used with pmulhw by VP8_MULTIPLY_SUMSUB (8 bytes each,
; loaded with movq).
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

; word constants defined outside this file
cextern pw_3
cextern pw_4
cextern pw_64
151 |
|
152 |
SECTION .text |
153 |
|
154 |
;----------------------------------------------------------------------------- |
155 |
; subpel MC functions: |
156 |
; |
157 |
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, |
158 |
; uint8_t *src, int srcstride, |
159 |
; int height, int mx, int my); |
160 |
;----------------------------------------------------------------------------- |
161 |
|
162 |
; 4-pixel-wide block, H-only 4-tap filter
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx
; constants kept in registers across the loop:
;   mm4 = (F0,F1) word pairs, mm5 = (F2,F3) word pairs,
;   mm6 = 0 (unpack/pack helper), mm7 = pw_64 (added before the >>7)
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4                         ; mx*16; the 32-bit write clears the upper half of r5
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [r2-1]                    ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                       ; byte ABCD..
    punpcklbw mm1, mm6                       ; byte->word ABCD
    pshufw    mm0, mm2, 9                    ; byte CDEF..
    punpcklbw mm0, mm6                       ; byte->word CDEF
    pshufw    mm3, mm1, 0x94                 ; word ABBC
    pshufw    mm1, mm0, 0x94                 ; word CDDE
    pmaddwd   mm3, mm4                       ; multiply 2px with F0/F1
    movq      mm0, mm1                       ; backup for second set of pixels
    pmaddwd   mm1, mm5                       ; multiply 2px with F2/F3
    paddd     mm3, mm1                       ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                       ; byte->word EFGH
    pmaddwd   mm0, mm4                       ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94                 ; word EFFG
    pmaddwd   mm1, mm5                       ; multiply 2px with F2/F3
    paddd     mm0, mm1                       ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                       ; merge dword->word (4px)
    paddsw    mm3, mm7                       ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                       ; clip and word->bytes
    movd     [r0], mm3                       ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like put_vp8_pixels*_mmx do.
    dec       r4d                            ; next row
    jg .nextrow
    REP_RET
208 |
|
209 |
; 4-pixel-wide block, H-only 6-tap filter
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx
; constants kept in registers across the loop:
;   mm4 = (F0,F1), mm5 = (F2,F3), mm6 = (F4,F5) word pairs, mm7 = pw_64
; mm3 doubles as the zero register and as a temporary, so it is re-zeroed
; inside the loop before it is needed as zero again.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]                    ; mx*3, scaled by 8 below -> 24 bytes per mx step
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [r2-2]                    ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                       ; byte ABCD..
    punpcklbw mm1, mm3                       ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                  ; byte CDEF..
    punpckhbw mm2, mm3                       ; byte->word EFGH
    punpcklbw mm0, mm3                       ; byte->word CDEF
    pshufw    mm1, mm1, 0x94                 ; word ABBC
    pshufw    mm2, mm2, 0x94                 ; word EFFG
    pmaddwd   mm1, mm4                       ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94                 ; word CDDE (mm3 is no longer zero)
    movq      mm0, mm3                       ; backup for second set of pixels
    pmaddwd   mm3, mm5                       ; multiply 2px with F2/F3
    paddd     mm1, mm3                       ; add to 1st 2px cache
    movq      mm3, mm2                       ; backup for second set of pixels
    pmaddwd   mm2, mm6                       ; multiply 2px with F4/F5
    paddd     mm1, mm2                       ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                    ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                       ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                       ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                       ; add to 2nd 2px cache
    pxor      mm3, mm3                       ; restore the zero register
    punpcklbw mm2, mm3                       ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9                 ; word GHHI
    pmaddwd   mm2, mm6                       ; multiply 2px with F4/F5
    paddd     mm0, mm2                       ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                       ; merge dword->word (4px)
    paddsw    mm1, mm7                       ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                       ; clip and word->bytes
    movd     [r0], mm1                       ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like put_vp8_pixels*_mmx do.
    dec       r4d                            ; next row
    jg .nextrow
    REP_RET
265 |
|
266 |
; 8-pixel-wide block, H-only 4-tap filter
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx
; constants: m5 = (F0,F1) word pairs, m6 = (F2,F3) word pairs, m7 = 0
; Each row is done as two independent 4-pixel halves (loads at src-1 and
; src+3), merged by packssdw.
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4                         ; mx*16; the 32-bit write clears the upper half of r5
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7

.nextrow:
    ; output pixels 0..3
    movh      m0, [r2-1]
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    pmaddwd   m0, m5
    pmaddwd   m2, m6
    paddd     m0, m2

    ; output pixels 4..7: same steps on the 8 bytes starting 4 pixels later
    movh      m1, [r2+3]
    punpcklbw m1, m7        ; ABCDEFGH
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2         ; BCDEFGH
    psrldq    m3, 4         ; CDEFGH
    psrldq    m4, 6         ; DEFGH
    punpcklwd m1, m2        ; ABBCCDDE
    punpcklwd m3, m4        ; CDDEEFFG
    pmaddwd   m1, m5
    pmaddwd   m3, m6
    paddd     m1, m3

    ; merge halves, round, clip, store
    packssdw  m0, m1
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like put_vp8_pixels16_sse does.
    dec      r4d            ; next row
    jg .nextrow
    REP_RET
318 |
|
319 |
; 8-pixel-wide block, H-only 6-tap filter
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx
; r5 is turned into a pointer just past the three coefficient rows of the
; selected filter: [r5-48]=(F0,F1), [r5-32]=(F2,F3), [r5-16]=(F4,F5).
; m7 = 0. One 16-byte load at src-2 feeds both 4-pixel halves of a row.
; Lane comments name the source bytes A, B, C, ... starting at src-2.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]
    pxor      m7, m7

.nextrow:
    ; output pixels 0..3
    movu      m0, [r2-2]
    mova      m6, m0        ; byte copy for the second half
    mova      m4, m0        ; byte copy for the F4/F5 pairs
    punpcklbw m0, m7        ; words ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4         ; bytes EFGH..
    punpcklbw m4, m7        ; words EFGHIJKL
    mova      m5, m4
    psrldq    m5, 2         ; FGHIJKL
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m0, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m0, m2
    paddd     m0, m4

    ; output pixels 4..7: same steps on the bytes starting 4 pixels later
    psrldq    m6, 4
    mova      m4, m6
    punpcklbw m6, m7        ; words EFGHIJKL
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2         ; FGHIJKL
    psrldq    m2, 4         ; GHIJKL
    psrldq    m3, 6         ; HIJKL
    psrldq    m4, 4         ; bytes IJKL..
    punpcklbw m4, m7        ; words IJKLMNOP
    mova      m5, m4
    psrldq    m5, 2         ; JKLMNOP
    punpcklwd m6, m1        ; EFFGGHHI
    punpcklwd m2, m3        ; GHHIIJJK
    punpcklwd m4, m5        ; IJJKKLLM
    pmaddwd   m6, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m6, m2
    paddd     m6, m4

    ; merge halves, round, clip, store
    packssdw  m0, m6
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like put_vp8_pixels16_sse does.
    dec      r4d            ; next row
    jg .nextrow
    REP_RET
385 |
|
386 |
; 8-pixel-wide block, H-only 4-tap filter, pshufb/pmaddubsw version
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx
; constants: m2 = pw_64, m3 = shuffle gathering (x, x+3), m4 = shuffle
; gathering (x+1, x+2), m5 = (F0,F3) byte pairs, m6 = (F1,F2) byte pairs
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
    shl      r5d, 4
    mova      m2, [pw_64]
    mova      m3, [filter_h4_shuf]
    mova      m4, [filter_h6_shuf2]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow:
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3        ; outer taps: pixel pairs (x, x+3)
    pshufb    m1, m4        ; inner taps: pixel pairs (x+1, x+2)
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2        ; rounding
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like put_vp8_pixels16_sse does.
    dec      r4d            ; next row
    jg .nextrow
    REP_RET
416 |
|
417 |
; 8-pixel-wide block, H-only 6-tap filter, pshufb/pmaddubsw version
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx
; constants: m3/m4 = shuffles gathering (x, x+5) and (x+1, x+2),
; m5 = (F0,F5), m6 = (F1,F2), m7 = (F3,F4) byte pairs.
; The third shuffle and pw_64 are used straight from memory.
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
    lea      r5d, [r5*3]
    mova      m3, [filter_h6_shuf1]
    mova      m4, [filter_h6_shuf2]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m3                 ; pixel pairs (x,   x+5)
    pshufb    m1, m4                 ; pixel pairs (x+1, x+2)
    pshufb    m2, [filter_h6_shuf3]  ; pixel pairs (x+3, x+4)
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]            ; rounding
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like put_vp8_pixels16_sse does.
    dec      r4d            ; next row
    jg .nextrow
    REP_RET
451 |
|
452 |
; FILTER_V opt, width, xmmregs
; Emits put_vp8_epel<width>_v4_<opt> and put_vp8_epel<width>_v6_<opt>.
;   %1 = function name suffix, %2 = block width used in the name,
;   %3 = XMM register count passed on to cglobal.
; The register width (MMX/XMM) comes from the INIT_MMX/INIT_XMM in effect at
; the point of instantiation.
; args of the emitted functions:
;   r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx, r6=my
%macro FILTER_V 3
; %2-pixel-wide block, V-only 4-tap filter
; Source rows are kept unpacked to words in m0..m2 and rotated every
; iteration, so each output row costs one new row load.
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32] ; r6 -> F0 row; F1..F3 at +16/+32/+48
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [r6+48]                  ; F3

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4                       ; keep the new row for the next iteration
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like the put_vp8_pixels* functions do.
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET


; %2-pixel-wide block, V-only 6-tap filter
; Source rows are kept unpacked to words in m0..m4 and rotated every iteration.
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]  ; r6 -> F0 row; F1..F5 at +16..+80
    pxor      m7, m7

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like the put_vp8_pixels* functions do.
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
575 |
|
576 |
; 8-pixel-wide block, V-only 4-tap filter, pmaddubsw version
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx, r6=my
; constants: m5 = (F0,F3) byte pairs, m6 = (F1,F2) byte pairs, m7 = pw_64
; Source rows stay packed as bytes in m0..m2 and are rotated every iteration.
cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
    shl      r6d, 4
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m3                       ; interleave first and last row (F0,F3)
    punpcklbw m1, m2                       ; interleave the two middle rows (F1,F2)
    pmaddubsw m4, m5
    pmaddubsw m1, m6
    paddsw    m4, m1
    mova      m1, m2
    paddsw    m4, m7                       ; rounding
    mova      m2, m3
    psraw     m4, 7
    packuswb  m4, m4
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like put_vp8_pixels16_sse does.
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
614 |
|
615 |
; 8-pixel-wide block, V-only 6-tap filter, pmaddubsw version
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx, r6=my
; r6 is turned into a pointer just past the three coefficient rows of the
; selected filter: [r6-48]=(F0,F5), [r6-32]=(F1,F2), [r6-16]=(F3,F4).
; Source rows stay packed as bytes in m0..m4 and are rotated every iteration.
cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
    lea      r6d, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5                       ; rows 0 and 5 (F0,F5)
    mova      m0, m1
    punpcklbw m1, m2                       ; rows 1 and 2 (F1,F2)
    mova      m7, m3
    punpcklbw m7, m4                       ; rows 3 and 4 (F3,F4)
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    paddsw    m6, [pw_64]                  ; rounding
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like put_vp8_pixels16_sse does.
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
661 |
|
662 |
; FILTER_BILINEAR opt, width, xmmregs
; Emits put_vp8_bilinear<width>_v_<opt> and put_vp8_bilinear<width>_h_<opt>.
;   %1 = function name suffix (also selects the store sequence),
;   %2 = block width used in the name, %3 = XMM register count for cglobal.
; args of the emitted functions:
;   r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx, r6=my
; Both functions produce two output rows per loop iteration.
; Rounding: the weighted sum is shifted right by 2, then pavgw against zero
; computes (x+1)>>1, which together is a rounded shift by 3.
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16                     ; mx is not needed here; r5 becomes (8-my)*16
    shl      r6d, 4
    sub      r5d, r6d
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16] ; weight 8-my (upper row)
    mova      m5, [bilinear_filter_vw+r6-16] ; weight my   (lower row)
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                       ; middle row is the upper row of the 2nd output
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                       ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like the put_vp8_pixels* functions do.
    sub      r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16                     ; my is not needed here; r6 becomes (8-mx)*16
    shl      r5d, 4
    sub      r6d, r5d
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16] ; weight 8-mx (left pixel)
    mova      m5, [bilinear_filter_vw+r5-16] ; weight mx   (right pixel)
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                       ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like the put_vp8_pixels* functions do.
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7
759 |
|
760 |
; 8-pixel-wide vertical bilinear filter, pmaddubsw version
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx, r6=my
; constants: m3 = (8-my, my) byte pairs, m4 = 0
; Two output rows per iteration. Rounding is >>2 followed by pavgw against
; zero, i.e. (x+1)>>1.
cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
    shl      r6d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                       ; rows 0/1 interleaved -> output row 0
    punpcklbw m1, m2                       ; rows 1/2 interleaved -> output row 1
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like put_vp8_pixels16_sse does.
    sub      r4d, 2
    jg .nextrow
    REP_RET
788 |
|
789 |
; 8-pixel-wide horizontal bilinear filter, pshufb/pmaddubsw version
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx, r6=my
; constants: m2 = shuffle gathering (x, x+1), m3 = (8-mx, mx) byte pairs,
; m4 = 0
; Two output rows per iteration. Each row is fetched with a 16-byte unaligned
; load, of which the shuffle uses bytes 0..8.
cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
    shl      r5d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                       ; (x+1)>>1, completes the rounded >>3
    pavgw     m1, m4
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; height is a C int, so only the low 32 bits of r4 are defined on x86-64;
    ; count on r4d like put_vp8_pixels16_sse does.
    sub      r4d, 2
    jg .nextrow
    REP_RET
817 |
|
818 |
; Plain copy of an 8-byte-wide block, two rows per iteration.
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height
cglobal put_vp8_pixels8_mmx, 5,5
.copy_two_rows:
    movq      mm0, [r2]
    movq      mm1, [r2+r3]
    lea        r2, [r2+r3*2]
    movq     [r0], mm0
    movq  [r0+r1], mm1
    lea        r0, [r0+r1*2]
    sub       r4d, 2                       ; two rows done
    jg .copy_two_rows
    REP_RET
829 |
|
830 |
; Plain copy of a 16-byte-wide block using 8-byte MMX moves, two rows per
; iteration.
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height
cglobal put_vp8_pixels16_mmx, 5,5
.copy_two_rows:
    ; load both halves of two source rows
    movq         mm0, [r2]
    movq         mm1, [r2+8]
    movq         mm2, [r2+r3]
    movq         mm3, [r2+r3+8]
    lea           r2, [r2+r3*2]
    ; write them out
    movq        [r0], mm0
    movq      [r0+8], mm1
    movq     [r0+r1], mm2
    movq   [r0+r1+8], mm3
    lea           r0, [r0+r1*2]
    sub          r4d, 2                    ; two rows done
    jg .copy_two_rows
    REP_RET
845 |
|
846 |
; Plain copy of a 16-byte-wide block, two rows per iteration.
; args: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height
; Loads are unaligned (movups); stores use movaps, so dst and deststride must
; keep every destination row 16-byte aligned.
cglobal put_vp8_pixels16_sse, 5,5,2
.copy_two_rows:
    movups      xmm0, [r2]
    movups      xmm1, [r2+r3]
    lea           r2, [r2+r3*2]
    movaps      [r0], xmm0
    movaps   [r0+r1], xmm1
    lea           r0, [r0+r1*2]
    sub          r4d, 2                    ; two rows done
    jg .copy_two_rows
    REP_RET
857 |
|
858 |
;----------------------------------------------------------------------------- |
859 |
; IDCT functions: |
860 |
; |
861 |
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
862 |
;----------------------------------------------------------------------------- |
863 |
|
864 |
; DC-only IDCT: adds (block[0]+4)>>3 to a 4x4 block of pixels with unsigned
; saturation.
; args: r0=dst, r1=block, r2=stride
; The signed DC is split into a non-negative part (mm0) and the non-negative
; part of its negation (mm1). packuswb clamps whichever one is negative to
; zero, so a saturating add of mm0 followed by a saturating subtract of mm1
; applies the DC regardless of its sign.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd      mm0, [r1]

    ; calculate DC and -DC, each broadcast to 4 bytes
    paddw     mm0, [pw_4]
    pxor      mm1, mm1
    psraw     mm0, 3
    psubw     mm1, mm0                     ; mm1 = -DC
    packuswb  mm0, mm0                     ; clamp to 0..255
    packuswb  mm1, mm1
    punpcklbw mm0, mm0                     ; byte -> 2 bytes
    punpcklbw mm1, mm1
    punpcklwd mm0, mm0                     ; 2 bytes -> 4 bytes
    punpcklwd mm1, mm1

    ; add DC to the four rows (r1 is reused as the pointer to row 2)
    lea        r1, [r0+r2*2]
    movd      mm2, [r0]
    movd      mm3, [r0+r2]
    movd      mm4, [r1]
    movd      mm5, [r1+r2]
    paddusb   mm2, mm0
    paddusb   mm3, mm0
    paddusb   mm4, mm0
    paddusb   mm5, mm0
    psubusb   mm2, mm1
    psubusb   mm3, mm1
    psubusb   mm4, mm1
    psubusb   mm5, mm1
    movd     [r0], mm2
    movd  [r0+r2], mm3
    movd     [r1], mm4
    movd  [r1+r2], mm5
    RET
899 |
|
900 |
; DC-only IDCT: adds (block[0]+4)>>3 to a 4x4 block of pixels.
; args: r0=dst, r1=block, r2=stride
; All 16 pixels are widened to words in xmm2/xmm4, the broadcast DC is added
; with signed word arithmetic, and packuswb clamps the result back to bytes.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       xmm0, [r1]
    lea          r1, [r0+r2*2]             ; r1 is reused as the pointer to row 2
    pxor       xmm1, xmm1
    movq       xmm2, [pw_4]

    ; calculate DC
    paddw      xmm0, xmm2
    movd       xmm2, [r0]                  ; row 0
    movd       xmm3, [r0+r2]               ; row 1
    movd       xmm4, [r1]                  ; row 2
    movd       xmm5, [r1+r2]               ; row 3
    psraw      xmm0, 3
    pshuflw    xmm0, xmm0, 0               ; broadcast DC to the low 4 words
    punpcklqdq xmm0, xmm0                  ; ... and to all 8 words
    punpckldq  xmm2, xmm3                  ; rows 0,1 in one register
    punpckldq  xmm4, xmm5                  ; rows 2,3 in one register
    punpcklbw  xmm2, xmm1                  ; byte -> word
    punpcklbw  xmm4, xmm1
    paddw      xmm2, xmm0
    paddw      xmm4, xmm0
    packuswb   xmm2, xmm4                  ; clamp, rows 0..3 as four dwords

    ; store one dword per row
    movd       [r0], xmm2
    pextrd  [r0+r2], xmm2, 1
    pextrd     [r1], xmm2, 2
    pextrd  [r1+r2], xmm2, 3
    RET
928 |
|
929 |
;----------------------------------------------------------------------------- |
930 |
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
931 |
;----------------------------------------------------------------------------- |
932 |
|
933 |
; SUMSUB a, b, tmp
; On exit: a = a + b, b = b - a(old). tmp is clobbered (holds the old a).
; Word-wise, wrapping arithmetic.
%macro SUMSUB 3
    mova      %3, %1                       ; tmp = old a
    paddw     %1, %2                       ; a  += b
    psubw     %2, %3                       ; b  -= old a
%endmacro
939 |
|
940 |
; VP8_MULTIPLY_SUMSUB a, b, tmp1, tmp2
; On exit: a = mul_35468(a) - mul_20091(b)
;          b = mul_20091(a) + mul_35468(b)      (a, b on the right = old values)
; Requires m6 = words of 20091 and m7 = words of 17734; clobbers tmp1, tmp2.
;   mul_20091(x) is formed as x + pmulhw(x, 20091)
;   mul_35468(x) is formed as pmulhw(2*x, 17734), since 35468 = 2*17734 does
;   not fit in a signed word
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6                       ; high word of 20091*a
    pmulhw    %4, m6                       ; high word of 20091*b
    paddw     %3, %1                       ; tmp1 = mul_20091(a)
    paddw     %4, %2                       ; tmp2 = mul_20091(b)
    psllw     %1, 1
    psllw     %2, 1
    pmulhw    %1, m7                       ; mul_35468(a)
    pmulhw    %2, m7                       ; mul_35468(b)
    psubw     %1, %4
    paddw     %2, %3
%endmacro
956 |
|
957 |
; VP8_IDCT_TRANSFORM4x4_1D r0, r1, r2, r3, t0, t1   (register numbers)
; One 1-D pass of the 4x4 inverse transform over four row registers:
;   x0 = %1 + %3                          x1 = %1 - %3
;   x2 = mul_35468(%2) - mul_20091(%4)    x3 = mul_20091(%2) + mul_35468(%4)
;   %1 = x0 + x3 (tmp0)   %2 = x1 + x2 (tmp1)
;   %3 = x1 - x2 (tmp2)   %4 = x0 - x3 (tmp3)
; %5/%6 are temporary registers. Requires m6/m7 = words of 20091/17734.
; The trailing SWAPs only rename registers so the results end up in %1..%4.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5          ; t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6     ; t2, t3
    SUMSUB_BA           m%4, m%3, m%5          ; tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5          ; tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro
970 |
|
971 |
; TRANSPOSE4x4 r0, r1, r2, r3, tmp   (register numbers)
; Transposes a 4x4 table of words held in four registers; %5 is scratch.
; Before the SWAPs the columns sit in %1/%4/%5/%3; the SWAPs rename the
; registers.
%macro TRANSPOSE4x4 5
    ; interleave words of rows 0/1 and rows 2/3
    mova      m%5, m%1
    punpcklwd m%1, m%2
    punpckhwd m%5, m%2
    mova      m%2, m%3
    punpcklwd m%3, m%4
    punpckhwd m%2, m%4
    ; interleave dwords to finish the columns
    mova      m%4, m%1
    punpckldq m%1, m%3                     ; col0
    punpckhdq m%4, m%3                     ; col1
    mova      m%3, m%5
    punpckldq m%5, m%2                     ; col2
    punpckhdq m%3, m%2                     ; col3
    SWAP       %4,  %2
    SWAP       %4,  %5
    SWAP       %4,  %3
%endmacro
989 |
|
990 |
; Full 4x4 inverse DCT of block[] followed by STORE_DIFFx2 into dst.
; args: r0=dst, r1=block, r2=stride
; m0..m3 hold the four coefficient rows, m6/m7 the multiplier constants the
; transform macro expects. After the transform m6/m7 are handed to
; STORE_DIFFx2 as its 3rd/4th operands and m4 is zeroed for it.
INIT_MMX
cglobal vp8_idct_add_mmx, 3, 3
    ; load block data and constants
    movq         m0, [r1]
    movq         m1, [r1+8]
    movq         m2, [r1+16]
    movq         m3, [r1+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]

    ; actual IDCT: 1-D pass, transpose, add pw_4 to m0, 1-D pass, transpose
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store: rows 0/1 at r0, rows 2/3 at r1 = r0 + 2*stride
    ; NOTE(review): STORE_DIFFx2 is defined outside this file; the literal 3
    ; is presumably the final right shift -- confirm in x86util.asm.
    pxor         m4, m4
    lea          r1, [r0+2*r2]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
1014 |
|
1015 |
;----------------------------------------------------------------------------- |
1016 |
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |
1017 |
;----------------------------------------------------------------------------- |
1018 |
|
1019 |
; SCATTER_WHT n
; Stores word n of m0, m1, m2, m3 to [r0], [r0+32], [r0+64], [r0+96], i.e. at
; a stride of 2*16 bytes.
; Clobbers r1 and r2.
%macro SCATTER_WHT 1
    pextrw           r1d, m0, %1
    pextrw           r2d, m1, %1
    mov  [r0+2*16*0], r1w
    mov  [r0+2*16*1], r2w
    pextrw           r1d, m2, %1
    pextrw           r2d, m3, %1
    mov  [r0+2*16*2], r1w
    mov  [r0+2*16*3], r2w
%endmacro
1029 |
|
1030 |
; HADAMARD4_1D r0, r1, r2, r3   (register numbers)
; Two SUMSUB_BADC butterfly stages over four registers, followed by a register
; rename via SWAP.
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP         %1,  %4,  %3
%endmacro
1035 |
|
1036 |
; Walsh-Hadamard transform of the 16 luma DC values, with the results
; scattered into the coefficient blocks.
; args: r0=block, r1=dc          (r1 and r2 are reused as scratch afterwards)
; Sequence: 1-D Hadamard, transpose, add pw_3 to m0, 1-D Hadamard, >>3, then
; four SCATTER_WHT calls, each writing 4 words at a 32-byte stride, with r0
; advanced by 2*16*4 bytes between them.
INIT_MMX
cglobal vp8_luma_dc_wht_mmxext, 2,3
    ; load the 4x4 DC values
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]

    ; two transform passes
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3

    ; scale
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3

    ; write out, one word index per group of four blocks
    SCATTER_WHT   0
    add           r0, 2*16*4
    SCATTER_WHT   1
    add           r0, 2*16*4
    SCATTER_WHT   2
    add           r0, 2*16*4
    SCATTER_WHT   3
    RET