ffmpeg / libavcodec / x86 / vp8dsp.asm @ f2a30bd8
History | View | Annotate | Download (37.9 KB)
1 |
;****************************************************************************** |
---|---|
2 |
;* VP8 MMXEXT optimizations |
3 |
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |
4 |
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |
5 |
;* |
6 |
;* This file is part of FFmpeg. |
7 |
;* |
8 |
;* FFmpeg is free software; you can redistribute it and/or |
9 |
;* modify it under the terms of the GNU Lesser General Public |
10 |
;* License as published by the Free Software Foundation; either |
11 |
;* version 2.1 of the License, or (at your option) any later version. |
12 |
;* |
13 |
;* FFmpeg is distributed in the hope that it will be useful, |
14 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 |
;* Lesser General Public License for more details. |
17 |
;* |
18 |
;* You should have received a copy of the GNU Lesser General Public |
19 |
;* License along with FFmpeg; if not, write to the Free Software |
20 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 |
;****************************************************************************** |
22 |
|
23 |
%include "x86inc.asm" |
24 |
%include "x86util.asm" |
25 |
|
26 |
SECTION_RODATA

; VP8 sub-pixel MC filter coefficients. Each table holds one row of taps per
; fractional position; the MC functions below index by mx/my (values 2/4/6
; select the 4-tap filters, odd values the 6-tap filters).
;
; 4-tap filters as words, interleaved (F0,F1)/(F2,F3) for pmaddwd use.
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

; 6-tap filters as words, interleaved (F0,F1)/(F2,F3)/(F4,F5) for pmaddwd.
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

; 4-tap filters as bytes, interleaved (F0,F1)/(F2,F3) for pmaddubsw (SSSE3).
fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

; 6-tap filters as bytes; the pairing (F0,F5)/(F1,F2)/(F3,F4) matches the
; row interleaving done in the SSSE3 v6 loop below (pmaddubsw operands).
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

; 4-tap filters as words, one tap broadcast per 16-byte row (for pmullw).
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

; 6-tap filters as words, one tap broadcast per 16-byte row (for pmullw).
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

; bilinear weights as words (1..7); the complementary weight 8-n is derived
; at runtime by the FILTER_BILINEAR functions.
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; bilinear weight pairs (8-n, n) as bytes, for pmaddubsw (SSSE3).
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

%ifdef PIC
; In PIC builds the table base is loaded into r11 with a RIP-relative lea
; before each use (see "lea r11, [..._m]" in the functions below).
%define fourtap_filter_hw   r11
%define sixtap_filter_hw    r11
%define fourtap_filter_hb   r11
%define sixtap_filter_hb    r11
%define fourtap_filter_v    r11
%define sixtap_filter_v     r11
%define bilinear_filter_vw  r11
%define bilinear_filter_vb  r11
%else
; Non-PIC: address the tables directly.
%define fourtap_filter_hw   fourtap_filter_hw_m
%define sixtap_filter_hw    sixtap_filter_hw_m
%define fourtap_filter_hb   fourtap_filter_hb_m
%define sixtap_filter_hb    sixtap_filter_hb_m
%define fourtap_filter_v    fourtap_filter_v_m
%define sixtap_filter_v     sixtap_filter_v_m
%define bilinear_filter_vw  bilinear_filter_vw_m
%define bilinear_filter_vb  bilinear_filter_vb_m
%endif

; pshufb control masks producing the pixel pairings consumed by pmaddubsw.
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

; IDCT rotation constants (used by VP8_MULTIPLY_SUMSUB via pmulhw).
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

; shared constants defined elsewhere in libavcodec's x86 constant pool
cextern pw_3
cextern pb_3
cextern pw_4
cextern pb_4
cextern pw_64
cextern pb_80
cextern pb_F8
cextern pb_FE
156 |
|
157 |
SECTION .text |
158 |
|
159 |
;----------------------------------------------------------------------------- |
160 |
; subpel MC functions: |
161 |
; |
162 |
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, |
163 |
; uint8_t *src, int srcstride, |
164 |
; int height, int mx, int my); |
165 |
;----------------------------------------------------------------------------- |
166 |
|
167 |
;------------------------------------------------------------------------------
; FILTER_SSSE3 %1=block_width(4/8), %2=xmm-count for h6/v4/v6, %3=xmm-count h4
; Generates put_vp8_epel%1_{h6,h4,v4,v6}_ssse3.
; Args (per prototype above): r0=dst, r1=dststride, r2=src, r3=srcstride,
; r4=height, r5=mx, r6=my. Uses byte filters + pmaddubsw.
;------------------------------------------------------------------------------
%macro FILTER_SSSE3 3
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    lea      r5d, [r5*3]                    ; mx*3 -> row group in 6tap table
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow
    movu      m0, [r2-2]                    ; pixels src[-2..], unaligned
    mova      m1, m0
    mova      m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [r2+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]                   ; round: +64 before >>7
    psraw     m0, 7
    packuswb  m0, m0                        ; clip to u8
    movh    [r0], m0                        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    shl      r5d, 4                         ; mx*16 -> row in 4tap table
    mova      m2, [pw_64]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3                        ; pair pixels (i,i+1) for F0/F1
    pshufb    m1, m4                        ; pair pixels (i+2,i+3) for F2/F3
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2                        ; rounding
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0                        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    shl      r6d, 4                         ; my*16 -> row in 4tap table
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3                        ; start one row above dst row
    movh      m0, [r2]
    movh      m1, [r2+ r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow
    movh      m3, [r2+2*r3]                 ; read new row
    mova      m4, m0
    mova      m0, m1                        ; shift the 4-row window down
    punpcklbw m4, m1                        ; rows 0/1 interleaved -> F0/F1
    mova      m1, m2
    punpcklbw m2, m3                        ; rows 2/3 interleaved -> F2/F3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    paddsw    m4, m7                        ; rounding
    psraw     m4, 7
    packuswb  m4, m4
    movh    [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                           ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea      r6d, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]   ; r6 = end of this filter's rows

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3                        ; start two rows above dst row
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow
    movh      m5, [r2+2*r3]                 ; read new row
    mova      m6, m0
    punpcklbw m6, m5                        ; rows 0/5 -> (F0,F5) byte pairs
    mova      m0, m1                        ; shift the 6-row window down
    punpcklbw m1, m2                        ; rows 1/2 -> (F1,F2)
    mova      m7, m3
    punpcklbw m7, m4                        ; rows 3/4 -> (F3,F4)
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    paddsw    m6, [pw_64]                   ; rounding
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                            ; next row
    jg .nextrow
    REP_RET
%endmacro
326 |
|
327 |
; Instantiate the SSSE3 subpel functions: MMX registers for the 4px-wide
; variants, XMM for the 8px-wide ones (last two args = xmm register counts
; passed through to cglobal; 0 for MMX where they are unused).
INIT_MMX
FILTER_SSSE3 4, 0, 0
INIT_XMM
FILTER_SSSE3 8, 8, 7
331 |
|
332 |
; 4x4 block, H-only 4-tap filter
;------------------------------------------------------------------------------
; put_vp8_epel4_h4_mmxext(dst=r0, dststride=r1, src=r2, srcstride=r3,
;                         height=r4, mx=r5, my=r6)
; 4px-wide horizontal 4-tap subpel filter; r5*16 selects the word filter row.
; Works on 2 pixels at a time with pmaddwd, then merges to 4.
;------------------------------------------------------------------------------
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6                       ; mm6 = 0, for byte->word unpack

.nextrow
    movq      mm1, [r2-1]          ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1             ; byte ABCD..
    punpcklbw mm1, mm6             ; byte->word ABCD
    pshufw    mm0, mm2, 9          ; byte CDEF..
    punpcklbw mm0, mm6             ; byte->word CDEF
    pshufw    mm3, mm1, 0x94       ; word ABBC
    pshufw    mm1, mm0, 0x94       ; word CDDE
    pmaddwd   mm3, mm4             ; multiply 2px with F0/F1
    movq      mm0, mm1             ; backup for second set of pixels
    pmaddwd   mm1, mm5             ; multiply 2px with F2/F3
    paddd     mm3, mm1             ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6             ; byte->word EFGH
    pmaddwd   mm0, mm4             ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94       ; word EFFG
    pmaddwd   mm1, mm5             ; multiply 2px with F2/F3
    paddd     mm0, mm1             ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0             ; merge dword->word (4px)
    paddsw    mm3, mm7             ; rounding (+64 before >>7)
    psraw     mm3, 7
    packuswb  mm3, mm6             ; clip and word->bytes
    movd     [r0], mm3             ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                   ; next row
    jg .nextrow
    REP_RET
378 |
|
379 |
; 4x4 block, H-only 6-tap filter
;------------------------------------------------------------------------------
; put_vp8_epel4_h6_mmxext(dst=r0, dststride=r1, src=r2, srcstride=r3,
;                         height=r4, mx=r5, my=r6)
; 4px-wide horizontal 6-tap subpel filter; r5*3*8 indexes a 3-row (48-byte)
; group in sixtap_filter_hw. 2 pixels at a time via pmaddwd.
;------------------------------------------------------------------------------
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3                        ; zero, for byte->word unpack

.nextrow
    movq      mm1, [r2-2]          ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1             ; byte ABCD..
    punpcklbw mm1, mm3             ; byte->word ABCD
    pshufw    mm0, mm2, 0x9        ; byte CDEF..
    punpckhbw mm2, mm3             ; byte->word EFGH
    punpcklbw mm0, mm3             ; byte->word CDEF
    pshufw    mm1, mm1, 0x94       ; word ABBC
    pshufw    mm2, mm2, 0x94       ; word EFFG
    pmaddwd   mm1, mm4             ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94       ; word CDDE
    movq      mm0, mm3             ; backup for second set of pixels
    pmaddwd   mm3, mm5             ; multiply 2px with F2/F3
    paddd     mm1, mm3             ; add to 1st 2px cache
    movq      mm3, mm2             ; backup for second set of pixels
    pmaddwd   mm2, mm6             ; multiply 2px with F4/F5
    paddd     mm1, mm2             ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]          ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4             ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5             ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3             ; add to 2nd 2px cache
    pxor      mm3, mm3             ; re-zero (mm3 was used as scratch above)
    punpcklbw mm2, mm3             ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9       ; word GHHI
    pmaddwd   mm2, mm6             ; multiply 2px with F4/F5
    paddd     mm0, mm2             ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0             ; merge dword->word (4px)
    paddsw    mm1, mm7             ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3             ; clip and word->bytes
    movd     [r0], mm1             ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                   ; next row
    jg .nextrow
    REP_RET
435 |
|
436 |
; 8px-wide block, H-only 4-tap filter (SSE2)
;------------------------------------------------------------------------------
; put_vp8_epel8_h4_sse2(dst=r0, dststride=r1, src=r2, srcstride=r3,
;                       height=r4, mx=r5, my=r6)
; Computes 8 output pixels per row as two 4px halves via pmaddwd.
;------------------------------------------------------------------------------
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7                        ; zero, for unpack and final clip

.nextrow
    ; first 4 output pixels, from src[-1..]
    movh      m0, [r2-1]
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    pmaddwd   m0, m5
    pmaddwd   m2, m6
    paddd     m0, m2

    ; second 4 output pixels, from src[+3..] (same pattern, shifted by 4)
    movh      m1, [r2+3]
    punpcklbw m1, m7        ; ABCDEFGH
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2         ; BCDEFGH
    psrldq    m3, 4         ; CDEFGH
    psrldq    m4, 6         ; DEFGH
    punpcklwd m1, m2        ; ABBCCDDE
    punpcklwd m3, m4        ; CDDEEFFG
    pmaddwd   m1, m5
    pmaddwd   m3, m6
    paddd     m1, m3

    ; merge halves, round, clip, store
    packssdw  m0, m1
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET
488 |
|
489 |
;------------------------------------------------------------------------------
; put_vp8_epel8_h6_sse2(dst=r0, dststride=r1, src=r2, srcstride=r3,
;                       height=r4, mx=r5, my=r6)
; 8px-wide horizontal 6-tap subpel filter; r5 is repurposed as a pointer to
; the end of this mx's 48-byte filter group. Two 4px halves per row.
;------------------------------------------------------------------------------
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]
    pxor      m7, m7                        ; zero, for unpack/clip

.nextrow
    movu      m0, [r2-2]
    mova      m6, m0                        ; keep raw bytes for 2nd half
    mova      m4, m0
    punpcklbw m0, m7        ; ABCDEFGHI
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4         ; shift raw bytes to reach pixels E..
    punpcklbw m4, m7        ; EFGH
    mova      m5, m4
    psrldq    m5, 2         ; FGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m0, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m0, m2
    paddd     m0, m4        ; 1st 4 output pixels (dwords)

    ; second half: same pattern on raw bytes shifted by 4 pixels
    psrldq    m6, 4
    mova      m4, m6
    punpcklbw m6, m7        ; ABCDEFGHI
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4
    punpcklbw m4, m7        ; EFGH
    mova      m5, m4
    psrldq    m5, 2         ; FGH
    punpcklwd m6, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m6, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m6, m2
    paddd     m6, m4        ; 2nd 4 output pixels (dwords)

    ; merge halves, round, clip, store
    packssdw  m0, m6
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET
555 |
|
556 |
;------------------------------------------------------------------------------
; FILTER_V %1=isa-suffix, %2=block_width, %3=xmm-count
; Generates put_vp8_epel%2_{v4,v6}_%1: vertical-only subpel MC using the
; broadcast word tables (pmullw), keeping a sliding window of source rows.
; Args: r0=dst, r1=dststride, r2=src, r3=srcstride, r4=height, r5=mx, r6=my.
;------------------------------------------------------------------------------
%macro FILTER_V 3
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                    ; my*32: two 16-byte rows per tap pair
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32]
    mova      m6, [pw_64]
    pxor      m7, m7                   ; zero, for unpack/clip
    mova      m5, [r6+48]              ; tap F3, kept in a register

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+ r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]            ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [r6+0]               ; row0 * F0 (negative tap)
    pmullw    m4, m5                   ; row3 * F3 (negative tap)
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1                   ; slide window down one row
    pmullw    m1, [r6+16]              ; row1 * F1
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]              ; row2 * F2
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6                   ; +64 before >>7
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                       ; next row
    jg .nextrow
    REP_RET


; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]               ; my*48: three 16-byte row pairs
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]
    pxor      m7, m7                   ; zero, for unpack/clip

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]              ; row1 * F1 (negative tap)
    mova      m6, m4
    pmullw    m6, [r6+64]              ; row4 * F4 (negative tap)
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]            ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]               ; row0 * F0
    paddsw    m6, m0
    mova      m0, m1                   ; slide 6-row window down one
    mova      m1, m2
    pmullw    m2, [r6+32]              ; row2 * F2
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]              ; row3 * F3
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]              ; row5 * F5
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                       ; next row
    jg .nextrow
    REP_RET
%endmacro
674 |
|
675 |
; Instantiate vertical filters: MMX 4px-wide, SSE2 8px-wide.
INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2, 8, 8
679 |
|
680 |
;------------------------------------------------------------------------------
; FILTER_BILINEAR %1=isa-suffix, %2=block_width, %3=xmm-count
; Generates put_vp8_bilinear%2_{v,h}_%1: 2-tap bilinear MC with word weights
; (n and 8-n), processing two rows per loop iteration.
; Args: r0=dst, r1=dststride, r2=src, r3=srcstride, r4=height, r5=mx, r6=my.
;------------------------------------------------------------------------------
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4
    sub      r5d, r6d                  ; r5 = (8-my)*16, r6 = my*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                   ; zero, for unpack and pavgw rounding
    mova      m4, [bilinear_filter_vw+r5-16]   ; weight 8-my
    mova      m5, [bilinear_filter_vw+r6-16]   ; weight my
.nextrow
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                   ; row1 reused for both outputs
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1                   ; out row 0
    paddsw    m2, m3                   ; out row 1
    ; >>3 with rounding, done as >>2 then pavgw against zero
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2                    ; two rows done per iteration
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4
    sub      r6d, r5d                  ; r6 = (8-mx)*16, r5 = mx*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16]   ; weight 8-mx
    mova      m5, [bilinear_filter_vw+r5-16]   ; weight mx
.nextrow
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]          ; same row shifted 1px right
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
%endmacro
772 |
|
773 |
; Instantiate bilinear filters: MMX 4px-wide, SSE2 8px-wide.
INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR sse2, 8, 7
777 |
|
778 |
;------------------------------------------------------------------------------
; FILTER_BILINEAR_SSSE3 %1=block_width
; SSSE3 bilinear MC: weight pairs (8-n,n) as bytes, applied with pmaddubsw
; directly on interleaved pixel pairs (no byte->word unpack needed).
; Args: r0=dst, r1=dststride, r2=src, r3=srcstride, r4=height, r5=mx, r6=my.
;------------------------------------------------------------------------------
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                   ; zero, for pavgw rounding
    mova      m3, [bilinear_filter_vb+r6-16]   ; byte pair (8-my, my)
.nextrow
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                   ; rows 0/1 interleaved
    punpcklbw m1, m2                   ; rows 1/2 interleaved
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    ; >>3 with rounding, done as >>2 then pavgw against zero
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2                    ; two rows per iteration
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]     ; pairs pixel i with i+1
    mova      m3, [bilinear_filter_vb+r5-16]   ; byte pair (8-mx, mx)
.nextrow
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
%endmacro
851 |
|
852 |
; Instantiate SSSE3 bilinear filters: MMX 4px-wide, XMM 8px-wide.
INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8
856 |
|
857 |
;------------------------------------------------------------------------------
; put_vp8_pixels8_mmx(dst=r0, dststride=r1, src=r2, srcstride=r3, height=r4)
; Plain 8px-wide copy (fullpel, no filtering), two rows per iteration.
;------------------------------------------------------------------------------
cglobal put_vp8_pixels8_mmx, 5,5
.nextrow:
    movq           mm0, [r2+r3*0]
    movq           mm1, [r2+r3*1]
    lea             r2, [r2+r3*2]
    movq    [r0+r1*0], mm0
    movq    [r0+r1*1], mm1
    lea             r0, [r0+r1*2]
    sub            r4d, 2            ; two rows per iteration
    jg .nextrow
    REP_RET
868 |
|
869 |
;------------------------------------------------------------------------------
; put_vp8_pixels16_mmx(dst=r0, dststride=r1, src=r2, srcstride=r3, height=r4)
; Plain 16px-wide copy using two 8-byte MMX loads per row, two rows/iteration.
;------------------------------------------------------------------------------
cglobal put_vp8_pixels16_mmx, 5,5
.nextrow:
    movq           mm0, [r2+r3*0+0]
    movq           mm1, [r2+r3*0+8]
    movq           mm2, [r2+r3*1+0]
    movq           mm3, [r2+r3*1+8]
    lea             r2, [r2+r3*2]
    movq  [r0+r1*0+0], mm0
    movq  [r0+r1*0+8], mm1
    movq  [r0+r1*1+0], mm2
    movq  [r0+r1*1+8], mm3
    lea             r0, [r0+r1*2]
    sub            r4d, 2            ; two rows per iteration
    jg .nextrow
    REP_RET
884 |
|
885 |
;------------------------------------------------------------------------------
; put_vp8_pixels16_sse(dst=r0, dststride=r1, src=r2, srcstride=r3, height=r4)
; Plain 16px-wide copy, two rows per iteration. Loads are unaligned (movups);
; stores use movaps, so dst is assumed 16-byte aligned — NOTE(review): confirm
; against the caller's buffer alignment guarantees.
;------------------------------------------------------------------------------
cglobal put_vp8_pixels16_sse, 5,5,2
.nextrow:
    movups        xmm0, [r2+r3*0]
    movups        xmm1, [r2+r3*1]
    lea             r2, [r2+r3*2]
    movaps   [r0+r1*0], xmm0
    movaps   [r0+r1*1], xmm1
    lea             r0, [r0+r1*2]
    sub            r4d, 2            ; two rows per iteration
    jg .nextrow
    REP_RET
896 |
|
897 |
;----------------------------------------------------------------------------- |
898 |
; IDCT functions: |
899 |
; |
900 |
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
901 |
;----------------------------------------------------------------------------- |
902 |
|
903 |
;------------------------------------------------------------------------------
; vp8_idct_dc_add_mmx(dst=r0, block=r1, stride=r2)
; DC-only IDCT add: dc = (block[0] + 4) >> 3, broadcast and added (with
; unsigned saturation) to a 4x4 block of dst. A positive broadcast (mm0) and
; a negated one (mm1) are kept so paddusb/psubusb together implement a
; signed saturating add on unsigned pixels.
;------------------------------------------------------------------------------
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd       mm0, [r1]

    ; calculate DC
    paddw      mm0, [pw_4]
    pxor       mm1, mm1
    psraw      mm0, 3                 ; dc = (block[0]+4)>>3
    psubw      mm1, mm0               ; mm1 = -dc
    packuswb   mm0, mm0               ; clamp +dc to u8, then broadcast...
    packuswb   mm1, mm1               ; ...same for -dc
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1

    ; add DC
    lea         r1, [r0+r2*2]         ; r1 now points at dst row 2
    movd       mm2, [r0]
    movd       mm3, [r0+r2]
    movd       mm4, [r1]
    movd       mm5, [r1+r2]
    paddusb    mm2, mm0               ; +dc with unsigned saturation
    paddusb    mm3, mm0
    paddusb    mm4, mm0
    paddusb    mm5, mm0
    psubusb    mm2, mm1               ; -(-dc) likewise (one of the two is 0)
    psubusb    mm3, mm1
    psubusb    mm4, mm1
    psubusb    mm5, mm1
    movd      [r0], mm2
    movd   [r0+r2], mm3
    movd      [r1], mm4
    movd   [r1+r2], mm5
    RET
938 |
|
939 |
;------------------------------------------------------------------------------
; vp8_idct_dc_add_sse4(dst=r0, block=r1, stride=r2)
; DC-only IDCT add, SSE4.1 version: all four dst rows are gathered into one
; xmm, widened to words, offset by the (possibly negative) dc, then packed
; back with unsigned saturation and scattered with pextrd.
;------------------------------------------------------------------------------
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       xmm0, [r1]
    lea          r1, [r0+r2*2]        ; r1 now points at dst row 2
    pxor       xmm1, xmm1
    movq       xmm2, [pw_4]

    ; calculate DC
    paddw      xmm0, xmm2
    movd       xmm2, [r0]
    movd       xmm3, [r0+r2]
    movd       xmm4, [r1]
    movd       xmm5, [r1+r2]
    psraw      xmm0, 3                ; dc = (block[0]+4)>>3
    pshuflw    xmm0, xmm0, 0          ; broadcast dc to all 8 words
    punpcklqdq xmm0, xmm0
    punpckldq  xmm2, xmm3             ; rows 0/1 in one register
    punpckldq  xmm4, xmm5             ; rows 2/3
    punpcklbw  xmm2, xmm1             ; widen to words
    punpcklbw  xmm4, xmm1
    paddw      xmm2, xmm0             ; add signed dc
    paddw      xmm4, xmm0
    packuswb   xmm2, xmm4             ; clip to u8
    movd       [r0], xmm2
    pextrd  [r0+r2], xmm2, 1
    pextrd     [r1], xmm2, 2
    pextrd  [r1+r2], xmm2, 3
    RET
967 |
|
968 |
;----------------------------------------------------------------------------- |
969 |
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
970 |
;----------------------------------------------------------------------------- |
971 |
|
972 |
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
;
; pmulhw computes (a*b)>>16, so ×20091 is built as x + pmulhw(x, 20091)
; (i.e. x*(1 + 20091/65536)), and ×35468 — too large for a signed word —
; as pmulhw(2*x, 17734). %3/%4 are scratch registers.
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6              ;20091(1)
    pmulhw    %4, m6              ;20091(2)
    paddw     %3, %1              ; %3 = mul_20091(%1)
    paddw     %4, %2              ; %4 = mul_20091(%2)
    paddw     %1, %1              ; double before the 17734 multiply
    paddw     %2, %2
    pmulhw    %1, m7              ;35468(1)
    pmulhw    %2, m7              ;35468(2)
    psubw     %1, %4              ; %1 = mul_35468(%1) - mul_20091(%2)
    paddw     %2, %3              ; %2 = mul_20091(%1) + mul_35468(%2)
%endmacro
988 |
|
989 |
; calculate x0=%1+%3; x1=%1-%3
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5       ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6  ;t2, t3
    SUMSUB_BA           m%4, m%3, m%5       ;tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5       ;tmp1, tmp2
    ; restore output ordering tmp0..tmp3 into %1..%4
    SWAP                 %4, %1
    SWAP                 %4, %3
%endmacro
1002 |
|
1003 |
;------------------------------------------------------------------------------
; vp8_idct_add_mmx(dst=r0, block=r1, stride=r2)
; Full 4x4 IDCT + add: 1-D transform on columns, transpose, round (+4),
; 1-D transform again, transpose back, then STORE_DIFFx2 adds the residual
; (>>3) to dst with clipping.
;------------------------------------------------------------------------------
INIT_MMX
cglobal vp8_idct_add_mmx, 3, 3
    ; load block data
    movq         m0, [r1]
    movq         m1, [r1+8]
    movq         m2, [r1+16]
    movq         m3, [r1+24]
    movq         m6, [pw_20091]       ; constants for VP8_MULTIPLY_SUMSUB
    movq         m7, [pw_17734]

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]           ; rounding before the final >>3
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    lea          r1, [r0+2*r2]        ; block pointer no longer needed
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
1027 |
|
1028 |
;----------------------------------------------------------------------------- |
1029 |
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |
1030 |
;----------------------------------------------------------------------------- |
1031 |
|
1032 |
; Scatter word %1 of m0..m3 into the DC slot (element 0) of four consecutive
; 16-coefficient sub-blocks at r0. Clobbers r1d/r2d as scratch.
%macro SCATTER_WHT 1
    pextrw r1d, m0, %1
    pextrw r2d, m1, %1
    mov [r0+2*16*0], r1w
    mov [r0+2*16*1], r2w
    pextrw r1d, m2, %1
    pextrw r2d, m3, %1
    mov [r0+2*16*2], r1w
    mov [r0+2*16*3], r2w
%endmacro
1042 |
|
1043 |
; 1-D 4-point Hadamard (butterfly) on registers m%1..m%4, built from two
; SUMSUB_BADC passes; SWAP restores the natural output ordering.
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP         %1,  %4,  %3
%endmacro
1048 |
|
1049 |
;------------------------------------------------------------------------------
; vp8_luma_dc_wht_mmxext(block=r0, dc=r1)
; Inverse Walsh-Hadamard on the 4x4 luma DC plane: Hadamard rows, transpose,
; round (+3), Hadamard again, >>3, then scatter each result into the DC slot
; of the corresponding 4x4 sub-block of `block` (4 sub-blocks per row group).
;------------------------------------------------------------------------------
INIT_MMX
cglobal vp8_luma_dc_wht_mmxext, 2,3
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]          ; rounding before the final >>3
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0                   ; word 0 of each reg -> sub-blocks 0-3
    add           r0, 2*16*4          ; advance to next row of 4 sub-blocks
    SCATTER_WHT   1
    add           r0, 2*16*4
    SCATTER_WHT   2
    add           r0, 2*16*4
    SCATTER_WHT   3
    RET
1071 |
|
1072 |
;----------------------------------------------------------------------------- |
1073 |
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |
1074 |
;----------------------------------------------------------------------------- |
1075 |
|
1076 |
; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd      m%1, [%8+%10*4]   ; A0-3
    movd      m%5, [%9+%10*4]   ; B0-3
    movd      m%2, [%8+%10*2]   ; C0-3
    movd      m%6, [%8+%10]     ; D0-3
    movd      m%3, [%8]         ; E0-3
    movd      m%7, [%9]         ; F0-3
    movd      m%4, [%9+%11]     ; G0-3
    punpcklbw m%1, m%5          ; A/B interleaved
    movd      m%5, [%9+%11*2]   ; H0-3
    punpcklbw m%2, m%6          ; C/D interleaved
    punpcklbw m%3, m%7          ; E/F interleaved
    punpcklbw m%4, m%5          ; G/H interleaved
%endmacro
1099 |
|
1100 |
; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea       %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd      m%1, [%8+%10*4]   ; A0-3
    movd      m%3, [%12+%10*4]  ; I0-3
    movd      m%2, [%8+%10*2]   ; C0-3
    movd      m%4, [%12+%10*2]  ; K0-3
    movd      m%6, [%8+%10]     ; D0-3
    movd      m%5, [%12+%10]    ; L0-3
    movd      m%7, [%12]        ; M0-3
    add       %12, %11          ; %12 now reaches rows J/N/O/P
    punpcklbw m%1, m%3          ; A/I
    movd      m%3, [%8]         ; E0-3
    punpcklbw m%2, m%4          ; C/K
    punpcklbw m%6, m%5          ; D/L
    punpcklbw m%3, m%7          ; E/M
    punpcklbw m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd      m%5, [%9+%10*4]   ; B0-3
    movd      m%4, [%12+%10*4]  ; J0-3
    movd      m%7, [%9]         ; F0-3
    movd      m%6, [%12]        ; N0-3
    punpcklbw m%5, m%4          ; B/J
    punpcklbw m%7, m%6          ; F/N
    punpcklbw m%1, m%5          ; A/B/I/J interleaved
    punpcklbw m%3, m%7          ; E/F/M/N interleaved
    movd      m%4, [%9+%11]     ; G0-3
    movd      m%6, [%12+%11]    ; O0-3
    movd      m%5, [%9+%11*2]   ; H0-3
    movd      m%7, [%12+%11*2]  ; P0-3
    punpcklbw m%4, m%6          ; G/O
    punpcklbw m%5, m%7          ; H/P
    punpcklbw m%4, m%5          ; G/H/O/P interleaved
%endmacro
1141 |
|
1142 |
; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register); low dwords first
    movd          [%5+%7*4], m%1
    movd          [%5+%7*2], m%2
    movd          [%5],      m%3
    movd          [%6+%8],   m%4
    ; duplicate the high dword into the low half of each register
    punpckhdq     m%1, m%1
    punpckhdq     m%2, m%2
    punpckhdq     m%3, m%3
    punpckhdq     m%4, m%4
    ; write out the (former) high dwords
    movd          [%6+%7*4], m%1
    movd          [%5+%7],   m%2
    movd          [%6],      m%3
    movd          [%6+%8*2], m%4
%endmacro

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular registry in the process
%macro WRITE_4x4D 9
    ; write out (4 dwords per register), start with dwords zero
    movd          [%5+%8*4], m%1
    movd          [%5],      m%2
    movd          [%5+%9*4], m%3
    movd          [%5+%9*8], m%4

    ; store dwords 1
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd          [%6+%8*4], m%1
    movd          [%6],      m%2
    movd          [%6+%9*4], m%3
    movd          [%6+%9*8], m%4

    ; write dwords 2
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd          [%5+%8*2], m%1
    movd          [%6+%9],   m%2
    movd          [%7+%8*2], m%3
    movd          [%7+%9*2], m%4
    add            %7, %9                  ; %7 += stride, so the two stores
                                           ; below hit the next row pair

    ; store dwords 3
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd          [%5+%8],   m%1
    movd          [%6+%9*2], m%2
    movd          [%7+%8*2], m%3
    movd          [%7+%9*2], m%4
%endmacro

;-----------------------------------------------------------------------------
; VP8 "simple" loop filter (edge-only filtering of p1/p0|q0/q1).
; %1 = cpu suffix (mmx/mmxext/sse2), %2 = direction (v/h),
; %3 = number of regular registers to reserve via cglobal.
; C signature: void vp8_%2_loop_filter_simple_%1(uint8_t *dst, int stride,
;                                                int flim)
; For %2==h, pixels are transposed in/out via the READ_*/WRITE_* macros above
; and p1/q1 are spilled to (aligned) stack space across the filter core.
;-----------------------------------------------------------------------------
%macro SIMPLE_LOOPFILTER 3
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%ifidn %2, h
    mov            r5, rsp                 ; backup stack pointer
    and           rsp, ~(mmsize-1)         ; align stack
%endif
%if mmsize == 8 ; mmx/mmxext
    mov            r3, 2                   ; 2 passes of 8 pixels each
%endif

    ; splat register with "flim"
    movd           m7, r2
    punpcklbw      m7, m7
%if mmsize == 16 ; sse2
    punpcklwd      m7, m7
    pshufd         m7, m7, 0x0
%elifidn %1, mmx
    punpcklwd      m7, m7
    punpckldq      m7, m7
%else ; mmxext
    pshufw         m7, m7, 0x0
%endif

    ; set up indexes to address 4 rows
    mov            r2, r1                  ; r2 = +stride
    neg            r1                      ; r1 = -stride
%ifidn %2, h
    lea            r0, [r0+4*r2-2]         ; point 2 cols left of the edge
    sub           rsp, mmsize*2            ; (aligned) storage space for saving p1/q1
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px
%endif
%ifidn %2, v
    ; read 4 half/full rows of pixels
    mova           m0, [r0+r1*2]           ; p1
    mova           m1, [r0+r1]             ; p0
    mova           m2, [r0]                ; q0
    mova           m3, [r0+r2]             ; q1
%else ; h
    lea            r4, [r0+r2]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4

    mova         [rsp], m0                 ; store p1
    mova  [rsp+mmsize], m3                 ; store q1
%endif

    ; simple_limit: build mask of pixels with
    ; abs(p0-q0)*2 + abs(p1-q1)/2 <= flim
    mova           m5, m2                  ; m5=backup of q0
    mova           m6, m1                  ; m6=backup of p0
    psubusb        m1, m2                  ; p0-q0
    psubusb        m2, m6                  ; q0-p0
    por            m1, m2                  ; FFABS(p0-q0)
    paddusb        m1, m1                  ; m1=FFABS(p0-q0)*2

    mova           m4, m3
    mova           m2, m0
    psubusb        m3, m0                  ; q1-p1
    psubusb        m0, m4                  ; p1-q1
    por            m3, m0                  ; FFABS(p1-q1)
    mova           m0, [pb_80]
    pxor           m2, m0                  ; bias p1 to signed range
    pxor           m4, m0                  ; bias q1 to signed range
    psubsb         m2, m4                  ; m2=p1-q1 (signed) backup for below
    pand           m3, [pb_FE]             ; drop lsb so the shift below ...
    psrlq          m3, 1                   ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7
    pxor           m1, m1
    pcmpeqb        m3, m1                  ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0                  ; q0 signed
    pxor           m0, m6                  ; p0 signed
    psubsb         m5, m0                  ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5                  ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3                  ; apply filter mask (m3)

    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]              ; f1<<3=a+4
    paddsb         m1, [pb_3]              ; f2<<3=a+3
    pand           m2, m3
    pand           m1, m3                  ; cache f2<<3

    ; apply f1 to q0: q0 -= f1 (split into +/- halves since psrlq is
    ; a logical shift and signed bytes need arithmetic handling)
    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2                  ; which values are <0?
    psubb          m3, m2                  ; -f1<<3
    psrlq          m2, 3                   ; +f1
    psrlq          m3, 3                   ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3                  ; q0-f1

    ; apply f2 to p0: p0 += f2 (same split as above)
    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1                  ; which values are <0?
    psubb          m3, m1                  ; -f2<<3
    psrlq          m1, 3                   ; +f2
    psrlq          m3, 3                   ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3                  ; p0+f2

    ; store
%ifidn %2, v
    mova         [r0], m4
    mova      [r0+r1], m6
%else ; h
    mova           m0, [rsp]               ; p1
    SWAP            2, 4                   ; p0
    SWAP            1, 6                   ; q0
    mova           m3, [rsp+mmsize]        ; q1

    TRANSPOSE4x4B   0, 1, 2, 3, 4
%if mmsize == 16 ; sse2
    add            r3, r1                  ; change from r4*8*stride to r0+8*stride
    WRITE_4x4D      0, 1, 2, 3, r0, r4, r3, r1, r2
%else ; mmx/mmxext
    WRITE_4x2D      0, 1, 2, 3, r0, r4, r1, r2
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %2, v
    add            r0, 8                   ; advance 8 cols = pixels
%else ; h
    lea            r0, [r0+r2*8]           ; advance 8 rows = lines
%endif
    dec            r3
    jg .next8px
%ifidn %2, v
    REP_RET
%else ; h
    mov           rsp, r5                  ; restore stack pointer
    RET
%endif
%else ; sse2
%ifidn %2, h
    mov           rsp, r5                  ; restore stack pointer
%endif
    RET
%endif
%endmacro

; instantiate the simple loop filter for each SIMD level and direction
INIT_MMX
SIMPLE_LOOPFILTER mmx,    v, 4
SIMPLE_LOOPFILTER mmx,    h, 6
SIMPLE_LOOPFILTER mmxext, v, 4
SIMPLE_LOOPFILTER mmxext, h, 6
INIT_XMM
SIMPLE_LOOPFILTER sse2,   v, 3
SIMPLE_LOOPFILTER sse2,   h, 6