Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / vp8dsp.asm @ 888fa31e

History | View | Annotate | Download (78.4 KB)

1 0178d14f Jason Garrett-Glaser
;******************************************************************************
2
;* VP8 MMXEXT optimizations
3
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
5
;*
6 2912e87a Mans Rullgard
;* This file is part of Libav.
7 0178d14f Jason Garrett-Glaser
;*
8 2912e87a Mans Rullgard
;* Libav is free software; you can redistribute it and/or
9 0178d14f Jason Garrett-Glaser
;* modify it under the terms of the GNU Lesser General Public
10
;* License as published by the Free Software Foundation; either
11
;* version 2.1 of the License, or (at your option) any later version.
12
;*
13 2912e87a Mans Rullgard
;* Libav is distributed in the hope that it will be useful,
14 0178d14f Jason Garrett-Glaser
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
;* Lesser General Public License for more details.
17
;*
18
;* You should have received a copy of the GNU Lesser General Public
19 2912e87a Mans Rullgard
;* License along with Libav; if not, write to the Free Software
20 888fa31e Diego Biurrun
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 0178d14f Jason Garrett-Glaser
;******************************************************************************
22
23
%include "x86inc.asm"
24 004cda8e Jason Garrett-Glaser
%include "x86util.asm"
25 0178d14f Jason Garrett-Glaser
26
SECTION_RODATA
27
28
; Horizontal filter coefficients as signed words, stored as (tap, tap) pairs.
; Every row is 16 bytes (one pair repeated four times); the MMXEXT epel4
; functions read the first 8 bytes of a row with movq and multiply with
; pmaddwd.
;
; 4-tap set: two rows per filter, selected with [table + mx*16 - 16] and
; [table + mx*16], so the first filter corresponds to mx == 1.
fourtap_filter_hw_m: times 4 dw  -6, 123   ; mx == 1
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93   ; mx == 3
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50   ; mx == 5
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12   ; mx == 7
                     times 4 dw 123,  -6

; 6-tap set: three rows per filter, selected with [table + mx*24 - 48 + 16*k],
; so the first filter corresponds to mx == 2.
sixtap_filter_hw_m:  times 4 dw   2, -11   ; mx == 2
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16   ; mx == 4
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8   ; mx == 6
                     times 4 dw  36, 108
                     times 4 dw -11,   2
46
47 dcc602d8 Jason Garrett-Glaser
; Filter coefficients as signed bytes for the SSSE3 code, which multiplies
; them against interleaved pixel bytes with pmaddubsw. Every row is 16 bytes
; (one coefficient pair repeated eight times).
;
; 4-tap set: two rows per filter, addressed as [table + m*16 - 16] and
; [table + m*16] (m = mx or my), so the first filter corresponds to m == 1.
fourtap_filter_hb_m: times 8 db  -6, 123   ; m == 1
                     times 8 db  12,  -1
                     times 8 db  -9,  93   ; m == 3
                     times 8 db  50,  -6
                     times 8 db  -6,  50   ; m == 5
                     times 8 db  93,  -9
                     times 8 db  -1,  12   ; m == 7
                     times 8 db 123,  -6

; 6-tap set: three rows per filter, addressed as [table + m*24 - 48 + 16*k],
; so the first filter corresponds to m == 2. The first row of each filter
; pairs the two outermost taps; the remaining rows hold taps (1,2) and (3,4).
sixtap_filter_hb_m:  times 8 db   2,   1   ; m == 2
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3   ; m == 4
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2   ; m == 6
                     times 8 db  -8,  36
                     times 8 db 108, -11
65
66
; Filter coefficients broadcast as words: every 16-byte row holds one tap
; repeated eight times, ready for pmullw against unpacked pixels. Despite the
; "_v" name these tables are also used by the SSE2 horizontal epel8 functions.
;
; 4-tap set: four rows (64 bytes) per filter, base = table + m*32 - 32
; (m = mx or my), so the first filter corresponds to m == 1.
fourtap_filter_v_m:  times 8 dw  -6        ; m == 1
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9        ; m == 3
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6        ; m == 5
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1        ; m == 7
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

; 6-tap set: six rows (96 bytes) per filter, base = table + m*48 - 96,
; so the first filter corresponds to m == 2.
sixtap_filter_v_m:   times 8 dw   2        ; m == 2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3        ; m == 4
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1        ; m == 6
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2
101
102 a173aa89 Jason Garrett-Glaser
; Bilinear weights as broadcast words: row k-1 (16 bytes) holds the weight k,
; for k = 1..7. The bilinear functions fetch weight w with
; [table + w*16 - 16].
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; Bilinear weights as byte pairs (8-k, k) for pmaddubsw: row k-1 holds the
; pair for fraction k, fetched with [table + k*16 - 16].
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7
117
118 0178d14f Jason Garrett-Glaser
; Table aliases used in addressing expressions by the filter functions.
; Without PIC each alias is simply the table label. With PIC every alias
; expands to r11, and each function first executes `lea r11, [<table>_m]`
; for the one table it needs before using the alias.
%ifndef PIC
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw  sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb  sixtap_filter_hb_m
%define fourtap_filter_v  fourtap_filter_v_m
%define sixtap_filter_v   sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%else
%define fourtap_filter_hw    r11
%define sixtap_filter_hw     r11
%define fourtap_filter_hb    r11
%define sixtap_filter_hb     r11
%define fourtap_filter_v     r11
%define sixtap_filter_v      r11
%define bilinear_filter_vw   r11
%define bilinear_filter_vb   r11
%endif
137
138 a173aa89 Jason Garrett-Glaser
; pshufb masks that turn a run of source bytes into the byte pairs consumed
; by pmaddubsw in the SSSE3 horizontal filters.
; h2: pairs (i, i+1) starting at byte 0; h4: pairs (i, i+1) starting at byte 2.
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

; 6-tap masks: shuf1 pairs byte i with byte i+5 (the two outermost taps),
; shuf2 pairs (i+1, i+2) and shuf3 pairs (i+3, i+4).
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
144 0178d14f Jason Garrett-Glaser
145 2dd2f716 Ronald S. Bultje
; Local constants. None of them is referenced in the part of the file shown
; here; their users are presumably further down (TODO confirm).
; 8-byte rows of one repeated word each.
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

; 16-byte rows of one repeated (byte, 63) pair each.
pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db  9, 63
151
152 a711eb48 Ronald S. Bultje
; Constants defined outside this file, grouped by element size.
; Byte constants.
cextern pb_1
cextern pb_3
cextern pb_4
cextern pb_80
cextern pb_F8
cextern pb_FE
; Word constants.
cextern pw_3
cextern pw_4
cextern pw_9
cextern pw_18
cextern pw_27
cextern pw_63
cextern pw_64
165 0178d14f Jason Garrett-Glaser
166
SECTION .text
167
168
;-----------------------------------------------------------------------------
169
; subpel MC functions:
170
;
171
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
172
;                                              uint8_t *src, int srcstride,
173
;                                              int height,   int mx, int my);
174
;-----------------------------------------------------------------------------
175
176 dcc602d8 Jason Garrett-Glaser
%macro FILTER_SSSE3 3
177
; Horizontal 6-tap subpel filter (SSSE3), %1 output pixels per row.
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r5d = mx.
; Each row: out = clip_u8((sum of 6 taps + 64) >> 7).
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    lea      r5d, [r5*3]                   ; r5 = mx*3, scaled by 8 below
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]     ; sixtap_filter_hb expands to r11
%endif
    ; offset mx*24-48: 0 for mx == 2, then three 16-byte rows per +2 in mx
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [r2+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]   ; rounding
    psraw     m0, 7
    packuswb  m0, m0        ; clip to unsigned bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET
217
218
; Horizontal 4-tap subpel filter (SSSE3), %1 output pixels per row.
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r5d = mx.
; Each row: out = clip_u8((sum of 4 taps + 64) >> 7).
cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    shl      r5d, 4                        ; r5 = mx*16 (one table row per unit)
    mova      m2, [pw_64]                  ; rounding constant
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]    ; fourtap_filter_hb expands to r11
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow:
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3        ; byte pairs for taps 0/1
    pshufb    m1, m4        ; byte pairs for taps 2/3
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0        ; clip to unsigned bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET
248
249
; Vertical 4-tap subpel filter (SSSE3), %1 output pixels per row.
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r6d = my.
; m0..m2 carry the three most recent source rows between iterations.
cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    shl      r6d, 4                        ; r6 = my*16 (one table row per unit)
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]    ; fourtap_filter_hb expands to r11
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]                  ; rounding constant

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1                       ; slide the row window down by one
    punpcklbw m4, m1                       ; interleave rows for taps 0/1
    mova      m1, m2
    punpcklbw m2, m3                       ; interleave rows for taps 2/3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    paddsw    m4, m7                       ; rounding
    psraw     m4, 7
    packuswb  m4, m4                       ; clip to unsigned bytes
    movh    [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET
287
288
; Vertical 6-tap subpel filter (SSSE3), %1 output pixels per row.
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r6d = my (then reused as the
; pointer just past the selected filter's three coefficient rows).
; m0..m4 carry the five most recent source rows between iterations.
cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea      r6d, [r6*3]                   ; r6 = my*3, scaled by 8 below
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]     ; sixtap_filter_hb expands to r11
%endif
    lea       r6, [sixtap_filter_hb+r6*8]  ; rows are read at r6-48/-32/-16

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5                       ; oldest row paired with newest row
    mova      m0, m1                       ; slide the row window down by one
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    paddsw    m6, [pw_64]                  ; rounding
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6                       ; clip to unsigned bytes
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET
334
%endmacro
335
336
INIT_MMX
337
FILTER_SSSE3 4, 0, 0
338
INIT_XMM
339
FILTER_SSSE3 8, 8, 7
340
341 0178d14f Jason Garrett-Glaser
; 4-pixel-wide block, H-only 4-tap filter (MMXEXT).
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r5d = mx.
; mm4/mm5 hold the (F0,F1) and (F2,F3) word pairs, mm6 stays zero, mm7 = 64.
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4                       ; r5 = mx*16 (one table row per unit)
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]   ; fourtap_filter_hw expands to r11
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET
387
388
; 4-pixel-wide block, H-only 6-tap filter (MMXEXT).
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r5d = mx.
; mm4/mm5/mm6 hold the (F0,F1), (F2,F3) and (F4,F5) word pairs, mm7 = 64.
; mm3 is the zero register but is also borrowed as scratch inside the loop.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]                  ; r5 = mx*3, scaled by 8 below
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]    ; sixtap_filter_hw expands to r11
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE (mm3 no longer zero)
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; restore the zero register
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET
444
445
INIT_XMM
446 e25dee60 Jason Garrett-Glaser
; 8-pixel-wide block, H-only 4-tap filter (SSE2).
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r5d = mx (then reused as the
; pointer to the selected filter's four broadcast-word rows).
; When m8 is defined, taps 2/3 are cached in m8/m9; otherwise they are read
; from memory on every row.
cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
    shl      r5d, 5                        ; r5 = mx*32
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]     ; fourtap_filter_v expands to r11
%endif
    lea       r5, [fourtap_filter_v+r5-32] ; first filter is at mx == 1
    pxor      m7, m7                       ; zero, for byte->word unpacking
    mova      m4, [pw_64]                  ; rounding constant
    mova      m5, [r5+ 0]
    mova      m6, [r5+16]
%ifdef m8
    mova      m8, [r5+32]
    mova      m9, [r5+48]
%endif
.nextrow:
    movq      m0, [r2-1]                   ; one shifted 8-pixel load per tap
    movq      m1, [r2-0]
    movq      m2, [r2+1]
    movq      m3, [r2+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [r5+32]
    pmullw    m3, [r5+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4        ; rounding
    psraw     m0, 7
    packuswb  m0, m7        ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET
492
493 e25dee60 Jason Garrett-Glaser
; 8-pixel-wide block, H-only 6-tap filter (SSE2).
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r5d = mx (then reused as the
; pointer to the selected filter's six broadcast-word rows).
; When m8 is defined, all six taps are cached in m8..m13; otherwise they are
; read from memory on every row.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
    lea      r5d, [r5*3]
    shl      r5d, 4                        ; r5 = mx*48
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]      ; sixtap_filter_v expands to r11
%endif
    lea       r5, [sixtap_filter_v+r5-96]  ; first filter is at mx == 2
    pxor      m7, m7                       ; zero, for byte->word unpacking
    mova      m6, [pw_64]                  ; rounding constant
%ifdef m8
    mova      m8, [r5+ 0]
    mova      m9, [r5+16]
    mova     m10, [r5+32]
    mova     m11, [r5+48]
    mova     m12, [r5+64]
    mova     m13, [r5+80]
%endif
.nextrow:
    movq      m0, [r2-2]                   ; one shifted 8-pixel load per tap
    movq      m1, [r2-1]
    movq      m2, [r2-0]
    movq      m3, [r2+1]
    movq      m4, [r2+2]
    movq      m5, [r2+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [r5+ 0]
    pmullw    m1, [r5+16]
    pmullw    m2, [r5+32]
    pmullw    m3, [r5+48]
    pmullw    m4, [r5+64]
    pmullw    m5, [r5+80]
%endif
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6        ; rounding
    psraw     m0, 7
    packuswb  m0, m7        ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET
554
555
%macro FILTER_V 3
556
; %2-pixel-wide block, V-only 4-tap filter (%1 instruction set).
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r6d = my (then reused as the
; pointer to the selected filter's four broadcast-word rows).
; m0..m2 carry the three most recent source rows (as words) between
; iterations; m5 caches the last tap, m6 = 64, m7 = 0.
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                        ; r6 = my*32
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]     ; fourtap_filter_v expands to r11
%endif
    lea       r6, [fourtap_filter_v+r6-32] ; first filter is at my == 1
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [r6+48]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4                       ; keep the unscaled new row
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1                       ; slide the row window down by one
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
607
608
609
; %2-pixel-wide block, V-only 6-tap filter (%1 instruction set).
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r6d = my (then reused as the
; pointer to the selected filter's six broadcast-word rows).
; m0..m4 carry the five most recent source rows (as words) between
; iterations; m7 = 0.
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                   ; r6 = my*48
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]      ; sixtap_filter_v expands to r11
%endif
    lea       r6, [sixtap_filter_v+r6-96]  ; first filter is at my == 2
    pxor      m7, m7

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1                       ; slide the row window down by one
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
672
%endmacro
673
674
INIT_MMX
675
FILTER_V mmxext, 4, 0
676
INIT_XMM
677
FILTER_V sse2,   8, 8
678
679 a173aa89 Jason Garrett-Glaser
%macro FILTER_BILINEAR 3
680
; Vertical bilinear filter, %2 pixels wide, two output rows per iteration.
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r6d = my; r5 is scratch.
; m4 = weight (8 - my), m5 = weight my, m6 = 0.
; out = (a*(8-my) + b*my + 4) >> 3, computed as psraw 2 followed by pavgw
; against zero.
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4                        ; r6 = my*16
    sub      r5d, r6d                      ; r5 = (8-my)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]   ; bilinear_filter_vw expands to r11
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16]
    mova      m5, [bilinear_filter_vw+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1        ; middle row is the top input of the 2nd output row
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2        ; both rows in one register: low half / high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
724
725
; Horizontal bilinear filter, %2 pixels wide, two output rows per iteration.
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r5d = mx; r6 is scratch.
; m4 = weight (8 - mx), m5 = weight mx, m6 = 0.
; out = (a*(8-mx) + b*mx + 4) >> 3, computed as psraw 2 followed by pavgw
; against zero.
cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4                        ; r5 = mx*16
    sub      r6d, r5d                      ; r6 = (8-mx)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]   ; bilinear_filter_vw expands to r11
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16]
    mova      m5, [bilinear_filter_vw+r5-16]
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2        ; both rows in one register: low half / high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
770
%endmacro
771
772
INIT_MMX
773
FILTER_BILINEAR mmxext, 4, 0
774
INIT_XMM
775
FILTER_BILINEAR   sse2, 8, 7
776
777 b06855f1 Jason Garrett-Glaser
%macro FILTER_BILINEAR_SSSE3 1
778
; Vertical bilinear filter (SSSE3), %1 pixels wide, two output rows per
; iteration.
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r6d = my.
; m3 = byte pairs (8 - my, my) for pmaddubsw, m4 = 0.
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4                        ; r6 = my*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]   ; bilinear_filter_vb expands to r11
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1        ; interleave rows 0/1
    punpcklbw m1, m2        ; interleave rows 1/2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4        ; with the shift above: (sum + 4) >> 3
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1        ; both rows in one register: low half / high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
813
814 b06855f1 Jason Garrett-Glaser
; Horizontal bilinear filter (SSSE3), %1 pixels wide, two output rows per
; iteration.
; Register use as seen in the body: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = row counter (height), r5d = mx.
; m2 = shuffle mask producing (i, i+1) byte pairs, m3 = byte pairs
; (8 - mx, mx) for pmaddubsw, m4 = 0.
cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4                        ; r5 = mx*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]   ; bilinear_filter_vb expands to r11
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4        ; with the shift above: (sum + 4) >> 3
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1        ; both rows in one register: low half / high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
849 b06855f1 Jason Garrett-Glaser
%endmacro
850
851
INIT_MMX
852
FILTER_BILINEAR_SSSE3 4
853
INIT_XMM
854
FILTER_BILINEAR_SSSE3 8
855 a173aa89 Jason Garrett-Glaser
856 0fecad09 Jason Garrett-Glaser
; Copy an 8-byte-wide block, two rows per iteration (MMX).
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row counter.
cglobal put_vp8_pixels8_mmx, 5,5
.nextrow:
    ; load both source rows before storing anything
    movq       mm0, [r2]
    movq       mm1, [r2+r3]
    movq      [r0], mm0
    movq   [r0+r1], mm1
    ; advance both pointers by two rows (lea leaves the flags untouched)
    lea         r2, [r2+r3*2]
    lea         r0, [r0+r1*2]
    sub        r4d, 2
    jg .nextrow
    REP_RET
867
868
; Copy a 16-byte-wide block, two rows per iteration, using 8-byte MMX moves.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row counter.
cglobal put_vp8_pixels16_mmx, 5,5
.nextrow:
    ; load both source rows (two halves each) before storing anything
    movq         mm0, [r2]
    movq         mm1, [r2+8]
    movq         mm2, [r2+r3]
    movq         mm3, [r2+r3+8]
    movq        [r0], mm0
    movq      [r0+8], mm1
    movq     [r0+r1], mm2
    movq   [r0+r1+8], mm3
    ; advance both pointers by two rows (lea leaves the flags untouched)
    lea           r2, [r2+r3*2]
    lea           r0, [r0+r1*2]
    sub          r4d, 2
    jg .nextrow
    REP_RET
883
884
; Copy a 16-byte-wide block, two rows per iteration (SSE).
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row counter.
; Loads are unaligned (movups); stores use movaps, so every dst row must be
; 16-byte aligned.
cglobal put_vp8_pixels16_sse, 5,5,2
.nextrow:
    ; load both source rows before storing anything
    movups    xmm0, [r2]
    movups    xmm1, [r2+r3]
    movaps    [r0], xmm0
    movaps [r0+r1], xmm1
    ; advance both pointers by two rows (lea leaves the flags untouched)
    lea         r2, [r2+r3*2]
    lea         r0, [r0+r1*2]
    sub        r4d, 2
    jg .nextrow
    REP_RET
895
896 0178d14f Jason Garrett-Glaser
;-----------------------------------------------------------------------------
897
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
898
;-----------------------------------------------------------------------------
899
900 8a467b2d Jason Garrett-Glaser
; ADD_DC pos, neg, offset, mov
; Adds a signed per-byte DC value to four rows of pixels with unsigned
; saturation: each row gets `paddusb %1` followed by `psubusb %2`.
;   %1 = register added to every row, %2 = register subtracted afterwards,
;   %3 = byte offset applied to every row address,
;   %4 = move instruction used for the loads and stores.
; Rows are addressed as [r0], [r0+r2], [r1], [r1+r2]. Clobbers m2..m5.
%macro ADD_DC 4
    ; load all four rows first
    %4        m2, [r0+%3]
    %4        m3, [r0+r2+%3]
    %4        m4, [r1+%3]
    %4        m5, [r1+r2+%3]
    ; saturating add, then saturating subtract, one register at a time
    paddusb   m2, %1
    psubusb   m2, %2
    paddusb   m3, %1
    psubusb   m3, %2
    paddusb   m4, %1
    psubusb   m4, %2
    paddusb   m5, %1
    psubusb   m5, %2
    ; write all four rows back
    %4    [r0+%3], m2
    %4 [r0+r2+%3], m3
    %4    [r1+%3], m4
    %4 [r1+r2+%3], m5
%endmacro
918
919
INIT_MMX
920 0178d14f Jason Garrett-Glaser
; DC-only IDCT add for a 4x4 block (MMX).
; r0 = dst, r1 = coefficient block (reused as dst + 2*stride), r2 = stride.
; dc = (block[0] + 4) >> 3; the first 32 bits of the block are cleared.
; m0 ends up holding max(dc, 0) in every byte and m1 max(-dc, 0), which
; ADD_DC applies with saturating add / subtract.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load the coefficient, round, and clear it in the block
    movd       m0, [r1]
    paddw      m0, [pw_4]
    pxor       m1, m1
    movd      [r1], m1
    psraw      m0, 3

    ; m1 = -dc (m1 is still zero here)
    psubw      m1, m0

    ; clip the positive part to a byte and replicate it across the register
    packuswb   m0, m0
    punpcklbw  m0, m0
    punpcklwd  m0, m0

    ; same for the negative part
    packuswb   m1, m1
    punpcklbw  m1, m1
    punpcklwd  m1, m1

    ; add DC to the four rows
    lea        r1, [r0+r2*2]
    ADD_DC     m0, m1, 0, movh
    RET
941
942 8a467b2d Jason Garrett-Glaser
INIT_XMM
; DC-only inverse transform + add for one 4x4 block (uses pextrd).
;   r0 = dst, r1 = block (16-bit coefficients), r2 = stride
; Unlike the MMX version, the pixels are widened to 16 bits, the signed DC
; (block[0] + 4) >> 3 is added with paddw, and packuswb performs the final
; clamp to 0..255, so no positive/negative split is needed.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       m0, [r1]
    pxor       m1, m1

    ; calculate DC
    paddw      m0, [pw_4]
    movd     [r1], m1               ; clear the DC coefficient in the block
    lea        r1, [r0+r2*2]        ; r1 = dst + 2*stride
    movd       m2, [r0]             ; row 0
    movd       m3, [r0+r2]          ; row 1
    movd       m4, [r1]             ; row 2
    movd       m5, [r1+r2]          ; row 3
    psraw      m0, 3
    pshuflw    m0, m0, 0            ; broadcast dc to the low 4 words ...
    punpcklqdq m0, m0               ; ... and to all 8 words
    punpckldq  m2, m3               ; rows 0/1 in one qword
    punpckldq  m4, m5               ; rows 2/3 in one qword
    punpcklbw  m2, m1               ; widen pixels to words (m1 is zero)
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4               ; clamp and pack: rows 0..3 as 4 dwords
    movd      [r0], m2
    pextrd [r0+r2], m2, 1
    pextrd    [r1], m2, 2
    pextrd [r1+r2], m2, 3
    RET
971
972
;-----------------------------------------------------------------------------
973 3ae079a3 Jason Garrett-Glaser
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
974 8a467b2d Jason Garrett-Glaser
;-----------------------------------------------------------------------------
975
976
INIT_MMX
; DC-only inverse transform + add for four horizontally adjacent 4x4 blocks
; (a 16x4 pixel area).
;   r0 = dst, r1 = block[4][16] (16-bit coefficients), r2 = stride
; The four DC coefficients live 32 bytes apart (one per block). They are
; gathered into one register, rounded with (dc + 4) >> 3, cleared in memory,
; and split into a positive part (m0/m1) and a negated part (m6/m7) as
; required by ADD_DC. m0/m6 cover pixels 0-7, m1/m7 cover pixels 8-15.
cglobal vp8_idct_dc_add4y_mmx, 3, 3
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m6      ; clear the four DC coefficients
    movd [r1+32*1], m6
    movd [r1+32*2], m6
    movd [r1+32*3], m6
    psraw     m0, 3
    psubw     m6, m0        ; m6 = -dc
    packuswb  m0, m0        ; clamp to 0..255 (negative -> 0)
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    lea       r1, [r0+r2*2] ; r1 = dst + 2*stride, as ADD_DC expects
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET
1010
1011
INIT_XMM
; DC-only inverse transform + add for four horizontally adjacent 4x4 blocks
; (a 16x4 pixel area), one 16-byte register per row.
;   r0 = dst, r1 = block[4][16] (16-bit coefficients), r2 = stride
; Same scheme as the MMX version: gather the four DCs (32 bytes apart),
; round with (dc + 4) >> 3, clear them in memory, and build a positive part
; (m0) and a negated part (m1) that ADD_DC applies with saturating byte math.
; The memory operands of punpcklwd and the mova accesses in ADD_DC are
; 16 bytes wide here, so block and dst rows must be 16-byte aligned.
cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m1      ; clear the four DC coefficients
    movd [r1+32*1], m1
    movd [r1+32*2], m1
    movd [r1+32*3], m1
    psraw     m0, 3
    psubw     m1, m0        ; m1 = -dc
    packuswb  m0, m0        ; clamp to 0..255 (negative -> 0)
    packuswb  m1, m1
    punpcklbw m0, m0        ; AABBCCDD in the low 8 bytes
    punpcklbw m1, m1
    punpcklbw m0, m0        ; AAAABBBBCCCCDDDD
    punpcklbw m1, m1

    ; add DC
    lea       r1, [r0+r2*2] ; r1 = dst + 2*stride, as ADD_DC expects
    ADD_DC    m0, m1, 0, mova
    RET
1040 004cda8e Jason Garrett-Glaser
1041
;-----------------------------------------------------------------------------
1042 3ae079a3 Jason Garrett-Glaser
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
1043
;-----------------------------------------------------------------------------
1044
1045
INIT_MMX
; DC-only inverse transform + add for four 4x4 blocks arranged 2x2
; (an 8x8 pixel area).
;   r0 = dst, r1 = block[4][16] (16-bit coefficients), r2 = stride
; The DC handling is identical to vp8_idct_dc_add4y_mmx. The difference is
; the layout: blocks A/B (m0/m6) cover the top four rows, blocks C/D (m1/m7)
; cover the four rows below, so the row pointers are advanced by 4*stride
; between the two ADD_DC invocations instead of using a column offset.
cglobal vp8_idct_dc_add4uv_mmx, 3, 3
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m6      ; clear the four DC coefficients
    movd [r1+32*1], m6
    movd [r1+32*2], m6
    movd [r1+32*3], m6
    psraw     m0, 3
    psubw     m6, m0        ; m6 = -dc
    packuswb  m0, m0        ; clamp to 0..255 (negative -> 0)
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    lea       r1, [r0+r2*2] ; r1 = dst + 2*stride, as ADD_DC expects
    ADD_DC    m0, m6, 0, mova
    lea       r0, [r0+r2*4] ; move both row pointers down by four rows
    lea       r1, [r1+r2*4]
    ADD_DC    m1, m7, 0, mova
    RET
1081
1082
;-----------------------------------------------------------------------------
1083 2dd2f716 Ronald S. Bultje
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
1084
;-----------------------------------------------------------------------------
1085
1086
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
;           this macro assumes that m6/m7 have words for 20091/17734 loaded
;           %3/%4 are temporary registers (clobbered)
;
; mul_20091(x) is built as pmulhw(x, 20091) + x.
; mul_35468(x) is built as pmulhw(2*x, 17734): the input is doubled first and
; multiplied by half the constant (17734 = 35468/2), since 35468 does not fit
; in a signed 16-bit word.
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1
    paddw     %2, %2
    pmulhw    %1, m7 ;35468(1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro
1102
1103
; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %1-%4 are register indexes (not names) holding the four input rows
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
;           the trailing SWAPs only rename registers so that the results end
;           up under the indexes listed above; they emit no instructions
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA           m%4, m%3, m%5     ;tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5     ;tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro
1116
1117
INIT_MMX
; Full 4x4 inverse DCT + add to destination.
;   %1 = suffix of the generated function (mmx or sse)
; Generated function arguments: r0 = dst, r1 = block[16], r2 = stride.
; The coefficients are loaded into m0-m3, the block is cleared in memory, and
; two 1-D transform passes are run with a transpose after each one. The
; rounding constant pw_4 is added to m0 before the second pass, and
; STORE_DIFFx2 is invoked with a shift of 3 to add the result to dst.
; The only difference between the two variants is how the block is cleared:
; "sse" uses two 16-byte movaps stores (block must be 16-byte aligned),
; everything else uses four 8-byte movq stores.
%macro VP8_IDCT_ADD 1
cglobal vp8_idct_add_%1, 3, 3
    ; load block data
    movq         m0, [r1+ 0]
    movq         m1, [r1+ 8]
    movq         m2, [r1+16]
    movq         m3, [r1+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
%ifidn %1, sse
    ; xmm0 is a separate register from mm0, so the loaded data is untouched
    xorps      xmm0, xmm0
    movaps  [r1+ 0], xmm0
    movaps  [r1+16], xmm0
%else
    pxor         m4, m4
    movq    [r1+ 0], m4
    movq    [r1+ 8], m4
    movq    [r1+16], m4
    movq    [r1+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    lea          r1, [r0+2*r2]          ; r1 = dst + 2*stride (rows 2/3)
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
%endmacro

VP8_IDCT_ADD mmx
VP8_IDCT_ADD sse
1157 2dd2f716 Ronald S. Bultje
1158
;-----------------------------------------------------------------------------
1159 004cda8e Jason Garrett-Glaser
; void vp8_luma_dc_wht_<opt>(DCTELEM block[4][4][16], DCTELEM dc[16])
1160
;-----------------------------------------------------------------------------
1161
1162 b8b231b5 Jason Garrett-Glaser
; Scatter the eight 16-bit results held in two MMX registers to the first
; coefficient of eight different blocks.
;   %1, %2 = mm register indexes (their contents are destroyed)
;   %3     = index of the block that receives word 0 of m%1
; Blocks are 2*16 bytes apart, starting at r0. Word k (k = 0..3) of m%1 goes
; to block 4*k+%3, word k of m%2 goes to block 4*k+1+%3.
; Clobbers r1 and r2, which are used as scratch to move words out of the
; MMX registers.
%macro SCATTER_WHT 3
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(0+%3)], r1w
    mov [r0+2*16*(1+%3)], r2w
    shr   r1d, 16
    shr   r2d, 16
    psrlq m%1, 32
    psrlq m%2, 32
    mov [r0+2*16*(4+%3)], r1w
    mov [r0+2*16*(5+%3)], r2w
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(8+%3)], r1w
    mov [r0+2*16*(9+%3)], r2w
    shr   r1d, 16
    shr   r2d, 16
    mov [r0+2*16*(12+%3)], r1w
    mov [r0+2*16*(13+%3)], r2w
%endmacro
1182
1183
; One 1-D pass of a 4-point Hadamard transform over the registers m%1..m%4.
;   %1-%4 = register indexes (not names)
; Built from two SUMSUB_BADC butterfly stages; the final SWAP only renames
; registers and emits no instructions.
; NOTE(review): the exact output ordering depends on SUMSUB_BADC/SWAP from the
; include files, which are not visible here.
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP %1, %4, %3
%endmacro
1188
1189 827d43bb Jason Garrett-Glaser
; Inverse Walsh-Hadamard transform of the 16 luma DC coefficients.
;   %1 = suffix of the generated function (mmx or sse)
; Generated function arguments: r0 = block[4][4][16] (output), r1 = dc[16].
; The 16 input words are loaded into m0-m3 and cleared in memory, then two
; Hadamard passes are run with a transpose in between. pw_3 is added to m0
; before the second pass and all results are shifted right by 3. Finally each
; result is written to the first coefficient of its block via SCATTER_WHT.
; r1 and r2 are reused as scratch by SCATTER_WHT once dc[] has been consumed,
; which is why three registers are requested for a two-argument function.
; The "sse" variant clears dc[] with two movaps stores (dc must be 16-byte
; aligned); the other variant uses four movq stores.
%macro VP8_DC_WHT 1
cglobal vp8_luma_dc_wht_%1, 2,3
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]
%ifidn %1, sse
    ; xmm0 is a separate register from mm0, so the loaded data is untouched
    xorps      xmm0, xmm0
    movaps  [r1+ 0], xmm0
    movaps  [r1+16], xmm0
%else
    pxor         m4, m4
    movq    [r1+ 0], m4
    movq    [r1+ 8], m4
    movq    [r1+16], m4
    movq    [r1+24], m4
%endif
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro

INIT_MMX
VP8_DC_WHT mmx
VP8_DC_WHT sse
1222 f2a30bd8 Ronald S. Bultje
1223
;-----------------------------------------------------------------------------
1224
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
1225
;-----------------------------------------------------------------------------
1226
1227
; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the byte-interleaved pixel data of two rows
; each (A/B, C/D, E/F, G/H); the caller finishes the transpose afterwards
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%5, [%9+%10*4]   ; B0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%3, [%8]         ; E0-3
    movd          m%7, [%9]         ; F0-3
    movd          m%4, [%9+%11]     ; G0-3
    punpcklbw     m%1, m%5          ; A/B interleaved
    movd          m%5, [%9+%11*2]   ; H0-3
    punpcklbw     m%2, m%6          ; C/D interleaved
    punpcklbw     m%3, m%7          ; E/F interleaved
    punpcklbw     m%4, m%5          ; G/H interleaved
%endmacro
1250
1251
; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
;
; on return the first 4 mm registers hold the byte-interleaved data of four
; rows each: A/B/I/J, C/D/K/L, E/F/M/N and G/H/O/P
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    ; %12 = first regular register + 8*stride; built from the macro arguments
    ; rather than from fixed register names so that any registers can be used
    lea           %12, [%8+8*%11]

    ; read (and interleave) those addressable by %8, A/C/D/E/I/K/L/M
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%3, [%12+%10*4]  ; I0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%4, [%12+%10*2]  ; K0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%5, [%12+%10]    ; L0-3
    movd          m%7, [%12]        ; M0-3
    add           %12, %11          ; %12 now mirrors %9 for the bottom 8 rows
    punpcklbw     m%1, m%3          ; A/I
    movd          m%3, [%8]         ; E0-3
    punpcklbw     m%2, m%4          ; C/K
    punpcklbw     m%6, m%5          ; D/L
    punpcklbw     m%3, m%7          ; E/M
    punpcklbw     m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9, B/F/G/H/J/N/O/P
    movd         m%5, [%9+%10*4]   ; B0-3
    movd         m%4, [%12+%10*4]  ; J0-3
    movd         m%7, [%9]         ; F0-3
    movd         m%6, [%12]        ; N0-3
    punpcklbw    m%5, m%4          ; B/J
    punpcklbw    m%7, m%6          ; F/N
    punpcklbw    m%1, m%5          ; A/B/I/J interleaved
    punpcklbw    m%3, m%7          ; E/F/M/N interleaved
    movd         m%4, [%9+%11]     ; G0-3
    movd         m%6, [%12+%11]    ; O0-3
    movd         m%5, [%9+%11*2]   ; H0-3
    movd         m%7, [%12+%11*2]  ; P0-3
    punpcklbw    m%4, m%6          ; G/O
    punpcklbw    m%5, m%7          ; H/P
    punpcklbw    m%4, m%5          ; G/H/O/P interleaved
%endmacro
1292
1293
; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
;
; with those pointers, register %1 supplies rows 0/1, %2 rows 2/3,
; %3 rows 4/5 and %4 rows 6/7 (low dword first); the source registers are
; modified (their high dword is copied over the low one)
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd    [%5+%7*4], m%1
    movd    [%5+%7*2], m%2
    movd         [%5], m%3
    movd      [%6+%8], m%4
    punpckhdq     m%1, m%1
    punpckhdq     m%2, m%2
    punpckhdq     m%3, m%3
    punpckhdq     m%4, m%4
    movd    [%6+%7*4], m%1
    movd      [%5+%7], m%2
    movd         [%6], m%3
    movd    [%6+%8*2], m%4
%endmacro
1312
1313
; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
;
; each source register supplies four consecutive rows: %1 rows 0-3, %2 rows
; 4-7, %3 rows 8-11 and %4 rows 12-15; the source registers are shifted out
; and therefore destroyed
; in the 8-wide case row 9 cannot be addressed from the second regular
; register, so its dword is parked in the first regular register (%5, which
; is clobbered) and stored relative to the third one after it was advanced
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd    [%5+%8*4], m%1
    movd         [%5], m%2
    movd    [%7+%8*4], m%3
    movd         [%7], m%4

    ; store dwords 1
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd    [%6+%8*4], m%1
    movd         [%6], m%2
%if %10 == 16
    movd    [%6+%9*4], m%3
%endif
    movd      [%7+%9], m%4

    ; write dwords 2
    psrldq        m%1, 4
    psrldq        m%2, 4
%if %10 == 8
    movd    [%5+%8*2], m%1
    movd          %5d, m%3          ; park dword 1 of %3; %5 is dead from here on
%endif
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 16
    movd    [%5+%8*2], m%1
%endif
    movd      [%6+%9], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
    add            %7, %9

    ; store dwords 3
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 8
    mov     [%7+%8*4], %5d          ; the parked dword 1 of %3
    movd    [%6+%8*2], m%1
%else
    movd      [%5+%8], m%1
%endif
    movd    [%6+%9*2], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
%endmacro
1372
1373 6341838f Ronald S. Bultje
; write 4 or 8 words in the mmx/xmm registers as 8 lines
; (WRITE_2x4W below handles two mmx registers of 4 words each; the
; WRITE_8W_* macros handle one xmm register of 8 words)
;
; WRITE_2x4W arguments:
; 1 and 2 are the registers to write (lines 0-3 and lines 4-7); both are
;   modified
; 3 is a general-purpose register that we will clobber
; 4 is a pointer to the destination's 4th line; it is moved around while
;   writing but holds its original value again at the end
; 5/6 is -stride and +stride
%macro WRITE_2x4W 6
    movd            %3d, %1
    punpckhdq        %1, %1
    mov       [%4+%5*4], %3w
    shr              %3, 16
    add              %4, %6
    mov       [%4+%5*4], %3w

    movd            %3d, %1
    add              %4, %5
    mov       [%4+%5*2], %3w
    shr              %3, 16
    mov       [%4+%5  ], %3w

    movd            %3d, %2
    punpckhdq        %2, %2
    mov       [%4     ], %3w
    shr              %3, 16
    mov       [%4+%6  ], %3w

    movd            %3d, %2
    add              %4, %6
    mov       [%4+%6  ], %3w
    shr              %3, 16
    mov       [%4+%6*2], %3w
    add              %4, %5
%endmacro
1408
1409
; Write the 8 words of one xmm register to 8 consecutive lines (pre-SSE4).
;   %1 = xmm register to write (shifted out, i.e. destroyed)
;   %2 = general-purpose register used as scratch (clobbered)
;   %3 = pointer to the destination's 4th line
;   %4 = -stride, %5 = +stride
; Words are moved two at a time through %2. Note that %3 is left advanced by
; one stride when the macro ends.
%macro WRITE_8W_SSE2 5
    movd            %2d, %1
    psrldq           %1, 4
    mov       [%3+%4*4], %2w
    shr              %2, 16
    add              %3, %5
    mov       [%3+%4*4], %2w

    movd            %2d, %1
    psrldq           %1, 4
    add              %3, %4
    mov       [%3+%4*2], %2w
    shr              %2, 16
    mov       [%3+%4  ], %2w

    movd            %2d, %1
    psrldq           %1, 4
    mov       [%3     ], %2w
    shr              %2, 16
    mov       [%3+%5  ], %2w

    movd            %2d, %1
    add              %3, %5
    mov       [%3+%5  ], %2w
    shr              %2, 16
    mov       [%3+%5*2], %2w
%endmacro
1436
1437
; Write the 8 words of one xmm register to 8 consecutive lines using pextrw
; with a memory destination.
;   %1 = xmm register to write (left unchanged)
;   %2 = pointer to the destination's 5th line
;   %3 = pointer to the destination's 4th line
;   %4 = -stride, %5 = +stride
; Unlike the SSE2 version no scratch register is needed and neither pointer
; is modified; the second pointer is required because line 1 (4th line minus
; 3 strides) and lines 5-7 cannot be addressed from %3 with a scale of
; 1, 2 or 4.
%macro WRITE_8W_SSE4 5
    pextrw    [%3+%4*4], %1, 0
    pextrw    [%2+%4*4], %1, 1
    pextrw    [%3+%4*2], %1, 2
    pextrw    [%3+%4  ], %1, 3
    pextrw    [%3     ], %1, 4
    pextrw    [%2     ], %1, 5
    pextrw    [%2+%5  ], %1, 6
    pextrw    [%2+%5*2], %1, 7
%endmacro
1447
1448 e3f7bf77 Ronald S. Bultje
; Broadcast the low byte of a general-purpose register to all 8 bytes of an
; MMX register, using plain MMX unpacks only.
;   %1 = destination mm register
;   %2 = source general-purpose register (its dword form %2d is read)
;   %3 = optional, ignored (accepted so all SPLATB_REG_* variants can be
;        invoked with the same argument list)
%macro SPLATB_REG_MMX 2-3
    movd           %1, %2d
    punpcklbw      %1, %1
    punpcklwd      %1, %1
    punpckldq      %1, %1
%endmacro
1454
1455
; Broadcast the low byte of a general-purpose register to all 8 bytes of an
; MMX register; pshufw replaces the last two unpacks of the MMX version.
;   %1 = destination mm register
;   %2 = source general-purpose register (its dword form %2d is read)
;   %3 = optional, ignored (accepted so all SPLATB_REG_* variants can be
;        invoked with the same argument list)
%macro SPLATB_REG_MMXEXT 2-3
    movd           %1, %2d
    punpcklbw      %1, %1
    pshufw         %1, %1, 0x0
%endmacro
1460
1461
; Broadcast the low byte of a general-purpose register to all 16 bytes of an
; XMM register.
;   %1 = destination xmm register
;   %2 = source general-purpose register (its dword form %2d is read)
;   %3 = optional, ignored (accepted so all SPLATB_REG_* variants can be
;        invoked with the same argument list)
%macro SPLATB_REG_SSE2 2-3
    movd           %1, %2d
    punpcklbw      %1, %1           ; byte -> word
    pshuflw        %1, %1, 0x0      ; word -> low 4 words
    punpcklqdq     %1, %1           ; low qword -> both qwords
%endmacro
1467
1468
; Broadcast the low byte of a general-purpose register to all 16 bytes of an
; XMM register with a single pshufb.
;   %1 = destination xmm register
;   %2 = source general-purpose register (its dword form %2d is read)
;   %3 = shuffle mask register; the callers in this file pass a register they
;        zeroed beforehand, which makes pshufb copy byte 0 everywhere
%macro SPLATB_REG_SSSE3 3
    movd           %1, %2d
    pshufb         %1, %3
%endmacro
1472
1473 3611c45a Ronald S. Bultje
; Generate one simple loop filter function.
;   %1 = instruction set suffix (mmx, mmxext, sse2, ssse3, sse4)
;   %2 = edge direction: v or h
;   %3 = number of general-purpose registers requested from cglobal
;   %4 = number of xmm registers requested from cglobal
; Generated function arguments: r0 = dst, r1 = stride, r2 = flim.
; After setup r1 holds -stride and r2 holds +stride.
;
; SPLATB_REG (and, for the 16-byte h variants, WRITE_8W) must be %define'd to
; the implementation matching %1 before the macro is invoked.
;
; With 8-byte registers the edge is processed in two passes of 8 pixels,
; counted down in r3. With 16-byte registers a single pass covers the edge.
; For h edges the pixels are read as 4-byte rows, transposed, filtered, and
; the two modified columns (p0/q0) are written back as one word per row.
%macro SIMPLE_LOOPFILTER 4
cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
%if mmsize == 8 ; mmx/mmxext
    mov            r3, 2
%endif
%ifnidn %1, sse2
%if mmsize == 16
    pxor           m0, m0           ; zero shuffle mask for the pshufb-based splat
%endif
%endif
    SPLATB_REG     m7, r2, m0       ; splat "flim" into register

    ; set up indexes to address 4 rows
    mov            r2, r1
    neg            r1
%ifidn %2, h
    lea            r0, [r0+4*r2-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %2, v
    ; read 4 half/full rows of pixels
    mova           m0, [r0+r1*2]    ; p1
    mova           m1, [r0+r1]      ; p0
    mova           m2, [r0]         ; q0
    mova           m3, [r0+r2]      ; q1
%else ; h
    lea            r4, [r0+r2]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova           m5, m2           ; m5=backup of q0
    mova           m6, m1           ; m6=backup of p0
    psubusb        m1, m2           ; p0-q0
    psubusb        m2, m6           ; q0-p0
    por            m1, m2           ; FFABS(p0-q0)
    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2

    mova           m4, m3
    mova           m2, m0
    psubusb        m3, m0           ; q1-p1
    psubusb        m0, m4           ; p1-q1
    por            m3, m0           ; FFABS(p1-q1)
    mova           m0, [pb_80]
    pxor           m2, m0           ; flip the top bit: unsigned -> signed range
    pxor           m4, m0
    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
    pand           m3, [pb_FE]      ; drop bit 0 so the qword shift cannot leak
                                    ; a bit into the neighbouring byte
    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7
    pxor           m1, m1
    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0
    pxor           m0, m6
    psubsb         m5, m0           ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3           ; apply filter mask (m3)

    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]       ; f1<<3=a+4
    paddsb         m1, [pb_3]       ; f2<<3=a+3
    pand           m2, m3
    pand           m1, m3           ; cache f2<<3

    ; there is no per-byte arithmetic shift, so the positive and the negated
    ; value are both shifted logically and the sign mask selects which one is
    ; subtracted from / added to the pixel
    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2           ; which values are <0?
    psubb          m3, m2           ; -f1<<3
    psrlq          m2, 3            ; +f1
    psrlq          m3, 3            ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3           ; q0-f1

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1           ; which values are <0?
    psubb          m3, m1           ; -f2<<3
    psrlq          m1, 3            ; +f2
    psrlq          m3, 3            ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3           ; p0+f2

    ; store
%ifidn %2, v
    mova         [r0], m4
    mova      [r0+r1], m6
%else ; h
    inc           r0                ; skip p1: only p0/q0 are written back
    SBUTTERFLY    bw, 6, 4, 0

%if mmsize == 16 ; sse2
%ifidn %1, sse4
    inc            r4
%endif
    WRITE_8W       m6, r4, r0, r1, r2
    lea            r4, [r3+r1+1]    ; 4th-line pointer for the lower 8 rows
%ifidn %1, sse4
    inc            r3
%endif
    WRITE_8W       m4, r3, r4, r1, r2
%else ; mmx/mmxext
    WRITE_2x4W     m6, m4, r4, r0, r1, r2
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %2, v
    add            r0, 8            ; advance 8 cols = pixels
%else ; h
    lea            r0, [r0+r2*8-1]  ; advance 8 rows = lines
%endif
    dec            r3
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro

INIT_MMX
%define SPLATB_REG SPLATB_REG_MMX
SIMPLE_LOOPFILTER mmx,    v, 4, 0
SIMPLE_LOOPFILTER mmx,    h, 5, 0
%define SPLATB_REG SPLATB_REG_MMXEXT
SIMPLE_LOOPFILTER mmxext, v, 4, 0
SIMPLE_LOOPFILTER mmxext, h, 5, 0
INIT_XMM
%define SPLATB_REG SPLATB_REG_SSE2
%define WRITE_8W   WRITE_8W_SSE2
SIMPLE_LOOPFILTER sse2,   v, 3, 8
SIMPLE_LOOPFILTER sse2,   h, 5, 8
%define SPLATB_REG SPLATB_REG_SSSE3
SIMPLE_LOOPFILTER ssse3,  v, 3, 8
SIMPLE_LOOPFILTER ssse3,  h, 5, 8
%define WRITE_8W   WRITE_8W_SSE4
SIMPLE_LOOPFILTER sse4,   h, 5, 8
1630 a711eb48 Ronald S. Bultje
1631
;-----------------------------------------------------------------------------
1632 268821e7 Ronald S. Bultje
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
1633 a711eb48 Ronald S. Bultje
;                                            int flimE, int flimI, int hev_thr);
1634
;-----------------------------------------------------------------------------
1635
1636 268821e7 Ronald S. Bultje
%macro INNER_LOOPFILTER 5
1637
%if %4 == 8 ; chroma
1638
cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
1639
%define dst8_reg    r1
1640
%define mstride_reg r2
1641
%define E_reg       r3
1642
%define I_reg       r4
1643
%define hev_thr_reg r5
1644
%else ; luma
1645
cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
1646 ede1b966 Ronald S. Bultje
%define mstride_reg r1
1647
%define E_reg       r2
1648
%define I_reg       r3
1649
%define hev_thr_reg r4
1650
%ifdef m8 ; x86-64, sse2
1651
%define dst8_reg    r4
1652
%elif mmsize == 16 ; x86-32, sse2
1653
%define dst8_reg    r5
1654
%else ; x86-32, mmx/mmxext
1655
%define cnt_reg     r5
1656
%endif
1657 268821e7 Ronald S. Bultje
%endif
1658
%define dst_reg     r0
1659 ede1b966 Ronald S. Bultje
%define stride_reg  E_reg
1660
%define dst2_reg    I_reg
1661
%ifndef m8
1662
%define stack_reg   hev_thr_reg
1663
%endif
1664
1665 bcd4aa64 Ronald S. Bultje
%ifnidn %1, sse2
1666
%if mmsize == 16
1667 7dd224a4 Jason Garrett-Glaser
    pxor             m7, m7
1668
%endif
1669 bcd4aa64 Ronald S. Bultje
%endif
1670 7dd224a4 Jason Garrett-Glaser
1671 a711eb48 Ronald S. Bultje
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
1672
    ; splat function arguments
1673 e3f7bf77 Ronald S. Bultje
    SPLATB_REG       m0, E_reg, m7   ; E
1674
    SPLATB_REG       m1, I_reg, m7   ; I
1675
    SPLATB_REG       m2, hev_thr_reg, m7 ; hev_thresh
1676 a711eb48 Ronald S. Bultje
1677
    ; align stack
1678 ede1b966 Ronald S. Bultje
    mov       stack_reg, rsp         ; backup stack pointer
1679 a711eb48 Ronald S. Bultje
    and             rsp, ~(mmsize-1) ; align stack
1680
%ifidn %2, v
1681
    sub             rsp, mmsize * 4  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
1682
                                     ;               [3]=hev() result
1683
%else ; h
1684 fb9bdf04 Ronald S. Bultje
    sub             rsp, mmsize * 5  ; extra storage space for transposes
1685 a711eb48 Ronald S. Bultje
%endif
1686
1687
%define flim_E   [rsp]
1688
%define flim_I   [rsp+mmsize]
1689
%define hev_thr  [rsp+mmsize*2]
1690
%define mask_res [rsp+mmsize*3]
1691 268821e7 Ronald S. Bultje
%define p0backup [rsp+mmsize*3]
1692
%define q0backup [rsp+mmsize*4]
1693 a711eb48 Ronald S. Bultje
1694
    mova         flim_E, m0
1695
    mova         flim_I, m1
1696
    mova        hev_thr, m2
1697
1698
%else ; sse2 on x86-64
1699
1700
%define flim_E   m9
1701
%define flim_I   m10
1702
%define hev_thr  m11
1703
%define mask_res m12
1704 268821e7 Ronald S. Bultje
%define p0backup m12
1705
%define q0backup m8
1706 a711eb48 Ronald S. Bultje
1707
    ; splat function arguments
1708 e3f7bf77 Ronald S. Bultje
    SPLATB_REG   flim_E, E_reg, m7   ; E
1709
    SPLATB_REG   flim_I, I_reg, m7   ; I
1710
    SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
1711 a711eb48 Ronald S. Bultje
%endif
1712
1713 268821e7 Ronald S. Bultje
%if mmsize == 8 && %4 == 16 ; mmx/mmxext
1714 ede1b966 Ronald S. Bultje
    mov         cnt_reg, 2
1715 a711eb48 Ronald S. Bultje
%endif
1716 ede1b966 Ronald S. Bultje
    mov      stride_reg, mstride_reg
1717
    neg     mstride_reg
1718 a711eb48 Ronald S. Bultje
%ifidn %2, h
1719 ede1b966 Ronald S. Bultje
    lea         dst_reg, [dst_reg + stride_reg*4-4]
1720 268821e7 Ronald S. Bultje
%if %4 == 8
1721
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
1722
%endif
1723 a711eb48 Ronald S. Bultje
%endif
1724
1725
%if mmsize == 8
1726
.next8px
1727
%endif
1728
    ; read
1729 ede1b966 Ronald S. Bultje
    lea        dst2_reg, [dst_reg + stride_reg]
1730 a711eb48 Ronald S. Bultje
%ifidn %2, v
1731 268821e7 Ronald S. Bultje
%if %4 == 8 && mmsize == 16
1732
%define movrow movh
1733
%else
1734
%define movrow mova
1735
%endif
1736
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
1737
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
1738
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
1739
    movrow           m5, [dst2_reg]               ; q1
1740
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
1741
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
1742
%if mmsize == 16 && %4 == 8
1743
    movhps           m0, [dst8_reg+mstride_reg*4]
1744
    movhps           m2, [dst8_reg+mstride_reg*2]
1745
    add        dst8_reg, stride_reg
1746
    movhps           m1, [dst8_reg+mstride_reg*4]
1747
    movhps           m5, [dst8_reg]
1748
    movhps           m6, [dst8_reg+ stride_reg]
1749
    movhps           m7, [dst8_reg+ stride_reg*2]
1750
    add        dst8_reg, mstride_reg
1751
%endif
1752 a711eb48 Ronald S. Bultje
%elif mmsize == 8 ; mmx/mmxext (h)
1753
    ; read 8 rows of 8px each
1754 ede1b966 Ronald S. Bultje
    movu             m0, [dst_reg +mstride_reg*4]
1755
    movu             m1, [dst2_reg+mstride_reg*4]
1756
    movu             m2, [dst_reg +mstride_reg*2]
1757
    movu             m3, [dst_reg +mstride_reg]
1758
    movu             m4, [dst_reg]
1759
    movu             m5, [dst2_reg]
1760
    movu             m6, [dst2_reg+ stride_reg]
1761 a711eb48 Ronald S. Bultje
1762
    ; 8x8 transpose
1763
    TRANSPOSE4x4B     0, 1, 2, 3, 7
1764 268821e7 Ronald S. Bultje
    mova       q0backup, m1
1765 ede1b966 Ronald S. Bultje
    movu             m7, [dst2_reg+ stride_reg*2]
1766 a711eb48 Ronald S. Bultje
    TRANSPOSE4x4B     4, 5, 6, 7, 1
1767
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
1768
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
1769
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
1770 268821e7 Ronald S. Bultje
    mova             m1, q0backup
1771
    mova       q0backup, m2          ; store q0
1772 a711eb48 Ronald S. Bultje
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
1773 268821e7 Ronald S. Bultje
    mova       p0backup, m5          ; store p0
1774 a711eb48 Ronald S. Bultje
    SWAP              1, 4
1775
    SWAP              2, 4
1776
    SWAP              6, 3
1777
    SWAP              5, 3
1778
%else ; sse2 (h)
1779 268821e7 Ronald S. Bultje
%if %4 == 16
1780 ede1b966 Ronald S. Bultje
    lea        dst8_reg, [dst_reg + stride_reg*8]
1781 268821e7 Ronald S. Bultje
%endif
1782 a711eb48 Ronald S. Bultje
1783
    ; read 16 rows of 8px each, interleave
1784 ede1b966 Ronald S. Bultje
    movh             m0, [dst_reg +mstride_reg*4]
1785
    movh             m1, [dst8_reg+mstride_reg*4]
1786
    movh             m2, [dst_reg +mstride_reg*2]
1787
    movh             m5, [dst8_reg+mstride_reg*2]
1788
    movh             m3, [dst_reg +mstride_reg]
1789
    movh             m6, [dst8_reg+mstride_reg]
1790
    movh             m4, [dst_reg]
1791
    movh             m7, [dst8_reg]
1792 a711eb48 Ronald S. Bultje
    punpcklbw        m0, m1          ; A/I
1793
    punpcklbw        m2, m5          ; C/K
1794
    punpcklbw        m3, m6          ; D/L
1795
    punpcklbw        m4, m7          ; E/M
1796
1797 ede1b966 Ronald S. Bultje
    add        dst8_reg, stride_reg
1798
    movh             m1, [dst2_reg+mstride_reg*4]
1799
    movh             m6, [dst8_reg+mstride_reg*4]
1800
    movh             m5, [dst2_reg]
1801
    movh             m7, [dst8_reg]
1802 a711eb48 Ronald S. Bultje
    punpcklbw        m1, m6          ; B/J
1803
    punpcklbw        m5, m7          ; F/N
1804 ede1b966 Ronald S. Bultje
    movh             m6, [dst2_reg+ stride_reg]
1805
    movh             m7, [dst8_reg+ stride_reg]
1806 a711eb48 Ronald S. Bultje
    punpcklbw        m6, m7          ; G/O
1807
1808
    ; 8x16 transpose
1809
    TRANSPOSE4x4B     0, 1, 2, 3, 7
1810 268821e7 Ronald S. Bultje
%ifdef m8
1811 fb9bdf04 Ronald S. Bultje
    SWAP              1, 8
1812 a711eb48 Ronald S. Bultje
%else
1813 268821e7 Ronald S. Bultje
    mova       q0backup, m1
1814 a711eb48 Ronald S. Bultje
%endif
1815 ede1b966 Ronald S. Bultje
    movh             m7, [dst2_reg+ stride_reg*2]
1816
    movh             m1, [dst8_reg+ stride_reg*2]
1817 a711eb48 Ronald S. Bultje
    punpcklbw        m7, m1          ; H/P
1818
    TRANSPOSE4x4B     4, 5, 6, 7, 1
1819
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
1820
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
1821
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
1822 268821e7 Ronald S. Bultje
%ifdef m8
1823 fb9bdf04 Ronald S. Bultje
    SWAP              1, 8
1824
    SWAP              2, 8
1825 a711eb48 Ronald S. Bultje
%else
1826 268821e7 Ronald S. Bultje
    mova             m1, q0backup
1827
    mova       q0backup, m2          ; store q0
1828 a711eb48 Ronald S. Bultje
%endif
1829
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
1830 268821e7 Ronald S. Bultje
%ifdef m12
1831 fb9bdf04 Ronald S. Bultje
    SWAP              5, 12
1832 a711eb48 Ronald S. Bultje
%else
1833 268821e7 Ronald S. Bultje
    mova       p0backup, m5          ; store p0
1834 a711eb48 Ronald S. Bultje
%endif
1835
    SWAP              1, 4
1836
    SWAP              2, 4
1837
    SWAP              6, 3
1838
    SWAP              5, 3
1839
%endif
1840
1841
    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
1842
    mova             m4, m1
1843
    SWAP              4, 1
1844
    psubusb          m4, m0          ; p2-p3
1845
    psubusb          m0, m1          ; p3-p2
1846
    por              m0, m4          ; abs(p3-p2)
1847
1848
    mova             m4, m2
1849
    SWAP              4, 2
1850
    psubusb          m4, m1          ; p1-p2
1851
    psubusb          m1, m2          ; p2-p1
1852
    por              m1, m4          ; abs(p2-p1)
1853
1854
    mova             m4, m6
1855
    SWAP              4, 6
1856
    psubusb          m4, m7          ; q2-q3
1857
    psubusb          m7, m6          ; q3-q2
1858
    por              m7, m4          ; abs(q3-q2)
1859
1860
    mova             m4, m5
1861
    SWAP              4, 5
1862
    psubusb          m4, m6          ; q1-q2
1863
    psubusb          m6, m5          ; q2-q1
1864
    por              m6, m4          ; abs(q2-q1)
1865
1866
%ifidn %1, mmx
1867 268821e7 Ronald S. Bultje
    mova             m4, flim_I
1868 a711eb48 Ronald S. Bultje
    pxor             m3, m3
1869
    psubusb          m0, m4
1870
    psubusb          m1, m4
1871
    psubusb          m7, m4
1872
    psubusb          m6, m4
1873
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
1874
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
1875
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
1876
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
1877
    pand             m0, m1
1878
    pand             m7, m6
1879
    pand             m0, m7
1880
%else ; mmxext/sse2
1881
    pmaxub           m0, m1
1882
    pmaxub           m6, m7
1883
    pmaxub           m0, m6
1884
%endif
1885
1886
    ; normal_limit and high_edge_variance for p1-p0, q1-q0
1887
    SWAP              7, 3           ; now m7 is zero
1888
%ifidn %2, v
1889 268821e7 Ronald S. Bultje
    movrow           m3, [dst_reg +mstride_reg] ; p0
1890
%if mmsize == 16 && %4 == 8
1891
    movhps           m3, [dst8_reg+mstride_reg]
1892
%endif
1893
%elifdef m12
1894 fb9bdf04 Ronald S. Bultje
    SWAP              3, 12
1895 a711eb48 Ronald S. Bultje
%else
1896 268821e7 Ronald S. Bultje
    mova             m3, p0backup
1897 a711eb48 Ronald S. Bultje
%endif
1898
1899
    mova             m1, m2
1900
    SWAP              1, 2
1901
    mova             m6, m3
1902
    SWAP              3, 6
1903
    psubusb          m1, m3          ; p1-p0
1904
    psubusb          m6, m2          ; p0-p1
1905
    por              m1, m6          ; abs(p1-p0)
1906
%ifidn %1, mmx
1907
    mova             m6, m1
1908
    psubusb          m1, m4
1909
    psubusb          m6, hev_thr
1910
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
1911
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
1912
    pand             m0, m1
1913 268821e7 Ronald S. Bultje
    mova       mask_res, m6
1914 a711eb48 Ronald S. Bultje
%else ; mmxext/sse2
1915
    pmaxub           m0, m1          ; max_I
1916
    SWAP              1, 4           ; max_hev_thresh
1917
%endif
1918
1919
    SWAP              6, 4           ; now m6 is I
1920
%ifidn %2, v
1921 268821e7 Ronald S. Bultje
    movrow           m4, [dst_reg]   ; q0
1922
%if mmsize == 16 && %4 == 8
1923
    movhps           m4, [dst8_reg]
1924
%endif
1925
%elifdef m8
1926 fb9bdf04 Ronald S. Bultje
    SWAP              4, 8
1927 a711eb48 Ronald S. Bultje
%else
1928 268821e7 Ronald S. Bultje
    mova             m4, q0backup
1929 a711eb48 Ronald S. Bultje
%endif
1930
    mova             m1, m4
1931
    SWAP              1, 4
1932
    mova             m7, m5
1933
    SWAP              7, 5
1934
    psubusb          m1, m5          ; q0-q1
1935
    psubusb          m7, m4          ; q1-q0
1936
    por              m1, m7          ; abs(q1-q0)
1937
%ifidn %1, mmx
1938
    mova             m7, m1
1939
    psubusb          m1, m6
1940
    psubusb          m7, hev_thr
1941
    pxor             m6, m6
1942
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
1943
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
1944 268821e7 Ronald S. Bultje
    mova             m6, mask_res
1945 a711eb48 Ronald S. Bultje
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
1946
    pand             m6, m7
1947
%else ; mmxext/sse2
1948
    pxor             m7, m7
1949
    pmaxub           m0, m1
1950
    pmaxub           m6, m1
1951
    psubusb          m0, flim_I
1952
    psubusb          m6, hev_thr
1953
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
1954
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
1955
%endif
1956
%ifdef m12
1957
    SWAP              6, 12
1958
%else
1959 268821e7 Ronald S. Bultje
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
1960 a711eb48 Ronald S. Bultje
%endif
1961
1962
    ; simple_limit
1963
    mova             m1, m3
1964
    SWAP              1, 3
1965
    mova             m6, m4          ; keep copies of p0/q0 around for later use
1966
    SWAP              6, 4
1967
    psubusb          m1, m4          ; p0-q0
1968
    psubusb          m6, m3          ; q0-p0
1969
    por              m1, m6          ; abs(q0-p0)
1970
    paddusb          m1, m1          ; m1=2*abs(q0-p0)
1971
1972
    mova             m7, m2
1973
    SWAP              7, 2
1974
    mova             m6, m5
1975
    SWAP              6, 5
1976
    psubusb          m7, m5          ; p1-q1
1977
    psubusb          m6, m2          ; q1-p1
1978
    por              m7, m6          ; abs(q1-p1)
1979
    pxor             m6, m6
1980
    pand             m7, [pb_FE]
1981
    psrlq            m7, 1           ; abs(q1-p1)/2
1982
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
1983
    psubusb          m7, flim_E
1984
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
1985
    pand             m0, m7          ; normal_limit result
1986
1987
    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
1988
%ifdef m8 ; x86-64 && sse2
1989
    mova             m8, [pb_80]
1990
%define pb_80_var m8
1991
%else ; x86-32 or mmx/mmxext
1992
%define pb_80_var [pb_80]
1993
%endif
1994
    mova             m1, m4
1995
    mova             m7, m3
1996
    pxor             m1, pb_80_var
1997
    pxor             m7, pb_80_var
1998
    psubsb           m1, m7          ; (signed) q0-p0
1999
    mova             m6, m2
2000
    mova             m7, m5
2001
    pxor             m6, pb_80_var
2002
    pxor             m7, pb_80_var
2003
    psubsb           m6, m7          ; (signed) p1-q1
2004
    mova             m7, mask_res
2005
    pandn            m7, m6
2006
    paddsb           m7, m1
2007
    paddsb           m7, m1
2008
    paddsb           m7, m1          ; 3*(q0-p0)+is4tap?(p1-q1)
2009
2010
    pand             m7, m0
2011
    mova             m1, [pb_F8]
2012
    mova             m6, m7
2013
    paddsb           m7, [pb_3]
2014
    paddsb           m6, [pb_4]
2015
    pand             m7, m1
2016
    pand             m6, m1
2017
2018
    pxor             m1, m1
2019
    pxor             m0, m0
2020
    pcmpgtb          m1, m7
2021
    psubb            m0, m7
2022
    psrlq            m7, 3           ; +f2
2023
    psrlq            m0, 3           ; -f2
2024
    pand             m0, m1
2025
    pandn            m1, m7
2026
    psubusb          m3, m0
2027
    paddusb          m3, m1          ; p0+f2
2028
2029
    pxor             m1, m1
2030
    pxor             m0, m0
2031
    pcmpgtb          m0, m6
2032
    psubb            m1, m6
2033
    psrlq            m6, 3           ; +f1
2034
    psrlq            m1, 3           ; -f1
2035
    pand             m1, m0
2036
    pandn            m0, m6
2037
    psubusb          m4, m0
2038
    paddusb          m4, m1          ; q0-f1
2039
2040
%ifdef m12
2041
    SWAP              6, 12
2042
%else
2043 268821e7 Ronald S. Bultje
    mova             m6, mask_res
2044 a711eb48 Ronald S. Bultje
%endif
2045
%ifidn %1, mmx
2046
    mova             m7, [pb_1]
2047
%else ; mmxext/sse2
2048
    pxor             m7, m7
2049
%endif
2050
    pand             m0, m6
2051
    pand             m1, m6
2052
%ifidn %1, mmx
2053
    paddusb          m0, m7
2054
    pand             m1, [pb_FE]
2055
    pandn            m7, m0
2056
    psrlq            m1, 1
2057
    psrlq            m7, 1
2058
    SWAP              0, 7
2059
%else ; mmxext/sse2
2060
    psubusb          m1, [pb_1]
2061
    pavgb            m0, m7          ; a
2062
    pavgb            m1, m7          ; -a
2063
%endif
2064
    psubusb          m5, m0
2065
    psubusb          m2, m1
2066
    paddusb          m5, m1          ; q1-a
2067
    paddusb          m2, m0          ; p1+a
2068
2069
    ; store
2070
%ifidn %2, v
2071 268821e7 Ronald S. Bultje
    movrow [dst_reg +mstride_reg*2], m2
2072
    movrow [dst_reg +mstride_reg  ], m3
2073
    movrow    [dst_reg], m4
2074
    movrow [dst_reg + stride_reg  ], m5
2075
%if mmsize == 16 && %4 == 8
2076
    movhps [dst8_reg+mstride_reg*2], m2
2077
    movhps [dst8_reg+mstride_reg  ], m3
2078
    movhps   [dst8_reg], m4
2079
    movhps [dst8_reg+ stride_reg  ], m5
2080
%endif
2081 a711eb48 Ronald S. Bultje
%else ; h
2082 268821e7 Ronald S. Bultje
    add         dst_reg, 2
2083
    add        dst2_reg, 2
2084 a711eb48 Ronald S. Bultje
2085
    ; 4x8/16 transpose
2086
    TRANSPOSE4x4B     2, 3, 4, 5, 6
2087
2088
%if mmsize == 8 ; mmx/mmxext (h)
2089 ede1b966 Ronald S. Bultje
    WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
2090 a711eb48 Ronald S. Bultje
%else ; sse2 (h)
2091 819b2dd2 Ronald S. Bultje
    lea        dst8_reg, [dst8_reg+mstride_reg+2]
2092 268821e7 Ronald S. Bultje
    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
2093 a711eb48 Ronald S. Bultje
%endif
2094
%endif
2095
2096
%if mmsize == 8
2097 268821e7 Ronald S. Bultje
%if %4 == 8 ; chroma
2098
%ifidn %2, h
2099
    sub         dst_reg, 2
2100
%endif
2101
    cmp         dst_reg, dst8_reg
2102
    mov         dst_reg, dst8_reg
2103
    jnz .next8px
2104
%else
2105 a711eb48 Ronald S. Bultje
%ifidn %2, h
2106 ede1b966 Ronald S. Bultje
    lea         dst_reg, [dst_reg + stride_reg*8-2]
2107 a711eb48 Ronald S. Bultje
%else ; v
2108 ede1b966 Ronald S. Bultje
    add         dst_reg, 8
2109 a711eb48 Ronald S. Bultje
%endif
2110 ede1b966 Ronald S. Bultje
    dec         cnt_reg
2111 a711eb48 Ronald S. Bultje
    jg .next8px
2112
%endif
2113 268821e7 Ronald S. Bultje
%endif
2114 a711eb48 Ronald S. Bultje
2115
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
2116 ede1b966 Ronald S. Bultje
    mov             rsp, stack_reg   ; restore stack pointer
2117 a711eb48 Ronald S. Bultje
%endif
2118 526e831a Ronald S. Bultje
    RET
2119 a711eb48 Ronald S. Bultje
%endmacro
2120
2121
; Instantiate the inner-edge loop filter for the MMX and MMXEXT flavours.
; Both groups are expanded under INIT_MMX; the macro body treats
; mmsize == 8 as the "mmx/mmxext" case.
;
; Macro arguments as they are used in the visible part of the macro body:
;   %1  instruction-set tag. "mmx" selects the psubusb/pcmpeqb limit tests
;       and the pb_1/psrlq averaging; any other tag takes the
;       pmaxub/pavgb path.
;   %2  "v" loads and stores whole rows directly; "h" goes through a
;       transpose on load and again before the store.
;   %3  not referenced in the visible part of the body -- presumably the
;       register count forwarded to cglobal; confirm against the macro head.
;   %4  16 or 8; the body labels the 8 case "chroma".
;   %5  not referenced in the visible part of the body -- presumably the
;       xmm register count for cglobal (0 here); confirm against the head.
INIT_MMX
2122 e3f7bf77 Ronald S. Bultje
; Point SPLATB_REG at the plain-MMX variant before expanding the mmx group.
; NOTE(review): presumably consumed by INNER_LOOPFILTER's argument setup the
; same way MBEDGE_LOOPFILTER uses it -- confirm against the macro head.
%define SPLATB_REG SPLATB_REG_MMX
2123 7dd224a4 Jason Garrett-Glaser
INNER_LOOPFILTER mmx,    v, 6, 16, 0
2124
INNER_LOOPFILTER mmx,    h, 6, 16, 0
2125
INNER_LOOPFILTER mmx,    v, 6,  8, 0
2126
INNER_LOOPFILTER mmx,    h, 6,  8, 0
2127 e3f7bf77 Ronald S. Bultje
2128
; Same four direction/size combinations again, now with the MMXEXT splat
; variant and the non-"mmx" code paths of the macro body.
%define SPLATB_REG SPLATB_REG_MMXEXT
2129
INNER_LOOPFILTER mmxext, v, 6, 16, 0
2130
INNER_LOOPFILTER mmxext, h, 6, 16, 0
2131 7dd224a4 Jason Garrett-Glaser
INNER_LOOPFILTER mmxext, v, 6,  8, 0
2132
INNER_LOOPFILTER mmxext, h, 6,  8, 0
2133 268821e7 Ronald S. Bultje
2134 a711eb48 Ronald S. Bultje
; Instantiate the inner-edge loop filter for the SSE2 and SSSE3 flavours.
; Both groups are expanded under INIT_XMM; the macro body handles this as
; its mmsize == 16 ("sse2") case.
;
; Argument order is: instruction-set tag, direction (v/h), %3, size (16 or
; 8, where the body labels 8 "chroma"), %5.
; NOTE(review): %3 and %5 are not referenced in the visible part of the
; macro body; presumably they are the general-purpose and xmm register
; counts forwarded to cglobal -- confirm against the macro head.
INIT_XMM
2135 e3f7bf77 Ronald S. Bultje
; Select the SSE2 byte-splat variant for the sse2 group.
%define SPLATB_REG SPLATB_REG_SSE2
2136 268821e7 Ronald S. Bultje
INNER_LOOPFILTER sse2,   v, 5, 16, 13
2137 ede1b966 Ronald S. Bultje
; The h/16 variant passes a different %3 depending on whether m8 exists.
; The macro body's own comments equate "m8 defined" with sse2 on x86-64.
; NOTE(review): the extra register without m8 is presumably needed because
; that build also keeps a stack pointer backup (see the "%ifndef m8" restore
; at the end of the macro body) -- confirm against the macro head.
%ifdef m8
2138 268821e7 Ronald S. Bultje
INNER_LOOPFILTER sse2,   h, 5, 16, 13
2139 ede1b966 Ronald S. Bultje
%else
2140 268821e7 Ronald S. Bultje
INNER_LOOPFILTER sse2,   h, 6, 16, 13
2141 ede1b966 Ronald S. Bultje
%endif
2142 268821e7 Ronald S. Bultje
INNER_LOOPFILTER sse2,   v, 6,  8, 13
2143
INNER_LOOPFILTER sse2,   h, 6,  8, 13
2144 e9e456d8 Ronald S. Bultje
2145 e3f7bf77 Ronald S. Bultje
; Repeat the same set with the SSSE3 byte-splat variant; the arguments,
; including the m8-dependent %3 for h/16, mirror the sse2 group exactly.
%define SPLATB_REG SPLATB_REG_SSSE3
2146 7dd224a4 Jason Garrett-Glaser
INNER_LOOPFILTER ssse3,  v, 5, 16, 13
2147
%ifdef m8
2148
INNER_LOOPFILTER ssse3,  h, 5, 16, 13
2149
%else
2150
INNER_LOOPFILTER ssse3,  h, 6, 16, 13
2151
%endif
2152
INNER_LOOPFILTER ssse3,  v, 6,  8, 13
2153
INNER_LOOPFILTER ssse3,  h, 6,  8, 13
2154
2155 e9e456d8 Ronald S. Bultje
;-----------------------------------------------------------------------------
2156
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
2157
;                                            int flimE, int flimI, int hev_thr);
2158
;-----------------------------------------------------------------------------
2159
2160
%macro MBEDGE_LOOPFILTER 5
2161
%if %4 == 8 ; chroma
2162
cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
2163
%define dst8_reg    r1
2164
%define mstride_reg r2
2165
%define E_reg       r3
2166
%define I_reg       r4
2167
%define hev_thr_reg r5
2168
%else ; luma
2169
cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
2170
%define mstride_reg r1
2171
%define E_reg       r2
2172
%define I_reg       r3
2173
%define hev_thr_reg r4
2174
%ifdef m8 ; x86-64, sse2
2175
%define dst8_reg    r4
2176
%elif mmsize == 16 ; x86-32, sse2
2177
%define dst8_reg    r5
2178
%else ; x86-32, mmx/mmxext
2179
%define cnt_reg     r5
2180
%endif
2181
%endif
2182
%define dst_reg     r0
2183
%define stride_reg  E_reg
2184
%define dst2_reg    I_reg
2185
%ifndef m8
2186
%define stack_reg   hev_thr_reg
2187
%endif
2188
2189 ab4d0318 Ronald S. Bultje
%define ssse3_or_higher 0
2190 bcd4aa64 Ronald S. Bultje
%ifnidn %1, sse2
2191
%if mmsize == 16
2192 ab4d0318 Ronald S. Bultje
%define ssse3_or_higher 1
2193
%endif
2194 7dd224a4 Jason Garrett-Glaser
%endif
2195 ab4d0318 Ronald S. Bultje
2196
%if ssse3_or_higher
2197
    pxor             m7, m7
2198 bcd4aa64 Ronald S. Bultje
%endif
2199 7dd224a4 Jason Garrett-Glaser
2200 e9e456d8 Ronald S. Bultje
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
2201
    ; splat function arguments
2202 e3f7bf77 Ronald S. Bultje
    SPLATB_REG       m0, E_reg, m7   ; E
2203
    SPLATB_REG       m1, I_reg, m7   ; I
2204
    SPLATB_REG       m2, hev_thr_reg, m7 ; hev_thresh
2205 e9e456d8 Ronald S. Bultje
2206
    ; align stack
2207
    mov       stack_reg, rsp         ; backup stack pointer
2208
    and             rsp, ~(mmsize-1) ; align stack
2209 48adb7e7 Ronald S. Bultje
%if mmsize == 16
2210 2a180c69 Ronald S. Bultje
    sub             rsp, mmsize * 7
2211
%else
2212 e9e456d8 Ronald S. Bultje
    sub             rsp, mmsize * 8  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
2213
                                     ;               [3]=hev() result
2214
                                     ;               [4]=filter tmp result
2215
                                     ;               [5]/[6] = p2/q2 backup
2216
                                     ;               [7]=lim_res sign result
2217 2a180c69 Ronald S. Bultje
%endif
2218 e9e456d8 Ronald S. Bultje
2219
%define flim_E   [rsp]
2220
%define flim_I   [rsp+mmsize]
2221
%define hev_thr  [rsp+mmsize*2]
2222
%define mask_res [rsp+mmsize*3]
2223
%define lim_res  [rsp+mmsize*4]
2224
%define p0backup [rsp+mmsize*3]
2225
%define q0backup [rsp+mmsize*4]
2226
%define p2backup [rsp+mmsize*5]
2227
%define q2backup [rsp+mmsize*6]
2228 48adb7e7 Ronald S. Bultje
%if mmsize == 16
2229 2a180c69 Ronald S. Bultje
%define lim_sign [rsp]
2230
%else
2231 e9e456d8 Ronald S. Bultje
%define lim_sign [rsp+mmsize*7]
2232 2a180c69 Ronald S. Bultje
%endif
2233 e9e456d8 Ronald S. Bultje
2234
    mova         flim_E, m0
2235
    mova         flim_I, m1
2236
    mova        hev_thr, m2
2237
2238
%else ; sse2 on x86-64
2239
2240
%define flim_E   m9
2241
%define flim_I   m10
2242
%define hev_thr  m11
2243
%define mask_res m12
2244
%define lim_res  m8
2245
%define p0backup m12
2246
%define q0backup m8
2247
%define p2backup m13
2248
%define q2backup m14
2249 2a180c69 Ronald S. Bultje
%define lim_sign m9
2250 e9e456d8 Ronald S. Bultje
2251
    ; splat function arguments
2252 e3f7bf77 Ronald S. Bultje
    SPLATB_REG   flim_E, E_reg, m7   ; E
2253
    SPLATB_REG   flim_I, I_reg, m7   ; I
2254
    SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
2255 e9e456d8 Ronald S. Bultje
%endif
2256
2257
%if mmsize == 8 && %4 == 16 ; mmx/mmxext
2258
    mov         cnt_reg, 2
2259
%endif
2260
    mov      stride_reg, mstride_reg
2261
    neg     mstride_reg
2262
%ifidn %2, h
2263
    lea         dst_reg, [dst_reg + stride_reg*4-4]
2264
%if %4 == 8
2265
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
2266
%endif
2267
%endif
2268
2269
%if mmsize == 8
2270
.next8px
2271
%endif
2272
    ; read
2273
    lea        dst2_reg, [dst_reg + stride_reg]
2274
%ifidn %2, v
2275
%if %4 == 8 && mmsize == 16
2276
%define movrow movh
2277
%else
2278
%define movrow mova
2279
%endif
2280
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
2281
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
2282
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
2283
    movrow           m5, [dst2_reg]               ; q1
2284
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
2285
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
2286
%if mmsize == 16 && %4 == 8
2287
    movhps           m0, [dst8_reg+mstride_reg*4]
2288
    movhps           m2, [dst8_reg+mstride_reg*2]
2289
    add        dst8_reg, stride_reg
2290
    movhps           m1, [dst8_reg+mstride_reg*4]
2291
    movhps           m5, [dst8_reg]
2292
    movhps           m6, [dst8_reg+ stride_reg]
2293
    movhps           m7, [dst8_reg+ stride_reg*2]
2294
    add        dst8_reg, mstride_reg
2295
%endif
2296
%elif mmsize == 8 ; mmx/mmxext (h)
2297
    ; read 8 rows of 8px each
2298
    movu             m0, [dst_reg +mstride_reg*4]
2299
    movu             m1, [dst2_reg+mstride_reg*4]
2300
    movu             m2, [dst_reg +mstride_reg*2]
2301
    movu             m3, [dst_reg +mstride_reg]
2302
    movu             m4, [dst_reg]
2303
    movu             m5, [dst2_reg]
2304
    movu             m6, [dst2_reg+ stride_reg]
2305
2306
    ; 8x8 transpose
2307
    TRANSPOSE4x4B     0, 1, 2, 3, 7
2308
    mova       q0backup, m1
2309
    movu             m7, [dst2_reg+ stride_reg*2]
2310
    TRANSPOSE4x4B     4, 5, 6, 7, 1
2311
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
2312
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
2313
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
2314
    mova             m1, q0backup
2315
    mova       q0backup, m2          ; store q0
2316
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
2317
    mova       p0backup, m5          ; store p0
2318
    SWAP              1, 4
2319
    SWAP              2, 4
2320
    SWAP              6, 3
2321
    SWAP              5, 3
2322
%else ; sse2 (h)
2323
%if %4 == 16
2324
    lea        dst8_reg, [dst_reg + stride_reg*8]
2325
%endif
2326
2327
    ; read 16 rows of 8px each, interleave
2328
    movh             m0, [dst_reg +mstride_reg*4]
2329
    movh             m1, [dst8_reg+mstride_reg*4]
2330
    movh             m2, [dst_reg +mstride_reg*2]
2331
    movh             m5, [dst8_reg+mstride_reg*2]
2332
    movh             m3, [dst_reg +mstride_reg]
2333
    movh             m6, [dst8_reg+mstride_reg]
2334
    movh             m4, [dst_reg]
2335
    movh             m7, [dst8_reg]
2336
    punpcklbw        m0, m1          ; A/I
2337
    punpcklbw        m2, m5          ; C/K
2338
    punpcklbw        m3, m6          ; D/L
2339
    punpcklbw        m4, m7          ; E/M
2340
2341
    add        dst8_reg, stride_reg
2342
    movh             m1, [dst2_reg+mstride_reg*4]
2343
    movh             m6, [dst8_reg+mstride_reg*4]
2344
    movh             m5, [dst2_reg]
2345
    movh             m7, [dst8_reg]
2346
    punpcklbw        m1, m6          ; B/J
2347
    punpcklbw        m5, m7          ; F/N
2348
    movh             m6, [dst2_reg+ stride_reg]
2349
    movh             m7, [dst8_reg+ stride_reg]
2350
    punpcklbw        m6, m7          ; G/O
2351
2352
    ; 8x16 transpose
2353
    TRANSPOSE4x4B     0, 1, 2, 3, 7
2354
%ifdef m8
2355
    SWAP              1, 8
2356
%else
2357
    mova       q0backup, m1
2358
%endif
2359
    movh             m7, [dst2_reg+ stride_reg*2]
2360
    movh             m1, [dst8_reg+ stride_reg*2]
2361
    punpcklbw        m7, m1          ; H/P
2362
    TRANSPOSE4x4B     4, 5, 6, 7, 1
2363
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
2364
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
2365
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
2366
%ifdef m8
2367
    SWAP              1, 8
2368
    SWAP              2, 8
2369
%else
2370
    mova             m1, q0backup
2371
    mova       q0backup, m2          ; store q0
2372
%endif
2373
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
2374
%ifdef m12
2375
    SWAP              5, 12
2376
%else
2377
    mova       p0backup, m5          ; store p0
2378
%endif
2379
    SWAP              1, 4
2380
    SWAP              2, 4
2381
    SWAP              6, 3
2382
    SWAP              5, 3
2383
%endif
2384
2385
    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
2386
    mova             m4, m1
2387
    SWAP              4, 1
2388
    psubusb          m4, m0          ; p2-p3
2389
    psubusb          m0, m1          ; p3-p2
2390
    por              m0, m4          ; abs(p3-p2)
2391
2392
    mova             m4, m2
2393
    SWAP              4, 2
2394
    psubusb          m4, m1          ; p1-p2
2395
    mova       p2backup, m1
2396
    psubusb          m1, m2          ; p2-p1
2397
    por              m1, m4          ; abs(p2-p1)
2398
2399
    mova             m4, m6
2400
    SWAP              4, 6
2401
    psubusb          m4, m7          ; q2-q3
2402
    psubusb          m7, m6          ; q3-q2
2403
    por              m7, m4          ; abs(q3-q2)
2404
2405
    mova             m4, m5
2406
    SWAP              4, 5
2407
    psubusb          m4, m6          ; q1-q2
2408
    mova       q2backup, m6
2409
    psubusb          m6, m5          ; q2-q1
2410
    por              m6, m4          ; abs(q2-q1)
2411
2412
%ifidn %1, mmx
2413
    mova             m4, flim_I
2414
    pxor             m3, m3
2415
    psubusb          m0, m4
2416
    psubusb          m1, m4
2417
    psubusb          m7, m4
2418
    psubusb          m6, m4
2419
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
2420
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
2421
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
2422
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
2423
    pand             m0, m1
2424
    pand             m7, m6
2425
    pand             m0, m7
2426
%else ; mmxext/sse2
2427
    pmaxub           m0, m1
2428
    pmaxub           m6, m7
2429
    pmaxub           m0, m6
2430
%endif
2431
2432
    ; normal_limit and high_edge_variance for p1-p0, q1-q0
2433
    SWAP              7, 3           ; now m7 is zero
2434
%ifidn %2, v
2435
    movrow           m3, [dst_reg +mstride_reg] ; p0
2436
%if mmsize == 16 && %4 == 8
2437
    movhps           m3, [dst8_reg+mstride_reg]
2438
%endif
2439
%elifdef m12
2440
    SWAP              3, 12
2441
%else
2442
    mova             m3, p0backup
2443
%endif
2444
2445
    mova             m1, m2
2446
    SWAP              1, 2
2447
    mova             m6, m3
2448
    SWAP              3, 6
2449
    psubusb          m1, m3          ; p1-p0
2450
    psubusb          m6, m2          ; p0-p1
2451
    por              m1, m6          ; abs(p1-p0)
2452
%ifidn %1, mmx
2453
    mova             m6, m1
2454
    psubusb          m1, m4
2455
    psubusb          m6, hev_thr
2456
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
2457
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
2458
    pand             m0, m1
2459
    mova       mask_res, m6
2460
%else ; mmxext/sse2
2461
    pmaxub           m0, m1          ; max_I
2462
    SWAP              1, 4           ; max_hev_thresh
2463
%endif
2464
2465
    SWAP              6, 4           ; now m6 is I
2466
%ifidn %2, v
2467
    movrow           m4, [dst_reg]   ; q0
2468
%if mmsize == 16 && %4 == 8
2469
    movhps           m4, [dst8_reg]
2470
%endif
2471
%elifdef m8
2472
    SWAP              4, 8
2473
%else
2474
    mova             m4, q0backup
2475
%endif
2476
    mova             m1, m4
2477
    SWAP              1, 4
2478
    mova             m7, m5
2479
    SWAP              7, 5
2480
    psubusb          m1, m5          ; q0-q1
2481
    psubusb          m7, m4          ; q1-q0
2482
    por              m1, m7          ; abs(q1-q0)
2483
%ifidn %1, mmx
2484
    mova             m7, m1
2485
    psubusb          m1, m6
2486
    psubusb          m7, hev_thr
2487
    pxor             m6, m6
2488
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
2489
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
2490
    mova             m6, mask_res
2491
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
2492
    pand             m6, m7
2493
%else ; mmxext/sse2
2494
    pxor             m7, m7
2495
    pmaxub           m0, m1
2496
    pmaxub           m6, m1
2497
    psubusb          m0, flim_I
2498
    psubusb          m6, hev_thr
2499
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
2500
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
2501
%endif
2502
%ifdef m12
2503
    SWAP              6, 12
2504
%else
2505
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
2506
%endif
2507
2508
    ; simple_limit
2509
    mova             m1, m3
2510
    SWAP              1, 3
2511
    mova             m6, m4          ; keep copies of p0/q0 around for later use
2512
    SWAP              6, 4
2513
    psubusb          m1, m4          ; p0-q0
2514
    psubusb          m6, m3          ; q0-p0
2515
    por              m1, m6          ; abs(q0-p0)
2516
    paddusb          m1, m1          ; m1=2*abs(q0-p0)
2517
2518
    mova             m7, m2
2519
    SWAP              7, 2
2520
    mova             m6, m5
2521
    SWAP              6, 5
2522
    psubusb          m7, m5          ; p1-q1
2523
    psubusb          m6, m2          ; q1-p1
2524
    por              m7, m6          ; abs(q1-p1)
2525
    pxor             m6, m6
2526
    pand             m7, [pb_FE]
2527
    psrlq            m7, 1           ; abs(q1-p1)/2
2528
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
2529
    psubusb          m7, flim_E
2530
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
2531
    pand             m0, m7          ; normal_limit result
2532
2533
    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
2534
%ifdef m8 ; x86-64 && sse2
2535
    mova             m8, [pb_80]
2536
%define pb_80_var m8
2537
%else ; x86-32 or mmx/mmxext
2538
%define pb_80_var [pb_80]
2539
%endif
2540
    mova             m1, m4
2541
    mova             m7, m3
2542
    pxor             m1, pb_80_var
2543
    pxor             m7, pb_80_var
2544
    psubsb           m1, m7          ; (signed) q0-p0
2545
    mova             m6, m2
2546
    mova             m7, m5
2547
    pxor             m6, pb_80_var
2548
    pxor             m7, pb_80_var
2549
    psubsb           m6, m7          ; (signed) p1-q1
2550
    mova             m7, mask_res
2551
    paddsb           m6, m1
2552
    paddsb           m6, m1
2553
    paddsb           m6, m1
2554
    pand             m6, m0
2555
%ifdef m8
2556
    mova        lim_res, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
2557
    pand        lim_res, m7
2558
%else
2559
    mova             m0, m6
2560
    pand             m0, m7
2561
    mova        lim_res, m0
2562
%endif
2563
    pandn            m7, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_common
2564
2565
    mova             m1, [pb_F8]
2566
    mova             m6, m7
2567
    paddsb           m7, [pb_3]
2568
    paddsb           m6, [pb_4]
2569
    pand             m7, m1
2570
    pand             m6, m1
2571
2572
    pxor             m1, m1
2573
    pxor             m0, m0
2574
    pcmpgtb          m1, m7
2575
    psubb            m0, m7
2576
    psrlq            m7, 3           ; +f2
2577
    psrlq            m0, 3           ; -f2
2578
    pand             m0, m1
2579
    pandn            m1, m7
2580
    psubusb          m3, m0
2581
    paddusb          m3, m1          ; p0+f2
2582
2583
    pxor             m1, m1
2584
    pxor             m0, m0
2585
    pcmpgtb          m0, m6
2586
    psubb            m1, m6
2587
    psrlq            m6, 3           ; +f1
2588
    psrlq            m1, 3           ; -f1
2589
    pand             m1, m0
2590
    pandn            m0, m6
2591
    psubusb          m4, m0
2592
    paddusb          m4, m1          ; q0-f1
2593
2594
    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
2595 ab4d0318 Ronald S. Bultje
%if ssse3_or_higher
2596
    mova             m7, [pb_1]
2597
%else
2598 e9e456d8 Ronald S. Bultje
    mova             m7, [pw_63]
2599 ab4d0318 Ronald S. Bultje
%endif
2600 e9e456d8 Ronald S. Bultje
%ifdef m8
2601
    SWAP              1, 8
2602
%else
2603
    mova             m1, lim_res
2604
%endif
2605
    pxor             m0, m0
2606
    mova             m6, m1
2607
    pcmpgtb          m0, m1         ; which are negative
2608 ab4d0318 Ronald S. Bultje
%if ssse3_or_higher
2609
    punpcklbw        m6, m7         ; interleave with "1" for rounding
2610
    punpckhbw        m1, m7
2611
%else
2612 e9e456d8 Ronald S. Bultje
    punpcklbw        m6, m0         ; signed byte->word
2613
    punpckhbw        m1, m0
2614 ab4d0318 Ronald S. Bultje
%endif
2615 e9e456d8 Ronald S. Bultje
    mova       lim_sign, m0
2616 ab4d0318 Ronald S. Bultje
%if ssse3_or_higher
2617
    mova             m7, [pb_27_63]
2618
%ifndef m8
2619
    mova        lim_res, m1
2620
%endif
2621
%ifdef m10
2622
    SWAP              0, 10         ; don't lose lim_sign copy
2623
%endif
2624
    mova             m0, m7
2625
    pmaddubsw        m7, m6
2626
    SWAP              6, 7
2627
    pmaddubsw        m0, m1
2628
    SWAP              1, 0
2629
%ifdef m10
2630
    SWAP              0, 10
2631
%else
2632
    mova             m0, lim_sign
2633
%endif
2634
%else
2635 e9e456d8 Ronald S. Bultje
    mova       mask_res, m6         ; backup for later in filter
2636
    mova        lim_res, m1
2637
    pmullw          m6, [pw_27]
2638
    pmullw          m1, [pw_27]
2639
    paddw           m6, m7
2640
    paddw           m1, m7
2641 ab4d0318 Ronald S. Bultje
%endif
2642 e9e456d8 Ronald S. Bultje
    psraw           m6, 7
2643
    psraw           m1, 7
2644
    packsswb        m6, m1          ; a0
2645
    pxor            m1, m1
2646
    psubb           m1, m6
2647
    pand            m1, m0          ; -a0
2648
    pandn           m0, m6          ; +a0
2649 ab4d0318 Ronald S. Bultje
%if ssse3_or_higher
2650
    mova            m6, [pb_18_63]  ; pipelining
2651
%endif
2652 e9e456d8 Ronald S. Bultje
    psubusb         m3, m1
2653
    paddusb         m4, m1
2654
    paddusb         m3, m0          ; p0+a0
2655
    psubusb         m4, m0          ; q0-a0
2656
2657 ab4d0318 Ronald S. Bultje
%if ssse3_or_higher
2658
    SWAP             6, 7
2659
%ifdef m10
2660
    SWAP             1, 10
2661
%else
2662 e9e456d8 Ronald S. Bultje
    mova            m1, lim_res
2663 ab4d0318 Ronald S. Bultje
%endif
2664
    mova            m0, m7
2665
    pmaddubsw       m7, m6
2666
    SWAP             6, 7
2667
    pmaddubsw       m0, m1
2668
    SWAP             1, 0
2669
%ifdef m10
2670
    SWAP             0, 10
2671
%endif
2672 e9e456d8 Ronald S. Bultje
    mova            m0, lim_sign
2673 ab4d0318 Ronald S. Bultje
%else
2674
    mova            m6, mask_res
2675
    mova            m1, lim_res
2676 e9e456d8 Ronald S. Bultje
    pmullw          m6, [pw_18]
2677
    pmullw          m1, [pw_18]
2678
    paddw           m6, m7
2679
    paddw           m1, m7
2680 ab4d0318 Ronald S. Bultje
%endif
2681
    mova            m0, lim_sign
2682 e9e456d8 Ronald S. Bultje
    psraw           m6, 7
2683
    psraw           m1, 7
2684
    packsswb        m6, m1          ; a1
2685
    pxor            m1, m1
2686
    psubb           m1, m6
2687
    pand            m1, m0          ; -a1
2688
    pandn           m0, m6          ; +a1
2689 ab4d0318 Ronald S. Bultje
%if ssse3_or_higher
2690
    mova            m6, [pb_9_63]
2691
%endif
2692 e9e456d8 Ronald S. Bultje
    psubusb         m2, m1
2693
    paddusb         m5, m1
2694
    paddusb         m2, m0          ; p1+a1
2695
    psubusb         m5, m0          ; q1-a1
2696
2697 ab4d0318 Ronald S. Bultje
%if ssse3_or_higher
2698
    SWAP             6, 7
2699
%ifdef m10
2700
    SWAP             1, 10
2701
%else
2702
    mova            m1, lim_res
2703
%endif
2704
    mova            m0, m7
2705
    pmaddubsw       m7, m6
2706
    SWAP             6, 7
2707
    pmaddubsw       m0, m1
2708
    SWAP             1, 0
2709
%else
2710 e9e456d8 Ronald S. Bultje
%ifdef m8
2711
    SWAP             6, 12
2712
    SWAP             1, 8
2713
%else
2714
    mova            m6, mask_res
2715
    mova            m1, lim_res
2716
%endif
2717
    pmullw          m6, [pw_9]
2718
    pmullw          m1, [pw_9]
2719
    paddw           m6, m7
2720
    paddw           m1, m7
2721 ab4d0318 Ronald S. Bultje
%endif
2722 2a180c69 Ronald S. Bultje
%ifdef m9
2723
    SWAP             7, 9
2724 e9e456d8 Ronald S. Bultje
%else
2725
    mova            m7, lim_sign
2726
%endif
2727
    psraw           m6, 7
2728
    psraw           m1, 7
2729
    packsswb        m6, m1          ; a2
2729
    pxor            m0, m0
2730
    psubb           m0, m6
2731
    pand            m0, m7          ; -a2
2732
    pandn           m7, m6          ; +a2
2734
%ifdef m8
2735
    SWAP             1, 13
2736
    SWAP             6, 14
2737
%else
2738
    mova            m1, p2backup
2739
    mova            m6, q2backup
2740
%endif
2741
    psubusb         m1, m0
2742
    paddusb         m6, m0
2743
    paddusb         m1, m7          ; p2+a2
2743
    psubusb         m6, m7          ; q2-a2
2745
2746
    ; store
2747
%ifidn %2, v
2748
    movrow [dst2_reg+mstride_reg*4], m1
2749
    movrow [dst_reg +mstride_reg*2], m2
2750
    movrow [dst_reg +mstride_reg  ], m3
2751
    movrow    [dst_reg], m4
2752
    movrow   [dst2_reg], m5
2753
    movrow [dst2_reg+ stride_reg  ], m6
2754
%if mmsize == 16 && %4 == 8
2755
    add        dst8_reg, mstride_reg
2756
    movhps [dst8_reg+mstride_reg*2], m1
2757
    movhps [dst8_reg+mstride_reg  ], m2
2758
    movhps   [dst8_reg], m3
2759
    add        dst8_reg, stride_reg
2760
    movhps   [dst8_reg], m4
2761
    movhps [dst8_reg+ stride_reg  ], m5
2762
    movhps [dst8_reg+ stride_reg*2], m6
2763
%endif
2764
%else ; h
2765
    inc         dst_reg
2766
    inc        dst2_reg
2767
2768
    ; 4x8/16 transpose
2769
    TRANSPOSE4x4B     1, 2, 3, 4, 0
2770
    SBUTTERFLY       bw, 5, 6, 0
2771
2772
%if mmsize == 8 ; mmx/mmxext (h)
2773
    WRITE_4x2D        1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
2774
    add         dst_reg, 4
2775 2208053b Ronald S. Bultje
    WRITE_2x4W       m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
2776 e9e456d8 Ronald S. Bultje
%else ; sse2 (h)
2777
    lea        dst8_reg, [dst8_reg+mstride_reg+1]
2778
    WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
2779 003243c3 Ronald S. Bultje
    lea         dst_reg, [dst2_reg+mstride_reg+4]
2780
    lea        dst8_reg, [dst8_reg+mstride_reg+4]
2781 2208053b Ronald S. Bultje
%ifidn %1, sse4
2782
    add        dst2_reg, 4
2783
%endif
2784
    WRITE_8W         m5, dst2_reg, dst_reg,  mstride_reg, stride_reg
2785
%ifidn %1, sse4
2786 6de5b7c6 Ronald S. Bultje
    lea        dst2_reg, [dst8_reg+ stride_reg]
2787 dc5eec80 Ronald S. Bultje
%endif
2788 2208053b Ronald S. Bultje
    WRITE_8W         m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
2789 e9e456d8 Ronald S. Bultje
%endif
2790
%endif
2791
2792
%if mmsize == 8
2793
%if %4 == 8 ; chroma
2794
%ifidn %2, h
2795
    sub         dst_reg, 5
2796
%endif
2797
    cmp         dst_reg, dst8_reg
2798
    mov         dst_reg, dst8_reg
2799
    jnz .next8px
2800
%else
2801
%ifidn %2, h
2802
    lea         dst_reg, [dst_reg + stride_reg*8-5]
2803
%else ; v
2804
    add         dst_reg, 8
2805
%endif
2806
    dec         cnt_reg
2807
    jg .next8px
2808
%endif
2809
%endif
2810
2811
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
2812
    mov             rsp, stack_reg   ; restore stack pointer
2813
%endif
2814
    RET
2815
%endmacro
2816
2817
; -----------------------------------------------------------------------------
; Instantiations of the MBEDGE_LOOPFILTER macro, one per instruction set,
; filter direction and edge size.
;
; Macro arguments as used by the macro body:
;   %1 = instruction-set name (compared against "sse4" in the h store path)
;   %2 = direction: v or h (selects the plain row store vs. the
;        transpose-and-write store path)
;   %3 = 5 or 6 -- NOTE(review): presumably the argument/GPR count handed to
;        the function prologue; the macro header is not shown here, confirm.
;   %4 = 16 or 8; the macro body treats %4 == 8 as the chroma case
;   %5 = 0 or 15 -- NOTE(review): presumably the number of xmm registers
;        used; confirm against the macro header.
;
; SPLATB_REG (and WRITE_8W for the XMM versions) must be %define'd before the
; invocations that should pick them up, so the order of lines below matters.
; -----------------------------------------------------------------------------

; 8-byte register versions (the "mmsize == 8" paths of the macro body).
INIT_MMX
2818 e3f7bf77 Ronald S. Bultje
%define SPLATB_REG SPLATB_REG_MMX
2819 7dd224a4 Jason Garrett-Glaser
MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
2820
MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
2821
MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0
2822
MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0
2823 e3f7bf77 Ronald S. Bultje
2824
; Same four variants again, with the mmxext flavour of SPLATB_REG.
%define SPLATB_REG SPLATB_REG_MMXEXT
2825
MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
2826
MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
2827 7dd224a4 Jason Garrett-Glaser
MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0
2828
MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0
2829 e9e456d8 Ronald S. Bultje
2830
; 16-byte register versions (the "mmsize == 16" paths of the macro body).
INIT_XMM
2831 e3f7bf77 Ronald S. Bultje
%define SPLATB_REG SPLATB_REG_SSE2
2832 2208053b Ronald S. Bultje
; WRITE_8W is invoked only from the horizontal (h) store path of the macro.
%define WRITE_8W   WRITE_8W_SSE2
2833 2a180c69 Ronald S. Bultje
MBEDGE_LOOPFILTER sse2,   v, 5, 16, 15
2834 e9e456d8 Ronald S. Bultje
; m8 is only defined when more than eight xmm registers exist (the macro body
; labels its absence "sse2 on x86-32 or mmx/mmxext"); in that case the
; 16-wide h variant is instantiated with 5 instead of 6 as its third argument.
%ifdef m8
2835 2a180c69 Ronald S. Bultje
MBEDGE_LOOPFILTER sse2,   h, 5, 16, 15
2836 e9e456d8 Ronald S. Bultje
%else
2837 2a180c69 Ronald S. Bultje
MBEDGE_LOOPFILTER sse2,   h, 6, 16, 15
2838 e9e456d8 Ronald S. Bultje
%endif
2839 2a180c69 Ronald S. Bultje
MBEDGE_LOOPFILTER sse2,   v, 6,  8, 15
2840
MBEDGE_LOOPFILTER sse2,   h, 6,  8, 15
2841 7dd224a4 Jason Garrett-Glaser
2842 e3f7bf77 Ronald S. Bultje
; ssse3 versions: same layout as sse2, still using the SSE2 WRITE_8W.
%define SPLATB_REG SPLATB_REG_SSSE3
2843 2a180c69 Ronald S. Bultje
MBEDGE_LOOPFILTER ssse3,  v, 5, 16, 15
2844 7dd224a4 Jason Garrett-Glaser
%ifdef m8
2845 2a180c69 Ronald S. Bultje
MBEDGE_LOOPFILTER ssse3,  h, 5, 16, 15
2846 7dd224a4 Jason Garrett-Glaser
%else
2847 2a180c69 Ronald S. Bultje
MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 15
2848 7dd224a4 Jason Garrett-Glaser
%endif
2849 2a180c69 Ronald S. Bultje
MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 15
2850
MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 15
2851 dc5eec80 Ronald S. Bultje
2852 2208053b Ronald S. Bultje
; sse4 versions: only the h variants are instantiated. The only sse4-specific
; code in the macro body (the WRITE_8W call sites and the "%ifidn %1, sse4"
; pointer adjustments) sits in the h store path -- presumably why no v
; variant is emitted here.
%define WRITE_8W   WRITE_8W_SSE4
2853 dc5eec80 Ronald S. Bultje
%ifdef m8
2854 2a180c69 Ronald S. Bultje
MBEDGE_LOOPFILTER sse4,   h, 5, 16, 15
2855 dc5eec80 Ronald S. Bultje
%else
2856 2a180c69 Ronald S. Bultje
MBEDGE_LOOPFILTER sse4,   h, 6, 16, 15
2857 dc5eec80 Ronald S. Bultje
%endif
2858 2a180c69 Ronald S. Bultje
MBEDGE_LOOPFILTER sse4,   h, 6,  8, 15